election/openstack_election/owners.py
Jeremy Stanley 54d0a77b94 Properly short-circuit OSF member lookup loop
When iterating over Gerrit E-mail addresses to find a matching OSF
member Id, stop once one is found. That was the original intent, but
a continue was used instead of a break within the conditional
causing (perhaps very many) unnecessary additional queries against
the member lookup API.

Change-Id: I025170b726c2edd9dcb4996de99eb541b4e8abcf
2018-08-23 20:02:27 +00:00

559 lines
22 KiB
Python

# Copyright (c) 2016 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS
# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language
# governing permissions and limitations under the License.
# Description: When run using OpenStack's Gerrit server, this builds
# YAML representations of aggregate change owner details and change
# counts for each governance project-team, as well as a combined set
# for all teams.
from __future__ import print_function
import csv
import datetime
import os
import sys
import yaml
from openstack_election import utils
try:
from string import maketrans
except ImportError: # Python3
maketrans = bytes.maketrans
def dumper(data, stream):
"""Convenience wrapper to consistently set YAML formatting"""
return yaml.safe_dump(data, allow_unicode=True, default_flow_style=False,
encoding='utf-8', stream=stream)
def normalize_email(email):
"""Normalize email addresses to make it easier to spot duplicates
Lower-case the domain part of E-mail addresses to better spot
duplicate entries, since the domain part is case-insensitive
courtesy of DNS while the local part is not necessarily
"""
local, domain = email.split('@')
domain = domain.lower()
return '%s@%s' % (local, domain)
def normalize_project(project):
"""Normalize project names for consistent failnames
Replace spaces and hyphens with underscores in project teams
and then lower-case them, for more convenient filenames
"""
return project.translate(maketrans(' -', '__')).lower()
def date_merged(change, after=None, before=None):
"""Determine the date and time a specific change merged"""
date = change.get('submitted', None)
if not date:
# Something's terribly wrong with any changes matching this now
print(
'SKIPPING DATELESS MERGE: change %s for account %s'
% (change['_number'], change['owner']['_account_id']),
file=sys.stderr)
return None
# Strip superfluous subsecond values as Gerrit always just
# reports .000000000 for them anyway
date = date.split('.')[0]
# Pass back an invalid result if it falls after the requested
# cutoff
if before and date >= before:
return None
# Sanity check for completeness, but since "after" is also used
# in the Gerrit query this shouldn't ever actually be reached
if after and date < after:
return None
return date
def main(options):
"""The giant pile of spaghetti which does everything else"""
# Record the start time for use later
start = datetime.datetime.utcnow()
# If we're supplied a configuration file, use it
if options.config:
config = yaml.safe_load(open(options.config))
# Otherwise, use nothing
else:
config = {}
# Start of the match timeframe for change merges
if options.after:
after = options.after
elif 'after' in config:
after = config['after']
else:
after = None
# End of the match timeframe for change merges
if options.before:
before = options.before
elif 'before' in config:
before = config['before']
else:
before = None
# Owner Ids for whom to ignore changes
if options.ignore:
ignore = [int(i) for i in options.ignore]
elif 'ignore' in config:
ignore = config['ignore']
else:
ignore = []
# Whether to omit "extra ATCs"
if options.no_extra_atcs:
no_extra_atcs = options.no_extra_atcs
elif 'no-extra-atcs' in config:
no_extra_atcs = config['no-extra-atcs']
else:
no_extra_atcs = False
# Output file directory
if options.outdir:
outdir = options.outdir
elif 'outdir' in config:
outdir = config['outdir']
else:
outdir = '.'
if not os.path.isdir(outdir):
os.makedirs(outdir)
# Governance Git repository ref object for reference lists
if options.ref:
ref = options.ref
elif 'ref' in config:
ref = config['ref']
else:
ref = 'refs/heads/master'
# Gerrit change query additions
if options.sieve:
sieve = options.sieve
elif 'sieve' in config:
sieve = config['sieve']
else:
sieve = None
# The query identifying relevant changes
match = 'status:merged'
if after:
match = '%s after:"%s"' % (match, after)
if sieve:
match = '%s %s' % (match, sieve)
# Retrieve the governance projects list, needs a Git refname as a
# parameter
# TODO(fungi): make this a configurable option so that you can
# for example supply a custom project list for running elections
# in unofficial teams
gov_projects = utils.get_from_cgit('openstack/governance',
'reference/projects.yaml',
{'h': ref})
# The set of retired or removed "legacy" projects from governance
# are merged into the main dict if their retired-on date falls
# later than the after parameter for the qualifying time period
# TODO(fungi): make this a configurable option
old_projects = utils.get_from_cgit('openstack/governance',
'reference/legacy.yaml',
{'h': ref})
for project in old_projects:
for deliverable in old_projects[project]['deliverables']:
retired = old_projects[project]['deliverables'][deliverable].get(
'retired-on',
old_projects[project].get('retired-on')
)
if retired:
retired = retired.isoformat()
if after and after > retired:
continue
if project not in gov_projects:
gov_projects[project] = {'deliverables': {}}
if deliverable in gov_projects[project]['deliverables']:
print(('Skipping duplicate/partially retired deliverable:'
' %s') % (deliverable),
file=sys.stderr)
continue
gov_projects[project]['deliverables'][deliverable] = \
old_projects[project]['deliverables'][deliverable]
# A mapping of short (no prefix) to full repo names existing in
# Gerrit, used to handle repos which have a different namespace
# in governance during transitions and also to filter out repos
# listed in governance which don't actually exist
ger_repos = dict(
[(x.split('/')[-1], x) for x in utils.query_gerrit('projects/')])
# This will be populated with change owners mapped to the
# project-teams maintaining their respective Git repositories
projects = {}
# This will be populated with all change owners and their
# account details
owners = {}
# This will be populated with discovered duplicate owners
duplicates = {}
# This will be populated with all individual E-mail addresses of
# change owners, to facilitate finding and merging duplicate
# accounts
all_emails = {}
# Iterate over all governance project-teams only at filename
# generation time
for project in gov_projects:
# This will be populated with change owner Ids and counts
projects[project] = {}
# Governance project-teams have one or more deliverables
for deliverable in gov_projects[project]['deliverables']:
# Each deliverable can have multiple repos
repos = gov_projects[project]['deliverables'][deliverable]['repos']
# Operate on repo short-names (no namespace) to avoid
# potential namespace mismatches between governance
# and Gerrit
for repo in [r.split('/')[-1] for r in repos]:
# Only process repos which actually exist in Gerrit,
# otherwise spew a warning if skipping
if repo not in ger_repos:
print('MISSING: %s' % repo, file=sys.stderr)
else:
# Query for an arbitrary change set and get
# detailed account information about the most
# recent patchset, paginating at 100 changes
offset = 0
changes = []
while offset >= 0:
changes += utils.query_gerrit('changes/', params={
'q': 'project:%s %s' % (ger_repos[repo], match),
'n': '100',
'start': offset,
'o': [
'CURRENT_COMMIT',
'CURRENT_REVISION',
'DETAILED_ACCOUNTS',
],
})
if changes and changes[-1].get('_more_changes', False):
offset += 100
else:
offset = -1
# Iterate over each matched change in the repo
for change in changes:
# Get the merge date and skip if it's
# outside any requested date range
merged = date_merged(change, after, before)
if not merged:
continue
# We index owners by their unique Gerrit
# account Id numbers
owner = change['owner']['_account_id']
# If this owner is in the blacklist of Ids
# to skip, then move on to the next change
if owner in ignore:
continue
# Seen this owner already?
new_owner = owner
new = False
if owner in duplicates:
owner = duplicates[owner]
elif owner not in owners:
new = True
# For new additions, initialize this as
# their first and record specific account
# details
if new:
# Get the set of all E-mail addresses
# Gerrit knows for this owner's account
emails = utils.query_gerrit(
'accounts/%s/emails'
% change['owner']['_account_id'])
# Find duplicate addresses and merge
# accounts when that happens
for email in emails:
address = normalize_email(email['email'])
if address in all_emails:
owner = all_emails[address]
duplicates[new_owner] = owner
print(
'MERGING DUPLICATE ACCOUNT: %s into %s'
% (new_owner, owner), file=sys.stderr)
break
# For newly found non-duplicate owners,
# initialize the global change count,
# newest/oldest merged dates, and an empty
# list where extra E-mail addresses can be
# added; also track their full name and
# Gerrit username
if new and owner == new_owner:
# TODO(fungi): this is a prime candidate
# to become a struct, or maybe a class
owners[owner] = {
'count': 1,
'extra': [],
'name': change['owner'].get('name'),
'newest': merged,
'oldest': merged,
'username': change['owner'].get('username'),
}
# If we've seen this owner on another change
# in any repo then just iterate their global
# change counter and update newest/oldest
# dates
else:
owners[owner]['count'] += 1
if merged > owners[owner]['newest']:
owners[owner]['newest'] = merged
elif merged < owners[owner]['oldest']:
owners[owner]['oldest'] = merged
# We only want to add addresses if this is a
# new owner or a new duplicate
if new:
# Iterate over each E-mail address
for email in emails:
# Normalize the address before
# performing any matching since
# Gerrit doesn't do a great job of
# this on its own
address = normalize_email(email['email'])
# Track this in the full list of all
# known E-mail addresses
all_emails[address] = owner
# Whether Gerrit considers this the
# preferred E-mail address
preferred = email.get('preferred', False)
# Store the preferred E-mail address
# under its own key since it has a
# special status, but only if this
# is not a duplicate account
if preferred and owner == new_owner:
owners[owner]['preferred'] = address
# If this was already added to
# the extras list due to an
# additional pre-normalized
# copy, remove it there
if address in owners[owner]['extra']:
owners[owner]['extra'].remove(address)
# Store a list of non-preferred
# addresses, deduplicating them in
# case they match post-normalization
# and treating duplicate preferred
# addresses as # non-preferred
else:
if ((address not in owners[owner]['extra'])
and (address != owners[owner].get(
'preferred', ''))):
owners[owner]['extra'].append(address)
# If we've seen this owner on another change
# in a repo under this project-team then
# just iterate their team change counter and
# update newest/oldest dates
if owner in projects[project]:
projects[project][owner]['count'] += 1
if merged > projects[project][owner]['newest']:
projects[project][owner]['newest'] = merged
elif merged < projects[project][owner]['oldest']:
projects[project][owner]['oldest'] = merged
# ...otherwise initialize this as their
# first
else:
# TODO(fungi): another potential struct
projects[project][owner] = {
'count': 1,
'newest': merged,
'oldest': merged,
}
# The negative counter will be used as a makeshift account Id
# for non-code contributors; those with owned changes use their
# Gerrit account Id instead
counter = 1
# Use the before time as the only contribution time for non-code
# contributors, falling back on the script start time if before
# was not specified
if before:
if len(before) == 10:
stamp = before + ' 00:00:00'
else:
stamp = before
else:
stamp = start.isoformat(sep=' ').split('.')[0]
# Iterate over all extra-atcs entries
if not no_extra_atcs:
for project in gov_projects:
for extra_atc in gov_projects[project].get('extra-atcs', []):
name = extra_atc['name']
email = extra_atc['email']
address = normalize_email(email)
if address in all_emails:
owner = all_emails[address]
else:
owner = -counter
all_emails[address] = owner
owners[owner] = {
'count': -1,
'extra': [],
'name': name,
'newest': stamp,
'oldest': stamp,
'preferred': address,
'username': '_non_code_contributor',
}
if owner not in projects[project]:
projects[project][owner] = {
'count': -1,
'newest': stamp,
'oldest': stamp,
}
counter += 1
# This will hold an address list for TC electorate rolls
electorate = []
# A table of owners for summit invites
invites = []
# A fresh pass through the owners to build some other datasets
for owner in owners:
# Sort extra E-mail address lists for ease of comparison
owners[owner]['extra'].sort()
# Build the data used for an invite
if 'name' not in owners[owner] or not owners[owner]['name']:
print(
'SKIPPING MALFORMED OWNER: no fullname found for account %s' %
owner, file=sys.stderr)
continue
if 'preferred' not in owners[owner]:
if 'extra' in owners[owner] and owners[owner]['extra']:
owners[owner]['preferred'] = owners[owner]['extra'][0]
owners[owner]['extra'] = owners[owner]['extra'][1:]
print(
'MISSING PREFERRED EMAIL: used first extra address as '
'account %s preferred' % owner, file=sys.stderr)
else:
print(
'SKIPPING MALFORMED OWNER: no preferred or extra '
'addresses found for account %s' % owner, file=sys.stderr)
continue
for email in [owners[owner]['preferred']] + owners[owner]['extra']:
member = utils.lookup_member(email)
if member['data']:
owners[owner]['member'] = member['data'][0]['id']
break
invite = [owners[owner].get('member', '0')]
invite.append(owners[owner]['name'].encode('utf-8'))
invite.append(owners[owner]['preferred'])
invite += owners[owner]['extra']
invites.append(invite)
# Append preferred addresses to the TC electorate for members only
if 'member' in owners[owner]:
electorate.append(owners[owner]['preferred'] + '\n')
# Write out a YAML file covering all change owners
fd = open(os.path.join(outdir, '_all_owners.yaml'), 'w')
dumper(owners, stream=fd)
fd.close()
# Write out a YAML file covering tracked duplicate accounts
fd = open(os.path.join(outdir, '_duplicate_owners.yaml'), 'w')
dumper(duplicates, stream=fd)
fd.close()
# Write out a team-specific electoral roll for CIVS
fd = open(os.path.join(outdir, '_electorate.txt'), 'w')
fd.writelines(electorate)
fd.close()
# Write out a CSV file appropriate for the invite2summit tool
fd = open(os.path.join(outdir, '_invites.csv'), 'w')
csv.writer(fd).writerows(invites)
fd.close()
# Make another pass through the projects so they can be dumped
# to our output files
for project in projects:
# This will hold team-specific info for writing
output = {}
# This will hold an address list for PTL electoral rolls
electorate = []
# Use a normalized project name for output file names
normalized_project = normalize_project(project)
# Iterate over each change owner for the current team
for owner in projects[project]:
# Copy the global owner details into our output since
# we're going to modify some
output[owner] = dict(owners[owner])
# Replace the owner change count and newest/oldest
# merged dates with the team-specific value rather than
# using the count from the global set
for field in ('count', 'newest', 'oldest'):
output[owner][field] = projects[project][owner][field]
# Append preferred member addresses to the PTL electoral rolls
if 'member' in owners[owner]:
electorate.append(owners[owner]['preferred'] + '\n')
# Write out a team-specific YAML file
fd = open(os.path.join(outdir, '%s.yaml' % normalized_project), 'w')
dumper(output, stream=fd)
fd.close()
# Write out a team-specific electoral roll for CIVS
fd = open(os.path.join(outdir, '%s.txt' % normalized_project), 'w')
fd.writelines(electorate)
fd.close()