%matplotlib inline
import sys
print(f'Python {sys.version}')
import IPython
print(f'IPython {IPython.__version__}')
print('\nLibraries:\n')
import matplotlib
import matplotlib.pyplot as plt
print(f'matplotlib {matplotlib.__version__}')
import numpy as np
print(f'numpy {np.__version__}')
import pandas as pd
from pandas.plotting import register_matplotlib_converters
print(f'pandas {pd.__version__}')
import requests
print(f'requests {requests.__version__}')
Python 3.7.4 (default, Jul 8 2019, 18:31:06) [GCC 7.4.0] IPython 7.6.1 Libraries: matplotlib 3.1.1 numpy 1.17.0 pandas 0.25.0 requests 2.22.0
# GitHub API token, sent in the Authorization header of every request below.
# Left blank here; fill it in before running the notebook.
api_token = ''
def send_rest_request(url, timeout=30):
    """Send an authenticated GET request and return the decoded JSON body.

    Args:
        url: Full URL of the endpoint to query.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The JSON body of the response, decoded into Python objects.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    headers = {'Authorization': f'token {api_token}'}
    # Without an explicit timeout a stalled connection would block forever.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.raise_for_status()  # Abort if unsuccessful request
    return response.json()
def send_graphql_request(query, variables, timeout=30):
    """Send an authenticated query to the GitHub GraphQL endpoint.

    Args:
        query: Text of the GraphQL query.
        variables: Mapping of GraphQL variable names to their values.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The JSON body of the response, decoded into Python objects.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        RuntimeError: If the response reports errors and carries no data.
    """
    headers = {'Authorization': f'token {api_token}'}
    url = 'https://api.github.com/graphql'
    payload = {'query': query, 'variables': variables}
    # Without an explicit timeout a stalled connection would block forever.
    response = requests.post(
        url=url, json=payload, headers=headers, timeout=timeout
    )
    response.raise_for_status()  # Abort if unsuccessful request
    result = response.json()
    # GraphQL can report a failure in an "errors" member even when the HTTP
    # status is a success, which raise_for_status() does not notice.
    # A response that still carries data is returned untouched so that
    # callers can deal with partially failed batches (null entries).
    # RuntimeError is used rather than ValueError because the fetch loop in
    # this file treats ValueError as "too many search results".
    if result.get('data') is None and result.get('errors'):
        raise RuntimeError(f'GraphQL request failed: {result["errors"]}')
    return result
GitHub only provides two APIs to get a list of organizations: a REST endpoint that returns the full list but requires many requests, given that there are more than 2,000,000 organizations on GitHub (https://developer.github.com/changes/2015-06-17-organizations-endpoint/), and that only provides the organization logins and descriptions, but nothing more; or the Search API, which is limited to browsing 1000 results.
We choose to use the second to limit the number of requests, but this requires finding ways of querying for fewer than 1000 results at a time, using the limited filters that search queries provide.
Our first restriction will be to limit ourselves to organizations with at least 5 public repositories. We are aware that this is an arbitrary restriction that will exclude community organizations that are just starting and have not yet reached that number.
Our second restriction will be to search by keywords. We list as many keywords as we could think of that could appear in the names or the descriptions of this type of organization:
# Search keywords expected to appear in the names or descriptions of
# community organizations. Each keyword is sent as its own search query and
# gets its own "keyword <keyword>" column in the results table.
# Order matters: the fetch loop slices this list by position.
keywords = [
# To add next time: 'addon', 'addons',
'app', 'apps', 'application', 'applications',
'care', 'caring',
'collab', 'collaboration', 'collaborative',
'collection', 'collective',
'common', 'commons',
'community',
'component', 'components',
# To add next time: 'contribs'
'contrib', 'contribution', 'contributions', 'contributing',
'distribute', 'distribution', 'distributions',
'ecosystem', 'ecosystems',
'extension', 'extensions',
'gather',
'give', 'giving',
'group',
'help', 'helper', 'helpers',
'library', 'libraries',
'maintain', 'maintainer', 'maintainers', 'maintenance', 'maintaining',
'member', 'members',
'module', 'modules',
'open source',
'org', 'organization',
'package', 'packages',
'participate', 'participant', 'participants', 'participation',
'people',
'place',
'plugin', 'plugins',
'projects',
# Not project singular because that would give too many results
# and this is not about organizations focused on a single project
'quality',
'repository', 'repositories',
'reuse', 'reusable',
'share', 'shared', 'sharing',
'support', 'supporter', 'supporters', 'supporting',
'together',
# To add next time: tool, tools
'unofficial',
'user', 'users'
]
len(keywords)
75
For some keywords, this still gives too many results so we additionally partition using language filters:
# Language qualifiers used to split a keyword search that returns more than
# 1000 results into several smaller searches. Every language excluded by the
# final catch-all entry must also be listed as a positive filter, otherwise
# the organizations matching that language are never fetched.
language_filters = [
    'language:JavaScript',
    'language:Java',
    'language:Python',
    'language:PHP',
    'language:HTML',
    'language:C#',
    'language:C++',
    'language:C',
    # Ruby is excluded by the catch-all below, so it needs its own search.
    'language:Ruby',
    'language:CSS',
    # Catch-all: everything not covered by the filters above.
    '-language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS'
]
# GraphQL search query, fetched 50 results per page ($cursor selects the page).
# It returns the total result count (userCount), the paging information and,
# for each organization found: login, name, description, website URL, number
# of members, number of repositories, and the stargazer and assignable-user
# counts of its most starred repository (repositories ordered by stargazers,
# descending, first one only).
query = '''
query searchOrganizations($query: String!,$cursor: String) {
search(type:USER,query:$query, first: 50, after: $cursor) {
userCount
pageInfo {
endCursor
hasNextPage
}
nodes {
... on Organization {
login
name
description
websiteUrl
membersWithRole {
totalCount
}
repositories(first: 1, orderBy: {field: STARGAZERS, direction: DESC}) {
totalCount
nodes {
stargazers {
totalCount
}
assignableUsers {
totalCount
}
}
}
}
}
}
}
'''
# Per-organization fields stored in the results table, one column each.
# The table itself is indexed by organization login.
columns = [
'name',
'description',
'url',
'members', # Number of public members
'repositories', # Number of public repositories
'stars', # Number of stars of the most starred repository
'collaborators' # Number of assignable users of the most starred repository
]
# One marker column per keyword, named "keyword <keyword>".
keyword_columns = [f'keyword {word}' for word in keywords]

# Empty results table: the descriptive columns followed by the keyword
# markers, with the four count columns typed as 'UInt32'.
values = pd.DataFrame(columns=columns + keyword_columns).astype(
    dict.fromkeys(
        ('members', 'repositories', 'stars', 'collaborators'),
        'UInt32',
    )
)
def paged_query(keyword, language=''):
    """Fetch every organization matching ``keyword`` and record it in ``values``.

    Runs the module-level GraphQL search ``query`` page by page. Each
    organization returned gets a row in the module-level ``values`` table
    (indexed by login) holding its fields, and its ``keyword <keyword>``
    column is set to True.

    Args:
        keyword: Search term to look for.
        language: Optional extra search qualifier(s), such as
            ``'language:Java'``, appended to the search string.

    Raises:
        ValueError: If the search reports more than 1000 results. Nothing
            from the offending page is recorded in that case.
    """
    # Keyword-specific exclusions (presumably organizations that flood the
    # results of these keywords - confirm with the author).
    exclusions = {
        'repository': 'NOT aur-archive',
        'user': 'NOT aur-archive',
        'collaborative': 'NOT GITenberg',
    }
    exclude = exclusions.get(keyword, '')
    # The search string is identical for every page, so build it only once.
    search_query = f'type:organization repos:>=5 {keyword} {exclude} {language}'
    cursor = None
    next_page = True
    while next_page:
        print(f'Search query: {search_query}')
        response = send_graphql_request(
            query,
            {'query': search_query, 'cursor': cursor}
        )
        search_json = response['data']['search']
        if search_json['userCount'] > 1000:
            raise ValueError('Query not restricted enough: more than 1000 results.')
        page_info = search_json['pageInfo']
        next_page = page_info['hasNextPage']
        cursor = page_info['endCursor']
        for node in search_json['nodes']:
            # NOTE(review): a null or empty node (a result that is not an
            # organization) has no login to index by; skip it instead of
            # crashing. Presumably rare with type:organization - confirm.
            if not node:
                continue
            # Index
            login = node['login']
            # Fields
            values.loc[login, 'name'] = node['name']
            values.loc[login, 'description'] = node['description']
            values.loc[login, 'url'] = node['websiteUrl']
            values.loc[login, 'members'] = node['membersWithRole']['totalCount']
            repos_json = node['repositories']
            repos_nb = repos_json['totalCount']
            values.loc[login, 'repositories'] = repos_nb
            if repos_nb > 0:
                # Only the most starred repository was requested.
                repo_json = repos_json['nodes'][0]
                values.loc[login, 'stars'] = repo_json['stargazers']['totalCount']
                values.loc[login, 'collaborators'] = repo_json['assignableUsers']['totalCount']
            values.loc[login, f'keyword {keyword}'] = True
# Fetch the organizations matching each keyword. When a keyword alone yields
# more than 1000 results (paged_query raises ValueError), the search is split
# into one query per language filter instead.
# NOTE(review): the [60:] slice skips the first 60 keywords - presumably left
# over from resuming an interrupted run; confirm before running from scratch.
for keyword in keywords[60:]:
    try:
        # Keep only the search call in the try body so that no unrelated
        # ValueError can trigger the language fallback.
        paged_query(keyword)
    except ValueError:
        for language in language_filters:
            paged_query(keyword, language)
            print(f'Now fetched a total number of {len(values)} organizations.')
    else:
        print(f'Now fetched a total number of {len(values)} organizations.')
Search query: type:organization repos:>=5 repository NOT aur-archive Search query: type:organization repos:>=5 repository NOT aur-archive language:JavaScript Search query: type:organization repos:>=5 repository NOT aur-archive language:JavaScript Search query: type:organization repos:>=5 repository NOT aur-archive language:JavaScript Search query: type:organization repos:>=5 repository NOT aur-archive language:JavaScript Search query: type:organization repos:>=5 repository NOT aur-archive language:JavaScript Search query: type:organization repos:>=5 repository NOT aur-archive language:JavaScript Search query: type:organization repos:>=5 repository NOT aur-archive language:JavaScript Search query: type:organization repos:>=5 repository NOT aur-archive language:JavaScript Now fetched a total number of 27130 organizations. Search query: type:organization repos:>=5 repository NOT aur-archive language:Java Search query: type:organization repos:>=5 repository NOT aur-archive language:Java Search query: type:organization repos:>=5 repository NOT aur-archive language:Java Search query: type:organization repos:>=5 repository NOT aur-archive language:Java Search query: type:organization repos:>=5 repository NOT aur-archive language:Java Search query: type:organization repos:>=5 repository NOT aur-archive language:Java Search query: type:organization repos:>=5 repository NOT aur-archive language:Java Now fetched a total number of 27337 organizations. 
Search query: type:organization repos:>=5 repository NOT aur-archive language:Python Search query: type:organization repos:>=5 repository NOT aur-archive language:Python Search query: type:organization repos:>=5 repository NOT aur-archive language:Python Search query: type:organization repos:>=5 repository NOT aur-archive language:Python Search query: type:organization repos:>=5 repository NOT aur-archive language:Python Search query: type:organization repos:>=5 repository NOT aur-archive language:Python Search query: type:organization repos:>=5 repository NOT aur-archive language:Python Now fetched a total number of 27532 organizations. Search query: type:organization repos:>=5 repository NOT aur-archive language:PHP Search query: type:organization repos:>=5 repository NOT aur-archive language:PHP Search query: type:organization repos:>=5 repository NOT aur-archive language:PHP Search query: type:organization repos:>=5 repository NOT aur-archive language:PHP Now fetched a total number of 27632 organizations. Search query: type:organization repos:>=5 repository NOT aur-archive language:HTML Search query: type:organization repos:>=5 repository NOT aur-archive language:HTML Search query: type:organization repos:>=5 repository NOT aur-archive language:HTML Search query: type:organization repos:>=5 repository NOT aur-archive language:HTML Now fetched a total number of 27728 organizations. Search query: type:organization repos:>=5 repository NOT aur-archive language:C# Search query: type:organization repos:>=5 repository NOT aur-archive language:C# Now fetched a total number of 27785 organizations. Search query: type:organization repos:>=5 repository NOT aur-archive language:C++ Search query: type:organization repos:>=5 repository NOT aur-archive language:C++ Search query: type:organization repos:>=5 repository NOT aur-archive language:C++ Search query: type:organization repos:>=5 repository NOT aur-archive language:C++ Now fetched a total number of 27894 organizations. 
Search query: type:organization repos:>=5 repository NOT aur-archive language:C Search query: type:organization repos:>=5 repository NOT aur-archive language:C Search query: type:organization repos:>=5 repository NOT aur-archive language:C Search query: type:organization repos:>=5 repository NOT aur-archive language:C Now fetched a total number of 27993 organizations. Search query: type:organization repos:>=5 repository NOT aur-archive language:CSS Search query: type:organization repos:>=5 repository NOT aur-archive language:CSS Now fetched a total number of 28033 organizations. Search query: type:organization repos:>=5 repository NOT aur-archive -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 repository NOT aur-archive -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 repository NOT aur-archive -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 repository NOT aur-archive -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 repository NOT aur-archive -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 repository NOT aur-archive -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 repository NOT aur-archive -language:JavaScript 
-language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 repository NOT aur-archive -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 repository NOT aur-archive -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 repository NOT aur-archive -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 repository NOT aur-archive -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Now fetched a total number of 28390 organizations. Search query: type:organization repos:>=5 repositories Search query: type:organization repos:>=5 repositories language:JavaScript Search query: type:organization repos:>=5 repositories language:JavaScript Search query: type:organization repos:>=5 repositories language:JavaScript Search query: type:organization repos:>=5 repositories language:JavaScript Search query: type:organization repos:>=5 repositories language:JavaScript Now fetched a total number of 28401 organizations. Search query: type:organization repos:>=5 repositories language:Java Search query: type:organization repos:>=5 repositories language:Java Search query: type:organization repos:>=5 repositories language:Java Search query: type:organization repos:>=5 repositories language:Java Now fetched a total number of 28421 organizations. 
Search query: type:organization repos:>=5 repositories language:Python Search query: type:organization repos:>=5 repositories language:Python Search query: type:organization repos:>=5 repositories language:Python Search query: type:organization repos:>=5 repositories language:Python Search query: type:organization repos:>=5 repositories language:Python Now fetched a total number of 28437 organizations. Search query: type:organization repos:>=5 repositories language:PHP Search query: type:organization repos:>=5 repositories language:PHP Now fetched a total number of 28443 organizations. Search query: type:organization repos:>=5 repositories language:HTML Search query: type:organization repos:>=5 repositories language:HTML Now fetched a total number of 28447 organizations. Search query: type:organization repos:>=5 repositories language:C# Now fetched a total number of 28447 organizations. Search query: type:organization repos:>=5 repositories language:C++ Search query: type:organization repos:>=5 repositories language:C++ Now fetched a total number of 28453 organizations. Search query: type:organization repos:>=5 repositories language:C Search query: type:organization repos:>=5 repositories language:C Search query: type:organization repos:>=5 repositories language:C Now fetched a total number of 28457 organizations. Search query: type:organization repos:>=5 repositories language:CSS Now fetched a total number of 28457 organizations. 
Search query: type:organization repos:>=5 repositories -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 repositories -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 repositories -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 repositories -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 repositories -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 repositories -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Now fetched a total number of 28480 organizations. Search query: type:organization repos:>=5 reuse Search query: type:organization repos:>=5 reuse Now fetched a total number of 28506 organizations. Search query: type:organization repos:>=5 reusable Search query: type:organization repos:>=5 reusable Now fetched a total number of 28513 organizations. 
Search query: type:organization repos:>=5 share Search query: type:organization repos:>=5 share Search query: type:organization repos:>=5 share Search query: type:organization repos:>=5 share Search query: type:organization repos:>=5 share Search query: type:organization repos:>=5 share Search query: type:organization repos:>=5 share Search query: type:organization repos:>=5 share Search query: type:organization repos:>=5 share Search query: type:organization repos:>=5 share Search query: type:organization repos:>=5 share Search query: type:organization repos:>=5 share Search query: type:organization repos:>=5 share Search query: type:organization repos:>=5 share Search query: type:organization repos:>=5 share Search query: type:organization repos:>=5 share Now fetched a total number of 28926 organizations. Search query: type:organization repos:>=5 shared Search query: type:organization repos:>=5 shared Search query: type:organization repos:>=5 shared Now fetched a total number of 28934 organizations. Search query: type:organization repos:>=5 sharing Search query: type:organization repos:>=5 sharing Search query: type:organization repos:>=5 sharing Search query: type:organization repos:>=5 sharing Search query: type:organization repos:>=5 sharing Now fetched a total number of 28946 organizations. 
Search query: type:organization repos:>=5 support Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Search query: type:organization repos:>=5 support language:JavaScript Now fetched a total number of 29667 organizations. 
Search query: type:organization repos:>=5 support language:Java Search query: type:organization repos:>=5 support language:Java Search query: type:organization repos:>=5 support language:Java Search query: type:organization repos:>=5 support language:Java Search query: type:organization repos:>=5 support language:Java Search query: type:organization repos:>=5 support language:Java Search query: type:organization repos:>=5 support language:Java Search query: type:organization repos:>=5 support language:Java Search query: type:organization repos:>=5 support language:Java Now fetched a total number of 29957 organizations. Search query: type:organization repos:>=5 support language:Python Search query: type:organization repos:>=5 support language:Python Search query: type:organization repos:>=5 support language:Python Search query: type:organization repos:>=5 support language:Python Search query: type:organization repos:>=5 support language:Python Search query: type:organization repos:>=5 support language:Python Search query: type:organization repos:>=5 support language:Python Search query: type:organization repos:>=5 support language:Python Search query: type:organization repos:>=5 support language:Python Now fetched a total number of 30246 organizations. 
Search query: type:organization repos:>=5 support language:PHP Search query: type:organization repos:>=5 support language:PHP Search query: type:organization repos:>=5 support language:PHP Search query: type:organization repos:>=5 support language:PHP Search query: type:organization repos:>=5 support language:PHP Search query: type:organization repos:>=5 support language:PHP Search query: type:organization repos:>=5 support language:PHP Search query: type:organization repos:>=5 support language:PHP Search query: type:organization repos:>=5 support language:PHP Search query: type:organization repos:>=5 support language:PHP Search query: type:organization repos:>=5 support language:PHP Search query: type:organization repos:>=5 support language:PHP Now fetched a total number of 30700 organizations. Search query: type:organization repos:>=5 support language:HTML Search query: type:organization repos:>=5 support language:HTML Search query: type:organization repos:>=5 support language:HTML Search query: type:organization repos:>=5 support language:HTML Now fetched a total number of 30807 organizations. Search query: type:organization repos:>=5 support language:C# Search query: type:organization repos:>=5 support language:C# Search query: type:organization repos:>=5 support language:C# Search query: type:organization repos:>=5 support language:C# Now fetched a total number of 30919 organizations. Search query: type:organization repos:>=5 support language:C++ Search query: type:organization repos:>=5 support language:C++ Search query: type:organization repos:>=5 support language:C++ Search query: type:organization repos:>=5 support language:C++ Now fetched a total number of 31034 organizations. Search query: type:organization repos:>=5 support language:C Search query: type:organization repos:>=5 support language:C Search query: type:organization repos:>=5 support language:C Now fetched a total number of 31134 organizations. 
Search query: type:organization repos:>=5 support language:CSS Search query: type:organization repos:>=5 support language:CSS Search query: type:organization repos:>=5 support language:CSS Now fetched a total number of 31200 organizations. Search query: type:organization repos:>=5 support -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 support -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 support -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 support -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 support -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 support -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 support -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 support -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 support -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ 
-language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 support -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 support -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 support -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 support -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 support -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 support -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Search query: type:organization repos:>=5 support -language:JavaScript -language:Java -language:Python -language:PHP -language:HTML -language:C# -language:C++ -language:C -language:Ruby -language:CSS Now fetched a total number of 31751 organizations. Search query: type:organization repos:>=5 supporter Now fetched a total number of 31754 organizations. Search query: type:organization repos:>=5 supporters Now fetched a total number of 31754 organizations. Search query: type:organization repos:>=5 supporting Search query: type:organization repos:>=5 supporting Search query: type:organization repos:>=5 supporting Now fetched a total number of 31759 organizations. 
Search query: type:organization repos:>=5 together Search query: type:organization repos:>=5 together Search query: type:organization repos:>=5 together Search query: type:organization repos:>=5 together Search query: type:organization repos:>=5 together Search query: type:organization repos:>=5 together Search query: type:organization repos:>=5 together Search query: type:organization repos:>=5 together Now fetched a total number of 31941 organizations. Search query: type:organization repos:>=5 unofficial Search query: type:organization repos:>=5 unofficial Search query: type:organization repos:>=5 unofficial Search query: type:organization repos:>=5 unofficial Search query: type:organization repos:>=5 unofficial Now fetched a total number of 32043 organizations. Search query: type:organization repos:>=5 user NOT aur-archive Search query: type:organization repos:>=5 user NOT aur-archive Search query: type:organization repos:>=5 user NOT aur-archive Search query: type:organization repos:>=5 user NOT aur-archive Search query: type:organization repos:>=5 user NOT aur-archive Search query: type:organization repos:>=5 user NOT aur-archive Search query: type:organization repos:>=5 user NOT aur-archive Search query: type:organization repos:>=5 user NOT aur-archive Search query: type:organization repos:>=5 user NOT aur-archive Search query: type:organization repos:>=5 user NOT aur-archive Search query: type:organization repos:>=5 user NOT aur-archive Search query: type:organization repos:>=5 user NOT aur-archive Search query: type:organization repos:>=5 user NOT aur-archive Search query: type:organization repos:>=5 user NOT aur-archive Search query: type:organization repos:>=5 user NOT aur-archive Search query: type:organization repos:>=5 user NOT aur-archive Search query: type:organization repos:>=5 user NOT aur-archive Search query: type:organization repos:>=5 user NOT aur-archive Now fetched a total number of 32382 organizations. 
Search query: type:organization repos:>=5 users Search query: type:organization repos:>=5 users Search query: type:organization repos:>=5 users Search query: type:organization repos:>=5 users Search query: type:organization repos:>=5 users Search query: type:organization repos:>=5 users Search query: type:organization repos:>=5 users Search query: type:organization repos:>=5 users Now fetched a total number of 32434 organizations.
# Checkpoint the search results to disk, then reload them from the file so
# that the filtering steps below work on the saved data.
values.to_csv('community-organizations-phase-one.csv')
values = pd.read_csv(
    'community-organizations-phase-one.csv',
    index_col=0,
    dtype={
        'members': 'UInt32',
        'repositories': 'UInt32',
        'stars': 'UInt32',
        'collaborators': 'UInt32'
    },
    # Infer each column's type from the whole column at once. Chunked
    # inference emits a DtypeWarning on the sparsely filled keyword columns.
    low_memory=False,
)
/nix/store/l2drdy46nqd6kqqz3pv3hfmy4c64ixn9-python3.7-ipython-7.6.1/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3057: DtypeWarning: Columns (8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
Apparently, the search filters are not fully reliable because about 1% of our search results have fewer than 5 repositories (four of them even having zero repositories):
len(values[values['repositories'] < 5]) / len(values)
0.011839427760991552
# Drop the organizations that slipped through the repos:>=5 search filter.
values = values[values['repositories'] >= 5]
Because many members can decide to make their membership status private (and in fact this is even the default), the number of public members is just a lower bound on the actual number of members of an organization. To estimate an upper bound on organization membership, we have also retrieved the number of assignable users of the most starred repository.
Assignable users are organization members with read access to the repository, or collaborators with write access specifically on this repository. In theory, it is possible for an organization member to not be an assignable user, if the organization owners have changed the default member permissions from "read" to "none". In this case, the number of public members of the organization could be larger than the number of assignable users on the most starred repository, but such a situation is quite rare: it represents less than 3% of our dataset:
len(values[values['members'] > values['collaborators']]) / len(values)
0.021029641185647426
In most organizations, there are strictly more collaborators than public members:
np.median(values['collaborators'] - values['members'])
3.0
Most organizations are not community organizations. Community organizations (at least established ones) should have a strong membership. Thus we select organizations with at least 10 public members or at least 10 collaborators on the most starred repository. This represents about 23% of the remaining organizations:
len(values[(values['members'] >= 10) | (values['collaborators'] >= 10)]) / len(values)
0.23051482059282372
# Keep organizations with at least 10 public members, or at least 10
# assignable users on their most starred repository.
values = values[(values['members'] >= 10) | (values['collaborators'] >= 10)]
Most organizations do not maintain any popular projects. Community organizations should host several popular projects. Stars are often used as a proxy for popularity on GitHub. This is especially relevant for libraries that are mainly targeted at other developers. We set an arbitrary low limit of 10 stars on the most starred project. This represents about 60% of the remaining organizations:
len(values[values['stars'] >= 10]) / len(values)
0.5929886302111532
# Keep organizations whose most starred repository has at least 10 stars.
values = values[values['stars'] >= 10]
len(values)
4381
For each organization in the remaining list, we fetch the creation date of the organization, and the number of repositories that were created before this date, as an under-approximation of the number of transferred repositories. The GraphQL API allows us to batch requests and thus to have much fewer requests:
def build_graphql_query(imin, batch_size=40, frame=None):
    """Build one batched GraphQL query asking for organization creation dates.

    Every label taken from ``frame.index[imin:imin + batch_size]`` is used as
    an organization login and gets its own aliased field (``request0``,
    ``request1``, ...), so one request covers the whole batch.

    Args:
        imin: Position (in the index) of the first organization of the batch.
        batch_size: Maximum number of organizations per query. Defaults to
            40, the value that used to be hard-coded.
        frame: DataFrame whose index labels are used as organization logins.
            Defaults to the notebook-level ``values``.

    Returns:
        A ``(query, index, next_imin)`` tuple: the query text, the index
        labels covered by the query (in alias order), and the start position
        of the next batch, or ``None`` when this batch is the last one.
    """
    if frame is None:
        # Resolved at call time so that later rebindings of `values` are seen.
        frame = values
    total = len(frame)
    if imin + batch_size < total:
        next_imin = imin + batch_size
        isup = next_imin
    else:
        next_imin = None
        isup = total
    index = frame.index[imin:isup]
    # One aliased `organization` field per login; the alias number is the
    # position of the login inside `index`.
    fields = [
        f'  request{i}: organization(login: "{owner}") {{ createdAt }}'
        for i, owner in enumerate(index)
    ]
    query = '\n'.join(['query {', *fields, '}']) + '\n'
    return query, index, next_imin
def save_testorg_result(json, index, frame=None):
    """Store the creation dates returned by a batched organization query.

    The response is expected to hold one ``request<i>`` entry per label of
    ``index``, in the same order. Entries that are ``None`` are reported on
    stdout and leave the corresponding row untouched.

    Args:
        json: Decoded GraphQL response; its ``'data'`` mapping is read.
        index: Index labels (organization logins) the query was built for.
        frame: DataFrame updated in place (column ``'creation date'``).
            Defaults to the notebook-level ``values``.
    """
    if frame is None:
        # Resolved at call time so that later rebindings of `values` are seen.
        frame = values
    data = json['data']
    for i, owner in enumerate(index):
        alias = f'request{i}'
        if alias not in data:
            # Same stop condition as before: end at the first missing alias.
            break
        result = data[alias]
        if result is None:
            # The index label already is the login; no row lookup is needed.
            print(f'Warning: {owner} has been deleted')
        else:
            frame.loc[owner, 'creation date'] = result['createdAt']
# Walk through the organizations batch by batch until the query builder
# signals the last batch by returning None as the next start position.
imin = 0
while imin is not None:
    # The carriage return keeps the progress indicator on a single line.
    print(f'imin: {imin}', end='\r', flush=True)
    query, index, imin = build_graphql_query(imin)
    json = send_graphql_request(query, {})
    save_testorg_result(json, index)
Warning: wedeploy has been deleted Warning: surging-cloud has been deleted Warning: ruby-gnome2 has been deleted Warning: BloomSoftware has been deleted Warning: SchibstedSpain has been deleted imin: 4360
def build_graphql_query(imin, batch_size=40, frame=None):
    """Build one batched GraphQL query counting early repositories.

    For every label taken from ``frame.index[imin:imin + batch_size]`` the
    query contains one aliased repository search (``request0``,
    ``request1``, ...) restricted to ``user:<label>`` and to repositories
    created before the row's ``'creation date'`` value.

    Args:
        imin: Position (in the index) of the first organization of the batch.
        batch_size: Maximum number of organizations per query. Defaults to
            40, the value that used to be hard-coded.
        frame: DataFrame whose index labels are used as owner logins and
            which provides the ``'creation date'`` column. Defaults to the
            notebook-level ``values``.

    Returns:
        A ``(query, index, next_imin)`` tuple: the query text, the index
        labels covered by the query (in alias order), and the start position
        of the next batch, or ``None`` when this batch is the last one.
    """
    if frame is None:
        # Resolved at call time so that later rebindings of `values` are seen.
        frame = values
    total = len(frame)
    if imin + batch_size < total:
        next_imin = imin + batch_size
        isup = next_imin
    else:
        next_imin = None
        isup = total
    index = frame.index[imin:isup]
    # NOTE(review): a row without a creation date is interpolated as-is
    # (e.g. "created:<nan"); behaviour kept unchanged -- confirm how the
    # search endpoint treats such a qualifier.
    fields = [
        f'  request{i}: search(query: "user:{owner} '
        f'created:<{frame.loc[owner, "creation date"]}", '
        f'type: REPOSITORY) {{ repositoryCount }}'
        for i, owner in enumerate(index)
    ]
    query = '\n'.join(['query {', *fields, '}']) + '\n'
    return query, index, next_imin
def save_testorg_result(json, index, frame=None):
    """Store the repository counts returned by a batched search query.

    The response is expected to hold one ``request<i>`` entry per label of
    ``index``, in the same order. Entries that are ``None`` are reported on
    stdout and leave the corresponding row untouched, instead of raising a
    ``TypeError``.

    Args:
        json: Decoded GraphQL response; its ``'data'`` mapping is read.
        index: Index labels (owner logins) the query was built for.
        frame: DataFrame updated in place (column
            ``'transferred repositories'``). Defaults to the notebook-level
            ``values``.
    """
    if frame is None:
        # Resolved at call time so that later rebindings of `values` are seen.
        frame = values
    data = json['data']
    for i, owner in enumerate(index):
        alias = f'request{i}'
        if alias not in data:
            # Same stop condition as before: end at the first missing alias.
            break
        result = data[alias]
        if result is None:
            print(f'Warning: no search result for {owner}')
            continue
        frame.loc[owner, 'transferred repositories'] = result['repositoryCount']
# Walk through the organizations batch by batch until the query builder
# signals the last batch by returning None as the next start position.
imin = 0
while imin is not None:
    # The carriage return keeps the progress indicator on a single line.
    print(f'imin: {imin}', end='\r', flush=True)
    query, index, imin = build_graphql_query(imin)
    json = send_graphql_request(query, {})
    save_testorg_result(json, index)
imin: 4360
# Persist the base columns, the two columns fetched above and the keyword
# columns, so the following steps can restart from the CSV file.
values[
    [*columns, 'creation date', 'transferred repositories', *keyword_columns]
].to_csv(path_or_buf='community-organizations-phase-two.csv')
# Reload the saved table (first column as index, creation dates parsed,
# count columns as nullable unsigned integers), then order it by decreasing
# number of transferred repositories.
values = pd.read_csv(
    'community-organizations-phase-two.csv',
    index_col=0,
    parse_dates=['creation date'],
    dtype=dict.fromkeys(
        ('members', 'repositories', 'stars', 'collaborators'), 'UInt32'
    ),
)
values = values.sort_values('transferred repositories', ascending=False)
Organizations with at least one transferred repository from before their creation represent 35% of the remaining organizations:
# Fraction of organizations with at least one transferred repository.
values.loc[values['transferred repositories'] > 0].shape[0] / values.shape[0]
0.35151791828349693
And organizations with at least two transferred repositories from before their creation represent about 21% of the same organizations:
# Fraction of organizations with at least two transferred repositories.
values.loc[values['transferred repositories'] > 1].shape[0] / values.shape[0]
0.21410636840903904
# Number of organizations with at least two transferred repositories.
values.loc[values['transferred repositories'] > 1].shape[0]
938
# First 30 organizations, by creation date, among those with at least two
# transferred repositories.
values.loc[values['transferred repositories'] > 1].sort_values('creation date').head(30)
name | description | url | members | repositories | stars | collaborators | creation date | transferred repositories | keyword app | ... | keyword shared | keyword sharing | keyword support | keyword supporter | keyword supporters | keyword supporting | keyword together | keyword unofficial | keyword user | keyword users | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
datadesk | Los Angeles Times Data Desk | Analysis, applications and automation from a t... | https://www.latimes.com | 8 | 184 | 313 | 27 | 2010-07-02 02:04:07+00:00 | 6.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
collective | Collective | Plone add-ons shared code repositories | https://collective.github.io | 268 | 1674 | 569 | 628 | 2010-08-13 00:04:43+00:00 | 7.0 | NaN | ... | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
uncopenweb | UNC Open Web Group | NaN | http://sites.google.com/site/uncopenweb/ | 11 | 23 | 15 | 2 | 2010-09-04 01:22:47+00:00 | 6.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
PerlDancer | PerlDancer | The Dancer Developers group | http://perldancer.org | 10 | 71 | 708 | 15 | 2010-09-21 12:27:49+00:00 | 2.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True |
symphonists | Symphony Community | NaN | https://www.getsymphony.com | 12 | 106 | 47 | 13 | 2010-10-21 15:40:12+00:00 | 56.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
libtom | libtom | libtom projects | http://www.libtom.net | 3 | 7 | 859 | 22 | 2010-10-22 09:12:56+00:00 | 5.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
xcore | XCore open source project | NaN | github.xcore.com | 26 | 119 | 75 | 7 | 2011-01-13 14:16:30+00:00 | 3.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
silverstripe-archive | SilverStripe Archive | Archive of unsupported SilverStripe modules. I... | http://silverstripe.org | 10 | 71 | 72 | 11 | 2011-01-17 00:22:34+00:00 | 4.0 | NaN | ... | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
mapbox | Mapbox | Mapbox is the location data platform for mobil... | https://www.mapbox.com | 62 | 812 | 4700 | 458 | 2011-02-04 19:02:13+00:00 | 4.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
openstate | Open State Foundation | Open State Foundation promotes digital transpa... | https://openstate.eu | 18 | 107 | 23 | 13 | 2011-03-15 21:42:43+00:00 | 2.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
charlotte-ruby | Charlotte Ruby Group | Charlotte's local Ruby User Group | http://charlotteruby.org | 10 | 18 | 1277 | 7 | 2011-04-07 15:45:24+00:00 | 3.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True | NaN |
pusher | Pusher | Pusher makes communication and collaboration A... | https://pusher.com/ | 19 | 206 | 1488 | 54 | 2011-04-19 17:16:38+00:00 | 4.0 | True | ... | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
cul | Columbia University Libraries | NaN | http://library.columbia.edu | 5 | 168 | 20 | 16 | 2011-04-29 14:08:07+00:00 | 5.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
silexlabs | Silex Labs | Silex Labs is a foundation dedicated to helpin... | http://www.silexlabs.org/ | 6 | 57 | 688 | 13 | 2011-05-22 17:18:30+00:00 | 2.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
bbc | BBC | Open source code used on public facing service... | http://www.bbc.co.uk/opensource/ | 105 | 624 | 1002 | 2036 | 2011-06-04 01:31:11+00:00 | 7.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
haiku | Haiku | An open-source operating system that specifica... | https://www.haiku-os.org | 10 | 19 | 766 | 9 | 2011-06-18 03:22:05+00:00 | 2.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
XCSoar | XCSoar | ... the open-source glide computer | https://xcsoar.org/ | 7 | 9 | 134 | 11 | 2011-06-20 09:34:21+00:00 | 2.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Kozea | Kozea | We build open source software that you will love. | https://community.kozea.fr/ | 13 | 103 | 2921 | 39 | 2011-06-23 10:59:31+00:00 | 2.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
JetBrains | JetBrains | JetBrains open source projects. | https://www.jetbrains.com | 94 | 410 | 28677 | 94 | 2011-06-27 10:06:52+00:00 | 6.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Automattic | Automattic | We are passionate about making the web a bette... | https://automattic.com | 150 | 548 | 19214 | 892 | 2011-07-01 02:45:15+00:00 | 7.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
mantisbt-plugins | MantisBT Community Plugins | NaN | https://www.mantisbt.org | 25 | 89 | 160 | 54 | 2011-07-12 14:07:02+00:00 | 5.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
neo4j-contrib | Neo4j Contrib | Public, Open Source Contributions to the Neo4j... | http://neo4j.com/developer | 15 | 134 | 915 | 34 | 2011-07-14 23:15:43+00:00 | 10.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
sandstorm | sandstorm | sandstorm - building great web applications | https://sandstorm.de/blog.html | 1 | 84 | 40 | 11 | 2011-08-03 08:20:12+00:00 | 3.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Kong | Kong | Next-Generation API Platform for Microservices... | https://konghq.com | 25 | 137 | 22961 | 99 | 2011-08-06 02:08:16+00:00 | 6.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
CloudStack-extras | Collection of additional tools that are useful... | NaN | NaN | 18 | 23 | 260 | 18 | 2011-08-26 14:40:35+00:00 | 8.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
hacsoc | HacSoc | The organization for all things done by the CW... | http://hacsoc.org/ | 18 | 42 | 16 | 25 | 2011-09-11 04:47:22+00:00 | 2.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
vim-jp | vim-jp | Vim community for Japanese developers and users | https://vim-jp.org/ | 47 | 33 | 404 | 116 | 2011-09-15 02:44:30+00:00 | 3.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | True | True |
SitePen | SitePen | Modernizing Apps, Tools & Teams for the Enterp... | http://www.sitepen.com | 6 | 31 | 614 | 39 | 2011-09-21 08:39:10+00:00 | 3.0 | True | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
tikalk | Tikal Knowledge, Ltd. | NaN | www.tikalk.com | 5 | 195 | 745 | 99 | 2011-10-06 07:44:56+00:00 | 6.0 | NaN | ... | NaN | NaN | True | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
sbt | sbt | Community organization for all sbt plugin auth... | https://www.scala-sbt.org | 36 | 129 | 3911 | 24 | 2011-10-28 14:16:17+00:00 | 13.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
30 rows × 84 columns
# Print the base columns and the creation date of a hand-picked list of
# organizations, each record followed by an empty line.
for org in (
    'coq-community',
    'dlang-community',
    'elm-community',
    'elytra',
    'fluent-plugins-nursery',
    'ocaml-community',
    'react-native-community',
    'reasonml-community',
    'electron-userland',
    'fsprojects',
    'sous-chefs',
    'voxpupuli',
):
    print(values.loc[org][columns + ['creation date']], end='\n\n')
name coq-community description A project for a collaborative, community-drive... url https://github.com/coq-community/manifesto members 27 repositories 22 stars 112 collaborators 27 creation date 2017-12-11 16:11:12+00:00 Name: coq-community, dtype: object name D Community hub description Community hub for popular D projects url https://github.com/dlang-community/discussions members 8 repositories 24 stars 293 collaborators 24 creation date 2016-12-11 20:18:39+00:00 Name: dlang-community, dtype: object name Elm Community description Unofficial group for shared work on Elm packag... url https://elm-community.github.io members 14 repositories 53 stars 885 collaborators 36 creation date 2015-11-20 20:59:45+00:00 Name: elm-community, dtype: object name Elytra description A group of people who like making cool mods an... url https://elytradev.com/ members 6 repositories 56 stars 23 collaborators 13 creation date 2016-05-06 09:11:12+00:00 Name: elytra, dtype: object name fluent-plugins-nursery description Collaborate to maintain Fluentd plugins. url NaN members 2 repositories 12 stars 143 collaborators 13 creation date 2016-09-05 02:46:52+00:00 Name: fluent-plugins-nursery, dtype: object name OCaml Community description A collaborative, community-driven project for ... 
url https://github.com/ocaml-community/meta members 6 repositories 13 stars 1742 collaborators 16 creation date 2018-08-10 12:06:41+00:00 Name: ocaml-community, dtype: object name React Native Community description Quality code from the React Native Community url https://github.com/react-native-community/.github members 40 repositories 72 stars 11743 collaborators 85 creation date 2016-07-03 18:18:20+00:00 Name: react-native-community, dtype: object name reasonml-community description Reason and BuckleScript's community packages url https://reasonml.github.io/docs/en/community.html members 11 repositories 31 stars 631 collaborators 26 creation date 2017-01-14 03:09:50+00:00 Name: reasonml-community, dtype: object name Electron Userland description Third party community maintained electron modules url https://github.com/electron-userland/about#readme members 9 repositories 30 stars 7498 collaborators 20 creation date 2016-02-27 20:22:27+00:00 Name: electron-userland, dtype: object name F# Community Project Incubation Space description NaN url http://fsprojects.github.io members 39 repositories 115 stars 1485 collaborators 120 creation date 2013-11-21 13:40:41+00:00 Name: fsprojects, dtype: object name Sous Chefs description Community of chef cookbook maintainers url http://sous-chefs.org members 16 repositories 78 stars 522 collaborators 51 creation date 2015-05-08 20:27:19+00:00 Name: sous-chefs, dtype: object name Vox Pupuli description Modules and tooling maintained by and for the ... url https://voxpupuli.org members 59 repositories 175 stars 598 collaborators 12 creation date 2014-09-08 09:22:01+00:00 Name: voxpupuli, dtype: object
We fetch metrics on all the repositories of a given organization (or just the 100 most starred ones) and we hope to find the meta-repository as an outlier for some metrics.
# GraphQL query (variable: $org, the organization login) requesting metrics
# for up to 100 repositories of one organization, ordered by stargazer count
# in descending order. For each repository it asks for: name, creation date,
# disk usage, total number of issues and of pull requests (plus the creation
# date and comment count of the first returned issue / pull request), fork
# flag, fork count, stargazer count, language totals and primary language.
query = """
query repoMetrics($org: String!) {
organization(login: $org) {
repositories(first: 100, orderBy: {field: STARGAZERS, direction: DESC}) {
nodes {
name
createdAt
diskUsage
issues(first:1) {
totalCount
nodes {
createdAt
comments { totalCount }
}
}
pullRequests(first:1) {
totalCount
nodes {
createdAt
comments { totalCount }
}
}
isFork
forkCount
stargazers { totalCount }
languages {
totalSize
totalCount
}
primaryLanguage { name }
}
}
}
}
"""