In [1]:
import datetime
import re

import numpy
import pandas
import matplotlib.pyplot
import matplotlib.dates

dfany = pandas.read_csv("pip-counts-any.csv", parse_dates=[0], infer_datetime_format=True)
dfsci = pandas.read_csv("pip-counts-scientific-linux.csv", parse_dates=[0], infer_datetime_format=True)
dfos = pandas.read_csv("pip-counts-by-os.csv", parse_dates=[0], infer_datetime_format=True)
In [2]:
dfos.groupby("project").count().sort_values("count", ascending=False)
Out[2]:
date os version count
project
numpy 65521 65521 65521 65521
scipy 53757 53757 53757 53757
pandas 52252 52252 52252 52252
tensorflow 45924 45924 45924 45924
matplotlib 41508 41508 41508 41508
scikit-learn 34460 34458 34460 34460
torch 10195 10195 10195 10195
iminuit 2968 2968 2968 2968
uproot 2367 2367 2367 2367
awkward 1771 1771 1771 1771
coffea 148 148 148 148
In [4]:
def plotit(df, vertical, title, selection, islog=True, dropminuit=False, dropml=False, dropcoffea=False, dropmine=False):
    which = ["numpy", "scipy", "pandas", "matplotlib", "tensorflow", "scikit-learn", "torch", "uproot", "awkward", "iminuit", "coffea"]
    if dropminuit:
        del which[which.index("iminuit")]
    if dropcoffea:
        del which[which.index("coffea")]
    if dropml:
        del which[which.index("tensorflow")]
        del which[which.index("scikit-learn")]
        del which[which.index("torch")]
    if dropmine:
        del which[which.index("uproot")]
        del which[which.index("awkward")]
    if selection is not None:
        df = df[df["os"] == selection]
    df = df.groupby(["date", "project"])["count"].sum().unstack()[which]
    ax = df.fillna(0).rolling(30).mean().plot(figsize=(10, 4))
    matplotlib.pyplot.title(title)
    if islog:
        matplotlib.pyplot.yscale("log")
    matplotlib.pyplot.xlabel("")
    matplotlib.pyplot.ylabel("pip-installs/day, 30-day moving average")
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(loc="upper left", frameon=False)
    ax.set_xlim((datetime.date(2015, 5, 1), datetime.date(2019, 10, 22)))
    ax.set_ylim((0.01, vertical))
    ax.xaxis.set_minor_locator(matplotlib.dates.MonthLocator())
    ax.xaxis.set_major_locator(matplotlib.dates.MonthLocator(bymonth=[1, 5, 9]))
    ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%b %Y'))
    return ax
In [5]:
ax = plotit(dfsci, 3000, "Scientific Linux only (physicists)", None, dropminuit=True, dropml=True)
ax.get_figure().savefig("pip-scilinux-uproot.png")
ax.get_figure().savefig("pip-scilinux-uproot.pdf")
In [5]:
ax = plotit(dfsci, 3000, "Scientific Linux only (physicists)", None, dropminuit=False, dropml=True)
ax.get_figure().savefig("pip-scilinux-uproot-iminuit.png")
ax.get_figure().savefig("pip-scilinux-uproot-iminuit.pdf")
In [6]:
ax = plotit(dfsci, 3000, "Scientific Linux only (physicists)", None, dropminuit=True, dropmine=True, dropml=False)
ax.get_figure().savefig("pip-scilinux-ml.png")
ax.get_figure().savefig("pip-scilinux-ml.pdf")
In [7]:
ax = plotit(dfsci, 1700, "Scientific Linux only (physicists)", None, islog=False)
ax.get_figure().savefig("pip-scilinux-linear.png")
ax.get_figure().savefig("pip-scilinux-linear.pdf")
In [8]:
ax = dfany.groupby(["date"])["count"].sum().rolling(30).mean().plot(figsize=(10, 4))
matplotlib.pyplot.title("Scientific Linux only (physicists)")
matplotlib.pyplot.xlabel("")
matplotlib.pyplot.ylabel("pip-installs/day, 30-day moving average")
handles, labels = ax.get_legend_handles_labels()
ax.legend(["all pip packages"], loc="upper left", frameon=False)
ax.set_xlim((datetime.date(2015, 5, 1), datetime.date(2019, 10, 22)))
ax.set_ylim((0.01, 38000))
ax.xaxis.set_minor_locator(matplotlib.dates.MonthLocator())
ax.xaxis.set_major_locator(matplotlib.dates.MonthLocator(bymonth=[1, 5, 9]))
ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%b %Y'))
ax.get_figure().savefig("pip-scilinux-everything.png")
ax.get_figure().savefig("pip-scilinux-everything.pdf")
In [16]:
ax = plotit(dfos, 3000000, "Linux (not just physicists)", "Linux")
ax.get_figure().savefig("pip-linux.png")
ax.get_figure().savefig("pip-linux.pdf")
In [17]:
ax = plotit(dfos, 3000000, "MacOS (not just physicists)", "Darwin")
ax.get_figure().savefig("pip-macos.png")
ax.get_figure().savefig("pip-macos.pdf")
In [18]:
ax = plotit(dfos, 3000000, "Windows (not just physicists)", "Windows")
ax.get_figure().savefig("pip-windows.png")
ax.get_figure().savefig("pip-windows.pdf")
In [4]:
dfuproot = pandas.read_csv("github-issues-uproot.csv", parse_dates=[4, 5], infer_datetime_format=True)
dfmethods = pandas.read_csv("github-issues-uproot-methods.csv", parse_dates=[4, 5], infer_datetime_format=True)
dfawkward = pandas.read_csv("github-issues-awkward.csv", parse_dates=[4, 5], infer_datetime_format=True)

dfuproot["package"] = "uproot"
dfmethods["package"] = "uproot-methods"
dfawkward["package"] = "awkward-array"
for df in dfuproot, dfmethods, dfawkward:
    df["opened"] = 1
    df["closed"] = df["state"].apply(lambda x: 1 if x == "closed" else 0)

dfissues = pandas.concat([dfuproot, dfmethods, dfawkward])
In [7]:
fig, ax = matplotlib.pyplot.subplots(figsize=(3, 4))
for df in dfuproot, dfawkward, dfmethods:
    df[["date_created", "opened", "closed"]][~df["ispull"] & ~numpy.isin(df["user"], ["jpivarski", "reikdas"])].set_index("date_created", inplace=False).sort_index().cumsum().plot(ax=ax)
matplotlib.pyplot.xlabel("")
matplotlib.pyplot.ylabel("number of issues")
handles, labels = ax.get_legend_handles_labels()
ax.legend(["uproot opened", "uproot closed", "awkward opened", "awkward closed", "uproot-methods opened", "uproot-methods closed"], loc="upper left", frameon=False)
ax.set_xlim((datetime.date(2017, 9, 1), datetime.date(2019, 10, 22)))
ax.set_ylim((0, 180))
ax.xaxis.set_minor_locator(matplotlib.dates.MonthLocator())
ax.xaxis.set_major_locator(matplotlib.dates.MonthLocator(bymonth=[1, 5, 9]))
ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%b %Y'))
ax.get_figure().savefig("uproot-issues.png")
ax.get_figure().savefig("uproot-issues.pdf")
In [10]:
ax = dfissues[["date_created", "numcomments"]].set_index("date_created", inplace=False).sort_index().cumsum().plot(figsize=(3, 4))
matplotlib.pyplot.xlabel("")
matplotlib.pyplot.ylabel("number of comments (issues & PRs)")
handles, labels = ax.get_legend_handles_labels()
ax.get_legend().remove()
ax.set_xlim((datetime.date(2017, 9, 1), datetime.date(2019, 10, 22)))
ax.set_ylim((0, 2200))
ax.xaxis.set_minor_locator(matplotlib.dates.MonthLocator())
ax.xaxis.set_major_locator(matplotlib.dates.MonthLocator(bymonth=[1, 5, 9]))
ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%b %Y'))
ax.get_figure().savefig("uproot-comments.png")
ax.get_figure().savefig("uproot-comments.pdf")
In [11]:
ax = dfissues[["user", "date_created", "opened"]][~dfissues["ispull"]].groupby("user").min().set_index("date_created", inplace=False).sort_index().cumsum().plot(figsize=(3, 4))
matplotlib.pyplot.xlabel("")
matplotlib.pyplot.ylabel("number of users opening issues")
handles, labels = ax.get_legend_handles_labels()
ax.get_legend().remove()
ax.set_xlim((datetime.date(2017, 9, 1), datetime.date(2019, 10, 22)))
ax.set_ylim((0, 115))
ax.xaxis.set_minor_locator(matplotlib.dates.MonthLocator())
ax.xaxis.set_major_locator(matplotlib.dates.MonthLocator(bymonth=[1, 5, 9]))
ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%b %Y'))
ax.get_figure().savefig("uproot-users.png")
ax.get_figure().savefig("uproot-users.pdf")
In [21]:
isclosed = dfissues[~dfissues["ispull"] & dfissues["closed"]]

ax = numpy.log10((isclosed["date_closed"].astype(isclosed["date_created"].dtype) - isclosed["date_created"]).astype(int) / 1e9).hist(bins=20, range=(2, 8), grid=False, histtype="step", figsize=(3, 4))

ax.xaxis.set_ticks(numpy.log10([60, 60*60, 24*60*60, 30*24*60*60, 365*24*60*60]))
ax.xaxis.set_ticklabels(["min", "hour", "day", "month", "    year"])
matplotlib.pyplot.xlabel("logarithmic issue response time")
matplotlib.pyplot.ylabel("number of issues per bin")
ax.get_figure().savefig("uproot-response-time.png")
ax.get_figure().savefig("uproot-response-time.pdf")
In [25]:
isclosed = dfissues[~dfissues["ispull"] & dfissues["closed"]]

ax = ((isclosed["date_closed"].astype(isclosed["date_created"].dtype) - isclosed["date_created"]).astype(int) / (1e9*60*60*24)).hist(bins=20, range=(0, 7), grid=False, histtype="step", figsize=(3, 4))

matplotlib.pyplot.xlabel("linear issue response time (days)")
matplotlib.pyplot.ylabel("number of issues per bin")
ax.xaxis.set_ticks([0, 1, 2, 3, 4, 5, 6, 7])
ax.xaxis.set_ticklabels(map(str, [0, 1, 2, 3, 4, 5, 6, 7]))
ax.get_figure().savefig("uproot-response-time-linear.png")
ax.get_figure().savefig("uproot-response-time-linear.pdf")
In [17]:
dfos.groupby(["os"]).count().sort_values("count", ascending=False);
In [2]:
dfgit = pandas.read_csv("github-cmssw.csv", parse_dates=[3], infer_datetime_format=True)
dfgit.columns
Out[2]:
Index(['repo', 'owner', 'isfork', 'created', 'language'], dtype='object')
In [3]:
dfgit["lang"] = dfgit["language"].apply(lambda x: {
    "'C++'": "C/C++", "'C'": "C/C++", "'C#'": "C/C++", "'Objective-C'": "C/C++",
    "'Python'": "Python",
    "'Jupyter Notebook'": "Jupyter",
    "'HTML'": "HTML/CSS/JS", "'CSS'": "HTML/CSS/JS", "'JavaScript'": "HTML/CSS/JS", "'TypeScript'": "HTML/CSS/JS",
        "'CoffeeScript'": "HTML/CSS/JS", "'PHP'": "HTML/CSS/JS",
    "'Java'": "Java",
    "'TeX'": "TeX/LaTeX",
    "'Go'": "Go",
    "'VHDL'": "VHDL/Verilog", "'Verilog'": "VHDL/Verilog",
    "'R'": "R",
    "'Fortran'": "Fortran", "'FORTRAN'": "Fortran",
    "'Matlab'": "Matlab", "'MATLAB'": "Matlab",
    "'Julia'": "Julia",
    "'Mathematica'": "Mathematica",
    "None": "(unknown)",
    }.get(x, "(other)"))
which = ["C/C++", "Python", "Jupyter", "(unknown)", "(other)", "HTML/CSS/JS", "Java", "TeX/LaTeX", "Go", "VHDL/Verilog", "R", "Fortran", "Matlab", "Julia", "Mathematica"]
In [20]:
dfgit.groupby("lang").count().sort_values("repo", ascending=False);
In [4]:
summary = dfgit[~dfgit["isfork"]]["lang"].groupby([dfgit["created"].dt.to_period("M"), dfgit["lang"]]).count().unstack().fillna(0)
summary = summary[reversed(which)]
summary;
In [5]:
color_cycle = (matplotlib.pyplot.rcParams['axes.prop_cycle'].by_key()['color'] * 2)[:len(which)]
color_cycle[0] = "#4e9fd9"
color_cycle[3] = "#ffff00"
color_cycle[4] = "#ff8080"
color_cycle = list(reversed(color_cycle))
In [6]:
ax = summary.plot.bar(width=1, stacked=True, figsize=(12, 5), color=color_cycle)
labels = []
for x in ax.xaxis.get_ticklabels():
    if x.get_text()[:4] != "2009" and x.get_text().endswith("-01"):
        labels.append("Jan " + x.get_text()[:4])
    elif x.get_text()[:4] != "2009" and x.get_text().endswith("-05"):
        labels.append("May " + x.get_text()[:4])
    elif x.get_text()[:4] != "2009" and x.get_text().endswith("-09"):
        labels.append("Sep " + x.get_text()[:4])
    else:
        labels.append("")
ax.set_xticklabels(labels)
matplotlib.pyplot.xlabel("")
matplotlib.pyplot.ylabel("number of non-fork repositories")
matplotlib.pyplot.title("GitHub users who forked CMSSW")
handles, labels = ax.get_legend_handles_labels()
ax.legend([handles[labels.index(x)] for x in which], which, loc="upper left", frameon=False, borderpad=1.25)
ax.get_figure().savefig("github-linear.png")
ax.get_figure().savefig("github-linear.pdf")
In [7]:
summary2 = summary.copy()
total = numpy.zeros(len(summary), dtype=float)
for x in which:
    total += summary2[x]
for x in which:
    summary2[x] = summary2[x] / total
summary2 = summary2.fillna(0)
In [8]:
ax = summary2.plot.bar(width=1, stacked=True, figsize=(12, 5), color=color_cycle)
labels = []
for x in ax.xaxis.get_ticklabels():
    if x.get_text()[:4] != "2009" and x.get_text().endswith("-01"):
        labels.append("Jan " + x.get_text()[:4])
    elif x.get_text()[:4] != "2009" and x.get_text().endswith("-05"):
        labels.append("May " + x.get_text()[:4])
    elif x.get_text()[:4] != "2009" and x.get_text().endswith("-09"):
        labels.append("Sep " + x.get_text()[:4])
    else:
        labels.append("")
ax.set_xticklabels(labels)
ax.yaxis.grid(color="black", linestyle="--")
matplotlib.pyplot.xlabel("")
matplotlib.pyplot.ylabel("fraction of non-fork repositories")
matplotlib.pyplot.title("GitHub users who forked CMSSW")
handles, labels = ax.get_legend_handles_labels()
ax.legend([handles[labels.index(x)] for x in which], which, loc="center left", frameon=True, framealpha=1, borderpad=1.25)
ax.set_ylim((0, 1))
ax.get_figure().savefig("github-fraction.png")
ax.get_figure().savefig("github-fraction.pdf")
In [13]:
ax = summary2[["C/C++", "Python", "Jupyter"]].plot(figsize=(12, 5))
ax.yaxis.grid(color="gray", linestyle="--")
matplotlib.pyplot.xlabel("")
matplotlib.pyplot.ylabel("fraction of non-fork repositories")
matplotlib.pyplot.title("GitHub users who forked CMSSW")
ax.set_ylim((0, 1))
ax.set_xlim((521, 596))
ax.legend(loc="upper left", frameon=False)
ax.get_figure().savefig("github-simplefraction.png")
ax.get_figure().savefig("github-simplefraction.pdf")