cd ~/projekte/openbsd/openbsd-src/
/home/dloss/projekte/openbsd/openbsd-src
!git log --format=format:"%ai, %an" > ../commits
cd ..
/home/dloss/projekte/openbsd
ls
commits genucommits.pdf openbsd-src/
!head commits
2013-04-30 13:57:02 +0000, patrick 2013-04-30 13:23:52 +0000, patrick 2013-04-30 13:10:45 +0000, patrick 2013-04-30 13:05:44 +0000, jmc 2013-04-30 13:04:25 +0000, patrick 2013-04-30 13:00:21 +0000, patrick 2013-04-30 12:30:40 +0000, florian 2013-04-30 12:29:04 +0000, florian 2013-04-30 12:07:21 +0000, eric 2013-04-30 12:02:39 +0000, eric
import pandas as pd
%time df=pd.read_csv("commits", header=None, names=["time", "author"])
CPU times: user 0.39 s, sys: 0.04 s, total: 0.43 s Wall time: 0.43 s
df
<class 'pandas.core.frame.DataFrame'> Int64Index: 142758 entries, 0 to 142757 Data columns (total 2 columns): time 142758 non-null values author 142758 non-null values dtypes: object(2)
Es gibt mehr als 142.000 Commits seit Oktober 1995.
df.head()
time | author | |
---|---|---|
0 | 2013-04-30 13:57:02 +0000 | patrick |
1 | 2013-04-30 13:23:52 +0000 | patrick |
2 | 2013-04-30 13:10:45 +0000 | patrick |
3 | 2013-04-30 13:05:44 +0000 | jmc |
4 | 2013-04-30 13:04:25 +0000 | patrick |
df.author.value_counts()
deraadt 20602 miod 7559 millert 7046 jmc 6274 mickey 5860 espie 4561 henning 3327 brad 3321 niklas 3039 art 3012 kettenis 2939 jason 2732 markus 2368 claudio 2216 krw 2108 ... hsuenaga 1 rfreeman 1 tomo 1 gonzalo 1 kristaps 1 joey 1 mhitch 1 fgont 1 ehrhardt 1 tqbf 1 yanick 1 matt 1 obecian 1 tobiasu 1 felix 1 Length: 307, dtype: int64
In echte Zeiten konvertieren (dauert lange):
%time df.index = pd.to_datetime(df['time'])
CPU times: user 69.55 s, sys: 1.32 s, total: 70.87 s Wall time: 70.18 s
df.tail()
time | author | |
---|---|---|
time | ||
1995-10-18 08:43:09 | 1995-10-18 08:43:09 +0000 | deraadt |
1995-10-18 08:43:09 | 1995-10-18 08:43:09 +0000 | deraadt |
1995-10-18 08:38:49 | 1995-10-18 08:38:49 +0000 | deraadt |
1995-10-18 08:38:49 | 1995-10-18 08:38:49 +0000 | deraadt |
1995-10-18 08:53:39 | 1995-10-18 08:53:39 +0000 | deraadt |
df.sort_index(inplace=True)
df.tail()
time | author | |
---|---|---|
time | ||
2013-04-30 13:04:25 | 2013-04-30 13:04:25 +0000 | patrick |
2013-04-30 13:05:44 | 2013-04-30 13:05:44 +0000 | jmc |
2013-04-30 13:10:45 | 2013-04-30 13:10:45 +0000 | patrick |
2013-04-30 13:23:52 | 2013-04-30 13:23:52 +0000 | patrick |
2013-04-30 13:57:02 | 2013-04-30 13:57:02 +0000 | patrick |
# Drop the raw timestamp strings; the parsed values now live in the index.
del df['time']
# Constant column of ones so that summing or cumsum-ing "c" counts commits.
df["c"]=1
df['author'][2343]
' tholo'
Leerzeichen am Anfang und Ende von Autorennamen entfernen:
# Vectorised strip of leading/trailing whitespace from every author name
# (the CSV fields carry a leading space, e.g. ' tholo'). Unlike a per-element
# lambda, the .str accessor also passes missing values through instead of raising.
df['author'] = df['author'].str.strip()
df.head()
author | c | |
---|---|---|
time | ||
1995-10-18 08:38:49 | deraadt | 1 |
1995-10-18 08:38:49 | deraadt | 1 |
1995-10-18 08:43:09 | deraadt | 1 |
1995-10-18 08:43:09 | deraadt | 1 |
1995-10-18 08:43:49 | deraadt | 1 |
commits_per_person = df.author.value_counts()
commits_per_person.describe()
count 307.000000 mean 465.009772 std 1495.620881 min 1.000000 25% 13.000000 50% 68.000000 75% 350.000000 max 20602.000000 dtype: float64
307 Committer? (Zumindest gibt es 307 verschiedene Commit-Autorennamen.)
%pylab inline
Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.zmq.pylab.backend_inline]. For more information, type 'help(pylab)'.
Größe der Grafik heraufsetzen:
import matplotlib.pyplot as plt
# Default figure size in inches (width, height). Set through rcParams because
# matplotlib.pyplot itself has no figsize() function; that helper only exists
# when an IPython pylab environment attaches it.
plt.rcParams["figure.figsize"] = (10, 6)
commits_per_person.plot()
<matplotlib.axes.AxesSubplot at 0x1679a10>
# value_counts() returns the counts in descending order, so the first 30
# entries are the 30 authors with the most commits.
top30=commits_per_person[:30]
top30
deraadt 20602 miod 7559 millert 7046 jmc 6274 mickey 5860 espie 4561 henning 3327 brad 3321 niklas 3039 art 3012 kettenis 2939 jason 2732 markus 2368 claudio 2216 krw 2108 dlg 2053 drahn 1840 jsg 1790 angelos 1671 aaron 1643 nicm 1551 otto 1550 itojun 1520 fgsch 1401 damien 1349 tedu 1306 djm 1269 reyk 1120 downsj 1109 schwarze 1069 dtype: int64
top30.plot(kind="barh")
<matplotlib.axes.AxesSubplot at 0x52f3f90>
commits_per_person['markus']
2368
# Hand-picked author names to look at separately.
genuesen = ["markus", "bluhm", "mpf", "hshoexer", "grunk"]
# Pull just those authors' totals out of the per-author commit counts.
genu_commits = commits_per_person.ix[genuesen]
genu_commits
markus 2368 bluhm 338 mpf 252 hshoexer 591 grunk 83 dtype: int64
# All commits by each of the two authors (boolean-mask selection on "author").
bluhm = df[df.author == "bluhm"]
mpf = df[df.author == "mpf"]
# "c" is 1 for every commit, so its running sum is the author's cumulative
# commit count over time.
bluhm.c.cumsum().plot(style="r", label="bluhm")
mpf.c.cumsum().plot(style="--", label="mpf")
# title() and legend() are available unqualified because of %pylab.
title("mpf vs. bluhm")
legend(loc="best")
<matplotlib.legend.Legend at 0x552ded0>
# Per-author subsets for the remaining three names.
markus = df[df.author == "markus"]
hshoexer = df[df.author == "hshoexer"]
grunk = df[df.author == "grunk"]
# Cumulative commit counts for all five authors; each curve gets its own
# matplotlib style string, and three of them are drawn with alpha=0.2.
bluhm.c.cumsum().plot(style="r.", label="bluhm", alpha=0.2)
mpf.c.cumsum().plot(style="--.", label="mpf", alpha=0.2)
hshoexer.c.cumsum().plot(style="g-..", label="hshoexer")
grunk.c.cumsum().plot(style="m--.", label="grunk")
markus.c.cumsum().plot(style='k:.', label="markus", alpha=0.2)
legend(loc=0)
# Write the current figure to genucommits.pdf (relative to the working directory).
savefig("genucommits.pdf")
bluhm.head(1)
author | c | |
---|---|---|
time | ||
2007-01-24 13:24:58 | bluhm | 1 |
mpf.head(1)
author | c | |
---|---|---|
time | ||
2004-01-05 23:53:24 | mpf | 1 |
markus.head(1)
author | c | |
---|---|---|
time | ||
1999-10-03 19:17:40 | markus | 1 |
hshoexer.tail(1)
author | c | |
---|---|---|
time | ||
2009-01-29 10:05:50 | hshoexer | 1 |
grunk.tail(1)
author | c | |
---|---|---|
time | ||
2009-10-01 11:37:33 | grunk | 1 |
df.ix['2013']
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 2208 entries, 2013-01-01 00:02:01 to 2013-04-30 13:57:02 Data columns (total 2 columns): author 2208 non-null values c 2208 non-null values dtypes: int64(1), object(1)
df.c.cumsum().plot()
<matplotlib.axes.AxesSubplot at 0x5986890>
Recht stetiges Wachstum. Wird aber langsamer. Hmm...??
# Sum "c" per calendar month, then accumulate: total number of commits up to
# the end of each month.
cmon=df.resample("M", how="sum").c.cumsum()
# Plain NumPy array of those cumulative totals; this is the target of the
# polynomial fit below.
y=cmon.values
y
array([ 196, 394, 799, 1126, 1346, 1533, 1717, 2333, 2927, 3335, 4039, 4735, 5330, 5652, 6147, 6673, 7358, 7750, 8560, 9297, 9932, 10709, 11221, 11802, 12294, 12741, 13141, 13417, 13852, 14484, 14816, 15259, 15621, 16203, 16686, 17279, 17585, 18047, 18438, 18854, 19585, 20143, 20553, 21058, 21493, 22162, 22726, 23338, 23865, 24392, 25050, 25898, 26408, 27032, 27949, 28426, 29173, 29757, 30220, 30767, 31434, 31930, 32382, 33241, 34017, 34937, 35577, 36533, 38236, 39212, 40442, 41478, 42122, 42838, 43539, 44341, 45358, 46182, 46972, 47909, 49421, 50387, 50882, 51505, 52007, 52619, 53370, 54163, 54931, 55779, 56657, 57733, 58839, 59647, 60389, 60960, 61640, 62143, 62986, 64162, 65225, 65917, 66718, 67519, 68404, 69346, 70226, 71031, 71647, 72483, 73487, 74358, 74941, 76016, 77245, 78447, 79155, 80005, 80827, 81584, 82619, 83757, 84943, 85879, 86559, 87673, 88622, 89883, 91002, 91834, 92737, 93233, 94028, 95103, 95887, 96582, 97511, 98444, 99362, 100740, 101582, 102054, 102470, 103309, 104166, 105154, 105726, 106269, 106742, 107278, 107791, 108603, 109685, 110339, 110897, 111336, 111906, 112698, 113396, 114075, 114624, 115236, 115970, 116685, 117807, 118466, 119210, 119672, 120410, 121375, 122024, 122700, 123310, 123819, 124533, 125124, 126019, 127017, 127556, 128249, 128767, 129158, 129603, 130165, 130300, 130706, 131552, 132037, 132555, 133557, 133780, 134245, 134582, 134995, 135294, 135748, 135950, 136323, 136820, 137161, 137500, 138145, 138737, 139254, 139748, 140222, 140550, 141060, 141355, 142187, 142758])
x=arange(cmon.size)
x
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210])
p=np.polyfit(x,y, 3)
p
array([ -1.87535016e-02, 6.07155155e+00, 2.17719734e+02, 1.83929708e+03])
# Month indices 0..299, i.e. beyond the observed range (0..210), to
# extrapolate the fitted cubic.
x1=arange(300)
# Observed cumulative commit counts ...
plot(x,y)
# ... and the fitted polynomial evaluated over the extended range, in red.
plot(x1,np.polyval(p,x1), "r")
xlabel("months")
<matplotlib.text.Text at 0x5ef8210>
np.polyval(p,x1).max()
144967.34021697595
y.max()
142758
pd.Series(np.polyval(p,x1)).idxmax()
232
pd.Series(x).idxmax()
210
per_day=df.resample("D", how="sum")
per_day.head()
c | |
---|---|
time | |
1995-10-18 | 52 |
1995-10-19 | 16 |
1995-10-20 | 4 |
1995-10-21 | 8 |
1995-10-22 | 14 |
per_day.c.max()
221.0
per_day.c.idxmax()
<Timestamp: 2001-06-25 00:00:00>
df.ix["2001-06-25"]
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 221 entries, 2001-06-25 00:00:44 to 2001-06-25 23:49:28 Data columns (total 2 columns): author 221 non-null values c 221 non-null values dtypes: int64(1), object(1)
per_month=df.resample("M", how="sum")
per_month.max()
c 1703 dtype: int64
per_month.idxmax()
c 2001-06-30 00:00:00 dtype: datetime64[ns]
Die meisten Commits gab es im Juni 2001: 1703 Stück
per_month.c.describe()
count 211.000000 mean 676.578199 std 261.998541 min 135.000000 25% 495.000000 50% 645.000000 75% 840.500000 max 1703.000000 dtype: float64
Die Anzahl der Commits schwankt stark.
per_month.c.plot()
<matplotlib.axes.AxesSubplot at 0x688fb50>
pd.rolling_mean(per_month.c, 15).plot()
<matplotlib.axes.AxesSubplot at 0x6eb2490>
Neue Spalte erzeugen mit dem Zeitstempel, denn der Index lässt sich nicht so gut auswerten.
df["datetime"]=df.index
df.head()
author | c | datetime | |
---|---|---|---|
time | |||
1995-10-18 08:38:49 | deraadt | 1 | 1995-10-18 08:38:49 |
1995-10-18 08:38:49 | deraadt | 1 | 1995-10-18 08:38:49 |
1995-10-18 08:43:09 | deraadt | 1 | 1995-10-18 08:43:09 |
1995-10-18 08:43:09 | deraadt | 1 | 1995-10-18 08:43:09 |
1995-10-18 08:43:49 | deraadt | 1 | 1995-10-18 08:43:49 |
df['weekday'] = df['datetime'].apply(lambda x: x.isoweekday())
Montag = 1, Sonntag = 7
df.head()
author | c | datetime | weekday | |
---|---|---|---|---|
time | ||||
1995-10-18 08:38:49 | deraadt | 1 | 1995-10-18 08:38:49 | 3 |
1995-10-18 08:38:49 | deraadt | 1 | 1995-10-18 08:38:49 | 3 |
1995-10-18 08:43:09 | deraadt | 1 | 1995-10-18 08:43:09 | 3 |
1995-10-18 08:43:09 | deraadt | 1 | 1995-10-18 08:43:09 | 3 |
1995-10-18 08:43:49 | deraadt | 1 | 1995-10-18 08:43:49 | 3 |
# Keep only the commit counter and the weekday number.
wd=df.ix[:,["c","weekday"]]
# Group by ISO weekday (1 = Monday ... 7 = Sunday) to get per-weekday totals.
per_weekday=wd.groupby("weekday")
per_weekday.sum()
c | |
---|---|
weekday | |
1 | 22131 |
2 | 22230 |
3 | 21833 |
4 | 21516 |
5 | 20204 |
6 | 16665 |
7 | 18179 |
Die meisten Commits am Wochenanfang. Am Wochenende weniger (insbesondere Samstag).
per_weekday.sum().plot(kind="bar")
<matplotlib.axes.AxesSubplot at 0x694aa10>
# Hour of day of each commit, taken from the timestamp column.
df['hour'] = df['datetime'].apply(lambda x: x.hour)
# Total number of commits for each hour of day.
per_hour = df.groupby('hour')['c'].sum()
per_hour.plot(kind="bar")
<matplotlib.axes.AxesSubplot at 0x7d137d0>
Die meisten Commits abends (die Zeitstempel sind in UTC, also nicht unbedingt die Ortszeit der Committer).