This data was generated in the Git repository
JavaOnAutobahn/spring-petclinic
with
git log --numstat > git_log_numstat.log
.
This exports the history of the Git repository including some information about the file changes per commit.
Here is an excerpt from the resulting dataset:
commit 4d3d9de655faa813781027d8b1baed819c6a56fe
Author: Markus Harrer <feststelltaste@googlemail.com>
Date: Tue Mar 5 22:32:20 2019 +0100
add virtual bounded contexts
20 1 jqassistant/business.adoc
It doesn't contain any tabular, structured data either, but rather a row-based style of data (hint: if you want tabular output, you can use Git's --format
options to create it).
The question is: Can we get this kind of data into a pandas DataFrame?
Warning: Please only read on if you can stand all the brain pain that follows.
import pandas as pd
# Read the log line by line instead of going through the CSV parser:
# recent pandas versions reject sep="\n" with a ValueError, and a CSV reader
# would also treat double quotes inside commit messages as field quoting.
with open("../../joa_spring-petclinic/git_log_numstat.log", encoding="utf-8") as log_file:
    # Skip blank lines (read_csv does this by default), but keep leading
    # whitespace: the indentation is what marks commit message lines later on.
    raw_lines = [line.rstrip("\n") for line in log_file if line.strip()]
log = pd.DataFrame({'raw': raw_lines})
log.head()
raw | |
---|---|
0 | commit 4d3d9de655faa813781027d8b1baed819c6a56fe |
1 | Author: Markus Harrer <feststelltaste@googlema... |
2 | Date: Tue Mar 5 22:32:20 2019 +0100 |
3 | add virtual bounded contexts |
4 | test\ttest\t |
# Each commit-level field sits on its own line and is recognised by its prefix.
# Rows that do not carry the prefix end up as NaN in the respective column.
field_prefixes = (
    ('sha', "commit "),
    ('author', "Author: "),
    ('timestamp', "Date: "),
)
for column, prefix in field_prefixes:
    has_prefix = log['raw'].str.startswith(prefix)
    # Take the text that follows the prefix.
    log[column] = log.loc[has_prefix, 'raw'].str.split(prefix).str.get(1)
log.head()
raw | sha | author | timestamp | |
---|---|---|---|---|
0 | commit 4d3d9de655faa813781027d8b1baed819c6a56fe | 4d3d9de655faa813781027d8b1baed819c6a56fe | NaN | NaN |
1 | Author: Markus Harrer <feststelltaste@googlema... | NaN | Markus Harrer <feststelltaste@googlemail.com> | NaN |
2 | Date: Tue Mar 5 22:32:20 2019 +0100 | NaN | NaN | Tue Mar 5 22:32:20 2019 +0100 |
3 | add virtual bounded contexts | NaN | NaN | NaN |
4 | test\ttest\t | NaN | NaN | NaN |
# Commit message lines are the ones indented by four spaces; cut that indent off.
indent = " "*4
is_message_line = log['raw'].str.startswith(indent)
log['message'] = log.loc[is_message_line, 'raw'].str.slice(len(indent))
log.head()
raw | sha | author | timestamp | message | |
---|---|---|---|---|---|
0 | commit 4d3d9de655faa813781027d8b1baed819c6a56fe | 4d3d9de655faa813781027d8b1baed819c6a56fe | NaN | NaN | NaN |
1 | Author: Markus Harrer <feststelltaste@googlema... | NaN | Markus Harrer <feststelltaste@googlemail.com> | NaN | NaN |
2 | Date: Tue Mar 5 22:32:20 2019 +0100 | NaN | NaN | Tue Mar 5 22:32:20 2019 +0100 | NaN |
3 | add virtual bounded contexts | NaN | NaN | NaN | add virtual bounded contexts |
4 | test\ttest\t | NaN | NaN | NaN | test\ttest\t |
# Flag the rows from which none of the commit-level fields could be parsed.
commit_fields = ['sha', 'author', 'timestamp', 'message']
log['no_entry'] = log[commit_fields].isna().all(axis=1)
log.head()
raw | sha | author | timestamp | message | no_entry | |
---|---|---|---|---|---|---|
0 | commit 4d3d9de655faa813781027d8b1baed819c6a56fe | 4d3d9de655faa813781027d8b1baed819c6a56fe | NaN | NaN | NaN | False |
1 | Author: Markus Harrer <feststelltaste@googlema... | NaN | Markus Harrer <feststelltaste@googlemail.com> | NaN | NaN | False |
2 | Date: Tue Mar 5 22:32:20 2019 +0100 | NaN | NaN | Tue Mar 5 22:32:20 2019 +0100 | NaN | False |
3 | add virtual bounded contexts | NaN | NaN | NaN | add virtual bounded contexts | False |
4 | test\ttest\t | NaN | NaN | NaN | test\ttest\t | False |
# Propagate each commit hash down to all following rows until the next
# "commit" line, so that every row knows which commit it belongs to.
# Series.ffill() replaces fillna(method="ffill"), which is deprecated in
# pandas 2.x.
log['sha'] = log['sha'].ffill()
log.head()
raw | sha | author | timestamp | message | no_entry | |
---|---|---|---|---|---|---|
0 | commit 4d3d9de655faa813781027d8b1baed819c6a56fe | 4d3d9de655faa813781027d8b1baed819c6a56fe | NaN | NaN | NaN | False |
1 | Author: Markus Harrer <feststelltaste@googlema... | 4d3d9de655faa813781027d8b1baed819c6a56fe | Markus Harrer <feststelltaste@googlemail.com> | NaN | NaN | False |
2 | Date: Tue Mar 5 22:32:20 2019 +0100 | 4d3d9de655faa813781027d8b1baed819c6a56fe | NaN | Tue Mar 5 22:32:20 2019 +0100 | NaN | False |
3 | add virtual bounded contexts | 4d3d9de655faa813781027d8b1baed819c6a56fe | NaN | NaN | add virtual bounded contexts | False |
4 | test\ttest\t | 4d3d9de655faa813781027d8b1baed819c6a56fe | NaN | NaN | test\ttest\t | False |
# Gather the message lines of every commit and glue them into one string,
# separated by single spaces.
message_rows = log[log['message'].notna()]
sha_msg = message_rows.groupby('sha')['message'].apply(' '.join)
sha_msg.head()
sha 024811d252f8d8218e6795d46203cff25971bc19 simplifying access to Integer 0365d34d2977dd24ec0bb3e8b0edff5694908c80 downgrade jqassistant due to weird error 0504ec9fe345d9d34b15c374333f709fb147e6d6 Update petclinic_db_setup_mysql.txt Correct in... 053c84ecc95b246ef4a40fb3d4304e8908604af4 migrated to Spring 4.0.1 057015c14cce4791ff309419de8a8bd6339fd6e7 Spring MVC Test Framework and migration to Spr... Name: message, dtype: object
# Keep only the rows flagged as 'no_entry' (no commit-level field parsed)
# and index their raw text by the commit hash.
sha_files = log.loc[log['no_entry'], ['sha', 'raw']].set_index('sha')
sha_files.head()
raw | |
---|---|
sha | |
4d3d9de655faa813781027d8b1baed819c6a56fe | 20\t1\tjqassistant/business.adoc |
4d3d9de655faa813781027d8b1baed819c6a56fe | 1\t1\tsrc/main/java/org/springframework/sample... |
4d3d9de655faa813781027d8b1baed819c6a56fe | 2\t0\tsrc/main/java/org/springframework/sample... |
4d3d9de655faa813781027d8b1baed819c6a56fe | 2\t1\tsrc/main/java/org/springframework/sample... |
4d3d9de655faa813781027d8b1baed819c6a56fe | 2\t0\tsrc/main/java/org/springframework/sample... |
# Break the tab-separated raw text into its three parts and keep only those.
numstat_parts = sha_files['raw'].str.split("\t", expand=True)
numstat_parts.columns = ['additions', 'deletions', 'filename']
sha_files = numstat_parts
sha_files.head()
additions | deletions | filename | |
---|---|---|---|
sha | |||
4d3d9de655faa813781027d8b1baed819c6a56fe | 20 | 1 | jqassistant/business.adoc |
4d3d9de655faa813781027d8b1baed819c6a56fe | 1 | 1 | src/main/java/org/springframework/samples/petc... |
4d3d9de655faa813781027d8b1baed819c6a56fe | 2 | 0 | src/main/java/org/springframework/samples/petc... |
4d3d9de655faa813781027d8b1baed819c6a56fe | 2 | 1 | src/main/java/org/springframework/samples/petc... |
4d3d9de655faa813781027d8b1baed819c6a56fe | 2 | 0 | src/main/java/org/springframework/samples/petc... |
# One row per commit: pick the first author and timestamp value found
# within each commit's group of rows.
rows_by_commit = log.groupby('sha')
df = rows_by_commit[['author', 'timestamp']].first()
df.head()
author | timestamp | |
---|---|---|
sha | ||
024811d252f8d8218e6795d46203cff25971bc19 | Mic <misvy@vmware.com> | Thu Mar 14 18:04:36 2013 +0800 |
0365d34d2977dd24ec0bb3e8b0edff5694908c80 | Markus Harrer <feststelltaste@googlemail.com> | Mon Nov 12 10:28:34 2018 +0100 |
0504ec9fe345d9d34b15c374333f709fb147e6d6 | thinksh <thinkshihang@gmail.com> | Wed Feb 3 23:19:46 2016 -0500 |
053c84ecc95b246ef4a40fb3d4304e8908604af4 | Mic <misvy@vmware.com> | Mon Feb 3 09:31:44 2014 +0800 |
057015c14cce4791ff309419de8a8bd6339fd6e7 | Mic <misvy@vmware.com> | Fri Feb 15 15:31:04 2013 +0800 |
# Attach the joined commit messages; values are aligned on the commit hash index.
df = df.assign(message=sha_msg)
df.head()
author | timestamp | message | |
---|---|---|---|
sha | |||
024811d252f8d8218e6795d46203cff25971bc19 | Mic <misvy@vmware.com> | Thu Mar 14 18:04:36 2013 +0800 | simplifying access to Integer |
0365d34d2977dd24ec0bb3e8b0edff5694908c80 | Markus Harrer <feststelltaste@googlemail.com> | Mon Nov 12 10:28:34 2018 +0100 | downgrade jqassistant due to weird error |
0504ec9fe345d9d34b15c374333f709fb147e6d6 | thinksh <thinkshihang@gmail.com> | Wed Feb 3 23:19:46 2016 -0500 | Update petclinic_db_setup_mysql.txt Correct in... |
053c84ecc95b246ef4a40fb3d4304e8908604af4 | Mic <misvy@vmware.com> | Mon Feb 3 09:31:44 2014 +0800 | migrated to Spring 4.0.1 |
057015c14cce4791ff309419de8a8bd6339fd6e7 | Mic <misvy@vmware.com> | Fri Feb 15 15:31:04 2013 +0800 | Spring MVC Test Framework and migration to Spr... |
# Combine commit-level data with the per-file rows. The right join on the
# commit hash index keeps every file row and repeats the commit data for it.
df = pd.merge(df, sha_files, how='right', left_index=True, right_index=True)
df.head()
author | timestamp | message | additions | deletions | filename | |
---|---|---|---|---|---|---|
sha | ||||||
024811d252f8d8218e6795d46203cff25971bc19 | Mic <misvy@vmware.com> | Thu Mar 14 18:04:36 2013 +0800 | simplifying access to Integer | 1 | 1 | src/main/java/org/springframework/samples/petc... |
0365d34d2977dd24ec0bb3e8b0edff5694908c80 | Markus Harrer <feststelltaste@googlemail.com> | Mon Nov 12 10:28:34 2018 +0100 | downgrade jqassistant due to weird error | 1 | 1 | pom.xml |
0504ec9fe345d9d34b15c374333f709fb147e6d6 | thinksh <thinkshihang@gmail.com> | Wed Feb 3 23:19:46 2016 -0500 | Update petclinic_db_setup_mysql.txt Correct in... | 1 | 1 | src/main/resources/db/mysql/petclinic_db_setup... |
053c84ecc95b246ef4a40fb3d4304e8908604af4 | Mic <misvy@vmware.com> | Mon Feb 3 09:31:44 2014 +0800 | migrated to Spring 4.0.1 | 1 | 1 | pom.xml |
057015c14cce4791ff309419de8a8bd6339fd6e7 | Mic <misvy@vmware.com> | Fri Feb 15 15:31:04 2013 +0800 | Spring MVC Test Framework and migration to Spr... | 1 | 18 | .springBeans |