Note: this notebook was updated in July 2021 to point to newer versions of its dependencies; the versions it originally used had become deprecated and prevented the notebook from running to completion. It was also used for a presentation of Kotlin's Jupyter kernel in March 2021, so the 2020 season data, which did not yet exist when the original article was written, was added.
// lets-plot and krangl are both "supported" packages, so a single %use line
// lets us skip the full dependency & import boilerplate
%use lets-plot, krangl
// csv is courtesy of pro-football-reference: https://www.pro-football-reference.com/years/NFL/scoring.htm
// load the league-wide scoring table (one row per season) into a krangl DataFrame
val dfScoring = DataFrame.readCSV("nfl_scoring.csv")
// a bare expression on the last line of a cell is rendered as that cell's output
dfScoring
Rk | Year | Tms | RshTD | RecTD | PR TD | KR TD | FblTD | IntTD | OthTD | AllTD | 2PM | 2PA | XPM | XPA | FGM | FGA | Sfty | Pts | Pts/G |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2020 | 32 | 532 | 871 | 8 | 7 | 21 | 31 | 3 | 1473 | 63 | 131 | 1244 | 1338 | 812 | 960 | 24 | 12692 | 24.8 |
2 | 2019 | 32 | 447 | 797 | 7 | 7 | 34 | 35 | 5 | 1332 | 54 | 113 | 1136 | 1210 | 802 | 983 | 17 | 11676 | 22.8 |
3 | 2018 | 32 | 439 | 847 | 7 | 5 | 24 | 45 | 4 | 1371 | 66 | 129 | 1164 | 1235 | 802 | 947 | 10 | 11948 | 23.3 |
4 | 2017 | 32 | 380 | 741 | 10 | 7 | 41 | 42 | 4 | 1225 | 37 | 82 | 1066 | 1134 | 866 | 1027 | 15 | 11118 | 21.7 |
5 | 2016 | 32 | 443 | 786 | 10 | 7 | 22 | 34 | 4 | 1306 | 51 | 105 | 1119 | 1195 | 850 | 1009 | 20 | 11647 | 22.8 |
6 | 2015 | 32 | 365 | 842 | 13 | 7 | 33 | 53 | 5 | 1318 | 45 | 94 | 1146 | 1217 | 834 | 987 | 16 | 11678 | 22.8 |
7 | 2014 | 32 | 380 | 807 | 13 | 6 | 28 | 47 | 12 | 1293 | 28 | 58 | 1222 | 1230 | 829 | 987 | 21 | 11565 | 22.6 |
8 | 2013 | 32 | 410 | 804 | 13 | 7 | 30 | 65 | 9 | 1338 | 34 | 69 | 1262 | 1267 | 863 | 998 | 20 | 11987 | 23.4 |
9 | 2012 | 32 | 401 | 757 | 18 | 13 | 26 | 71 | 11 | 1297 | 29 | 56 | 1229 | 1235 | 852 | 1016 | 13 | 11651 | 22.8 |
10 | 2011 | 32 | 400 | 745 | 20 | 9 | 31 | 49 | 5 | 1259 | 24 | 50 | 1200 | 1207 | 838 | 1011 | 21 | 11358 | 22.2 |
11 | 2010 | 32 | 399 | 751 | 13 | 23 | 22 | 57 | 5 | 1270 | 26 | 50 | 1203 | 1214 | 794 | 964 | 13 | 11283 | 22.0 |
12 | 2009 | 32 | 429 | 710 | 10 | 18 | 25 | 48 | 7 | 1247 | 24 | 59 | 1165 | 1185 | 756 | 930 | 14 | 10991 | 21.5 |
13 | 2008 | 32 | 476 | 646 | 16 | 13 | 33 | 52 | 10 | 1246 | 28 | 64 | 1170 | 1176 | 845 | 1000 | 21 | 11279 | 22.0 |
14 | 2007 | 32 | 386 | 720 | 17 | 25 | 37 | 52 | 6 | 1243 | 30 | 57 | 1165 | 1177 | 795 | 960 | 18 | 11104 | 21.7 |
15 | 2006 | 32 | 424 | 648 | 15 | 9 | 33 | 49 | 3 | 1181 | 21 | 35 | 1124 | 1135 | 767 | 942 | 12 | 10577 | 20.7 |
16 | 2005 | 32 | 431 | 644 | 9 | 12 | 23 | 47 | 6 | 1172 | 27 | 47 | 1099 | 1114 | 783 | 967 | 11 | 10556 | 20.6 |
17 | 2004 | 32 | 416 | 732 | 11 | 17 | 34 | 53 | 5 | 1268 | 37 | 73 | 1179 | 1189 | 703 | 870 | 15 | 11000 | 21.5 |
18 | 2003 | 32 | 427 | 654 | 18 | 13 | 24 | 58 | 4 | 1198 | 29 | 60 | 1110 | 1128 | 756 | 954 | 21 | 10666 | 20.8 |
19 | 2002 | 32 | 460 | 694 | 22 | 17 | 26 | 46 | 5 | 1270 | 47 | 81 | 1148 | 1165 | 737 | 951 | 12 | 11097 | 21.7 |
20 | 2001 | 31 | 365 | 635 | 12 | 10 | 33 | 59 | 6 | 1120 | 40 | 85 | 1008 | 1027 | 732 | 959 | 10 | 10024 | 20.2 |
... only showing top 20 rows
Look how `filter` is a normal, native Kotlin command! The only difference is `lt` or `gt` instead of `<` or `>`. Compare that to the non-native Python syntax required by Pandas for simple filtering:
`df.loc[(df['column_name'] >= A) & (df['column_name'] <= B)]`
// DataFrames are cool, but so are native Kotlin data structures such as Maps:
// keep only the seasons strictly between 1990 and 2021, then turn the frame into a Map
val mapScoring = dfScoring
    .filter { (it["Year"] gt 1990) AND (it["Year"] lt 2021) }
    .toMap()
mapScoring.keys
[Rk, Year, Tms, RshTD, RecTD, PR TD, KR TD, FblTD, IntTD, OthTD, AllTD, 2PM, 2PA, XPM, XPA, FGM, FGA, Sfty, Pts, Pts/G]
// the map's keys are strings (column titles) and each value holds that column's data;
// toList() copies the "Year" column into a List for display (the result is null if the key is absent)
mapScoring["Year"]?.toList()
[2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999, 1998, 1997, 1996, 1995, 1994, 1993, 1992, 1991]
// boom... a key column, Total Points, plotted straight from the Map: one bar per season
val p = letsPlot(mapScoring) {
    x = "Year"
    y = "Pts"
} + ggsize(640, 240)
p + geomBar(stat = Stat.identity) + ggtitle("Total Points per NFL regular season")
// and another column, Receiving TDs, plotted the same way
val p = letsPlot(mapScoring) {
    x = "Year"
    y = "RecTD"
} + ggsize(640, 240)
p + geomBar(stat = Stat.identity) + ggtitle("Total Receiving Touchdowns per NFL regular season")
// to add new columns (for bucketing), we go back to the original DataFrame and derive them from existing values
// krangl's `addColumn` is not a native Kotlin method, but it reads just like `filter` or `map`
// YearRange: first year of the 5-season bucket a season falls into (2016..2020 -> 2016)
// Years:     display label for that bucket, e.g. "2016 - 2020"
val dfScoringRanges = dfScoring
    .filter { (it["Year"] gt 1990) AND (it["Year"] lt 2021) }
    .addColumn("YearRange") {
        it["Year"].map<Double> { year -> (floor((year - 1) / 5.0) * 5 + 1).toInt() }
    }
    .addColumn("Years") {
        it["YearRange"].map<Int> { start -> "$start - ${start + 4}" }
    }
// build another Map, this time grouping by year bucket and averaging the values inside each bucket
val mapScoringRanges = dfScoringRanges
    .select { listOf("Year", "Pts", "RecTD", "YearRange", "Years") }
    .groupBy("YearRange", "Years")
    .summarize(
        "mean_Pts" to { it["Pts"].mean(removeNA = true) },
        "mean_RecTD" to { it["RecTD"].mean(removeNA = true) }
    )
    .toMap()
// xlimits are the discrete values used on the x-axis (and its labels): the distinct bucket labels, reversed
// the only annoyance is the null handling for a data source we know is non-null
val xlimits = mapScoringRanges["Years"]?.let { labels ->
    labels.toSet().reversed().filterNotNull()
}
// same plot as before, but bucketed -- unlike the graph above, every value is higher than the previous one, no ups & downs
val p = letsPlot(mapScoringRanges) {
    x = "Years"
    y = "mean_Pts"
} + ggsize(780, 240)
p + geomBar(stat = Stat.identity) + scaleXDiscrete(limits = xlimits) +
    ggtitle("Average total points per NFL regular season")
// to write the figure to disk, uncomment:
// ggsave(p + geomBar(stat = Stat.identity) + scaleXDiscrete(limits = xlimits) +
//     ggtitle("Average total points per NFL regular season"), "avg_points_binned.png")
// again the same plot, bucketed, this time for receiving touchdowns
val p2 = letsPlot(mapScoringRanges) {
    x = "Years"
    y = "mean_RecTD"
} + ggsize(780, 240)
p2 + geomBar(stat = Stat.identity) + scaleXDiscrete(limits = xlimits) +
    ggtitle("Average Receiving Touchdowns per NFL regular season")
// to write the figure to disk, uncomment:
// ggsave(p2 + geomBar(stat = Stat.identity) + scaleXDiscrete(limits = xlimits) +
//     ggtitle("Average Receiving Touchdowns per NFL regular season"), "avg_rectd_binned.png")