This is an experiment in using the Kotlin kernel for Jupyter

Note: this notebook was updated in July 2021 to point to newer versions of its dependencies; the versions it originally used had been deprecated and were preventing the notebook from running to completion. It was also used for a presentation of Kotlin's Jupyter kernel in March 2021, so the 2020 season data, which didn't exist when the initial article was written, was added.

In [1]:

// lets-plot and krangl are two "supported" packages, so this single %use line
// replaces the full dependency & import boilerplate
%use lets-plot, krangl

In [2]:

// Scoring data (CSV) courtesy of pro-football-reference:
// https://www.pro-football-reference.com/years/NFL/scoring.htm
val scoringCsvPath = "nfl_scoring.csv"
val dfScoring = DataFrame.readCSV(scoringCsvPath)

// The frame is the cell's final expression, so it is what the cell displays.
dfScoring

Out[2]:

Rk	Year	Tms	RshTD	RecTD	PR TD	KR TD	FblTD	IntTD	OthTD	AllTD	2PM	2PA	XPM	XPA	FGM	FGA	Sfty	Pts	Pts/G
1	2020	32	532	871	8	7	21	31	3	1473	63	131	1244	1338	812	960	24	12692	24.8
2	2019	32	447	797	7	7	34	35	5	1332	54	113	1136	1210	802	983	17	11676	22.8
3	2018	32	439	847	7	5	24	45	4	1371	66	129	1164	1235	802	947	10	11948	23.3
4	2017	32	380	741	10	7	41	42	4	1225	37	82	1066	1134	866	1027	15	11118	21.7
5	2016	32	443	786	10	7	22	34	4	1306	51	105	1119	1195	850	1009	20	11647	22.8
6	2015	32	365	842	13	7	33	53	5	1318	45	94	1146	1217	834	987	16	11678	22.8
7	2014	32	380	807	13	6	28	47	12	1293	28	58	1222	1230	829	987	21	11565	22.6
8	2013	32	410	804	13	7	30	65	9	1338	34	69	1262	1267	863	998	20	11987	23.4
9	2012	32	401	757	18	13	26	71	11	1297	29	56	1229	1235	852	1016	13	11651	22.8
10	2011	32	400	745	20	9	31	49	5	1259	24	50	1200	1207	838	1011	21	11358	22.2
11	2010	32	399	751	13	23	22	57	5	1270	26	50	1203	1214	794	964	13	11283	22.0
12	2009	32	429	710	10	18	25	48	7	1247	24	59	1165	1185	756	930	14	10991	21.5
13	2008	32	476	646	16	13	33	52	10	1246	28	64	1170	1176	845	1000	21	11279	22.0
14	2007	32	386	720	17	25	37	52	6	1243	30	57	1165	1177	795	960	18	11104	21.7
15	2006	32	424	648	15	9	33	49	3	1181	21	35	1124	1135	767	942	12	10577	20.7
16	2005	32	431	644	9	12	23	47	6	1172	27	47	1099	1114	783	967	11	10556	20.6
17	2004	32	416	732	11	17	34	53	5	1268	37	73	1179	1189	703	870	15	11000	21.5
18	2003	32	427	654	18	13	24	58	4	1198	29	60	1110	1128	756	954	21	10666	20.8
19	2002	32	460	694	22	17	26	46	5	1270	47	81	1148	1165	737	951	12	11097	21.7
20	2001	31	365	635	12	10	33	59	6	1120	40	85	1008	1027	732	959	10	10024	20.2

... only showing top 20 rows

Look how filter is a normal, native Kotlin command! The only difference is lt or gt instead of < or >.

Compare that to the non-native Python syntax that Pandas requires for simple filtering: df.loc[(df['column_name'] >= A) & (df['column_name'] <= B)]

https://stackoverflow.com/questions/17071871/how-do-i-select-rows-from-a-dataframe-based-on-column-values

In [3]:

// DataFrames are handy, but so are native Kotlin collections such as Map.
// Keep only the seasons strictly after 1990 and strictly before 2021 ...
val recentSeasons = dfScoring.filter { ctx ->
    val year = ctx["Year"]
    (year gt 1990) AND (year lt 2021)
}

// ... then convert the filtered frame into a Map and show its keys.
val mapScoring = recentSeasons.toMap()
mapScoring.keys

Out[3]:

[Rk, Year, Tms, RshTD, RecTD, PR TD, KR TD, FblTD, IntTD, OthTD, AllTD, 2PM, 2PA, XPM, XPA, FGM, FGA, Sfty, Pts, Pts/G]

In [4]:

// The map's keys are strings (column titles) and each value is a list holding that column's data,
// so the lookup result can be shown as-is; an identity `map { it }` would only copy the list.
mapScoring["Year"]

Out[4]:

[2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999, 1998, 1997, 1996, 1995, 1994, 1993, 1992, 1991]

In [5]:

// Plotting a key column straight from the map: total points ("Pts") per season ("Year").
val p = letsPlot(mapScoring) {
    x = "Year"
    y = "Pts"
} + ggsize(640, 240)

// Bar layer plus title; the resulting plot is the cell's displayed value.
p + geomBar(stat = Stat.identity) + ggtitle("Total Points per NFL regular season")

Out[5]:

In [6]:

// A second column from the same map: receiving touchdowns ("RecTD") per season ("Year").
val p = letsPlot(mapScoring) {
    x = "Year"
    y = "RecTD"
} + ggsize(640, 240)

// Bar layer plus title; the resulting plot is the cell's displayed value.
p + geomBar(stat = Stat.identity) + ggtitle("Total Receiving Touchdowns per NFL regular season")

Out[6]:

The graphs will be a bit more dramatic if we group the years by 5-year buckets

In [27]:

// To add new columns (for bucketing) we return to the original DataFrame and derive them from "Year".
// krangl's `addColumn` is not a native Kotlin method, but its syntax is just like `filter` or `map`.
// The lambda parameters are named explicitly because the column lambdas are nested: an implicit `it`
// would refer to the expression context in the outer lambda and to a single cell value in the inner one.

// Number of seasons per bucket. Both the bucket start and the "start - end" label derive from this
// single value, so the two cannot drift apart.
val bucketSize = 5

val dfScoringRanges = dfScoring
    .filter { (it["Year"] lt 2021) AND (it["Year"] gt 1990) }
    // First season of the bucket containing `year`; with bucketSize = 5: 1991..1995 -> 1991, 2016..2020 -> 2016.
    .addColumn("YearRange") { ctx ->
        ctx["Year"].map<Double> { year -> floor((year - 1) / bucketSize).toInt() * bucketSize + 1 }
    }
    // Display label for the bucket, e.g. "2016 - 2020".
    .addColumn("Years") { ctx ->
        ctx["YearRange"].map<Int> { start -> "$start - ${start + bucketSize - 1}" }
    }

// Another Map, this time grouped by year bucket with the values averaged inside each bucket.
// Columns are selected directly by name instead of through a selector lambda.
val mapScoringRanges = dfScoringRanges
    .select("Year", "Pts", "RecTD", "YearRange", "Years")
    .groupBy("YearRange", "Years")
    .summarize(
        "mean_Pts" to { it["Pts"].mean(removeNA = true) },
        "mean_RecTD" to { it["RecTD"].mean(removeNA = true) }
    )
    .toMap()

// These xlimits are the discrete values used on the x-axis (and its labels): the distinct "Years"
// labels, in the reverse of the order they have in the summarized map.
// The map lookup is nullable, hence the safe calls even though the column is known to exist.
val xlimits = mapScoringRanges["Years"]?.toSet()?.reversed()?.filterNotNull()

In [28]:

// Same total-points plot as before, now averaged per bucket. Author's observation: unlike the
// per-season graph, every value is higher than the previous one -- no ups & downs.
val p = letsPlot(mapScoringRanges) {
    x = "Years"
    y = "mean_Pts"
} + ggsize(780, 240)

// Separate statement (not a continuation of the declaration above): add the layers and display.
p + geomBar(stat = Stat.identity) + scaleXDiscrete(limits = xlimits) +
    ggtitle("Average total points per NFL regular season")

Out[28]:

In [29]:

// ggsave(p + geomBar(stat=Stat.identity) + scaleXDiscrete(limits = xlimits) +
//     ggtitle("Average total points per NFL regular season"), "avg_points_binned.png")

In [30]:

// The receiving-touchdowns plot again, now using the per-bucket averages.
val p2 = letsPlot(mapScoringRanges) {
    x = "Years"
    y = "mean_RecTD"
} + ggsize(780, 240)

// Add the bar layer, the discrete x-axis limits and the title, then display.
p2 + geomBar(stat = Stat.identity) + scaleXDiscrete(limits = xlimits) +
    ggtitle("Average Receiving Touchdowns per NFL regular season")

Out[30]:

In [31]:

// ggsave(p2 + geomBar(stat=Stat.identity) + scaleXDiscrete(limits = xlimits) +
//     ggtitle("Average Receiving Touchdowns per NFL regular season"), "avg_rectd_binned.png")

In [ ]: