# リスト3.8.1:DataFrameの集約
import os
import pandas as pd
import numpy as np
base_url = (
"https://raw.githubusercontent.com/practical-jupyter/sample-data/master/anime/"
)
anime_master_csv = os.path.join(base_url, "anime_master.csv")
df = pd.read_csv(anime_master_csv)
grouped = df.groupby("type")
type(grouped)
pandas.core.groupby.DataFrameGroupBy
# リスト3.8.3:集約されたデータの平均値
grouped.mean().round(1)
anime_id | episodes | rating | members | |
---|---|---|---|---|
type | ||||
Movie | 14322.5 | 1.1 | 6.3 | 10654.0 |
Music | 22495.1 | 1.1 | 5.6 | 1273.0 |
ONA | 22738.0 | 6.8 | 5.6 | 4401.8 |
OVA | 12207.7 | 2.5 | 6.5 | 6849.5 |
Special | 16802.3 | 2.5 | 6.5 | 7424.6 |
TV | 10929.6 | 37.5 | 6.9 | 41832.3 |
# リスト3.8.4:集約されたデータの基本統計量
grouped.describe().round(1).head(16)
anime_id | episodes | members | rating | ||
---|---|---|---|---|---|
type | |||||
Movie | count | 2220.0 | 2220.0 | 2220.0 | 2220.0 |
mean | 14322.5 | 1.1 | 10654.0 | 6.3 | |
std | 10925.7 | 2.2 | 31603.6 | 1.2 | |
min | 5.0 | 1.0 | 13.0 | 2.5 | |
25% | 4396.8 | 1.0 | 119.0 | 5.4 | |
50% | 10677.5 | 1.0 | 489.5 | 6.5 | |
75% | 24071.5 | 1.0 | 4239.0 | 7.3 | |
max | 34201.0 | 100.0 | 466254.0 | 10.0 | |
Music | count | 485.0 | 485.0 | 485.0 | 485.0 |
mean | 22495.1 | 1.1 | 1273.0 | 5.6 | |
std | 10175.0 | 1.3 | 4489.0 | 1.0 | |
min | 731.0 | 1.0 | 24.0 | 3.3 | |
25% | 12101.0 | 1.0 | 97.0 | 5.0 | |
50% | 24903.0 | 1.0 | 226.0 | 5.6 | |
75% | 31925.0 | 1.0 | 797.0 | 6.2 | |
max | 34412.0 | 24.0 | 71136.0 | 8.4 |
# リスト3.8.5:集約された複数列データの平均値
df.groupby(["type", "episodes"]).mean().round(1).head(20)
anime_id | rating | members | ||
---|---|---|---|---|
type | episodes | |||
Movie | 1 | 14320.0 | 6.3 | 10588.6 |
2 | 13802.0 | 6.9 | 6638.9 | |
3 | 11339.3 | 6.7 | 53598.1 | |
4 | 15723.5 | 7.3 | 3566.5 | |
5 | 12558.3 | 6.1 | 3641.0 | |
6 | 8433.5 | 6.0 | 178.5 | |
7 | 13602.5 | 6.9 | 11989.5 | |
9 | 8928.0 | 6.2 | 267.0 | |
10 | 31020.0 | 6.9 | 57.0 | |
12 | 20908.0 | 5.4 | 790.7 | |
14 | 13817.0 | 6.0 | 137.0 | |
100 | 29924.0 | 7.0 | 56.0 | |
Music | 1 | 22576.2 | 5.6 | 1200.0 |
2 | 28390.8 | 6.0 | 404.8 | |
4 | 1998.0 | 6.7 | 6650.0 | |
5 | 14359.0 | 6.2 | 1582.0 | |
7 | 8348.0 | 7.1 | 13361.0 | |
8 | 4705.0 | 7.3 | 22213.0 | |
14 | 5016.0 | 6.7 | 2240.0 | |
24 | 32633.0 | 7.6 | 560.0 |
# リスト3.8.6:pivot_tableメソッドによる平均値
df.pivot_table(index="type", aggfunc=np.mean)
anime_id | episodes | members | rating | |
---|---|---|---|---|
type | ||||
Movie | 14322.477928 | 1.100901 | 10654.022072 | 6.328599 |
Music | 22495.111340 | 1.125773 | 1273.028866 | 5.583918 |
ONA | 22738.000000 | 6.778342 | 4401.822335 | 5.629628 |
OVA | 12207.692547 | 2.549689 | 6849.526398 | 6.475217 |
Special | 16802.341463 | 2.495715 | 7424.628873 | 6.525577 |
TV | 10929.554655 | 37.456156 | 41832.314414 | 6.928961 |
# リスト3.8.7:pivot_tableメソッドによる複数列の平均値
df.pivot_table(index=["type", "episodes"], aggfunc=np.mean)
anime_id | members | rating | ||
---|---|---|---|---|
type | episodes | |||
Movie | 1 | 14320.010507 | 10588.556418 | 6.325464 |
2 | 13802.000000 | 6638.875000 | 6.930000 | |
3 | 11339.285714 | 53598.142857 | 6.661429 | |
4 | 15723.500000 | 3566.500000 | 7.260000 | |
5 | 12558.333333 | 3641.000000 | 6.063333 | |
6 | 8433.500000 | 178.500000 | 5.980000 | |
7 | 13602.500000 | 11989.500000 | 6.940000 | |
9 | 8928.000000 | 267.000000 | 6.220000 | |
10 | 31020.000000 | 57.000000 | 6.860000 | |
12 | 20908.000000 | 790.666667 | 5.450000 | |
14 | 13817.000000 | 137.000000 | 6.000000 | |
100 | 29924.000000 | 56.000000 | 7.000000 | |
Music | 1 | 22576.194093 | 1199.976793 | 5.562089 |
2 | 28390.800000 | 404.800000 | 6.020000 | |
4 | 1998.000000 | 6650.000000 | 6.740000 | |
5 | 14359.000000 | 1582.000000 | 6.190000 | |
7 | 8348.000000 | 13361.000000 | 7.090000 | |
8 | 4705.000000 | 22213.000000 | 7.270000 | |
14 | 5016.000000 | 2240.000000 | 6.740000 | |
24 | 32633.000000 | 560.000000 | 7.640000 | |
ONA | 1 | 23349.758007 | 1861.380783 | 5.320925 |
2 | 22729.257143 | 863.685714 | 5.115143 | |
3 | 25820.379310 | 3876.724138 | 5.714828 | |
4 | 21650.931034 | 7187.655172 | 5.676897 | |
5 | 20013.217391 | 5855.739130 | 5.636957 | |
6 | 18373.962963 | 10364.962963 | 5.960741 | |
7 | 15560.615385 | 5263.923077 | 5.338462 | |
8 | 22464.071429 | 1116.714286 | 5.497857 | |
9 | 21263.250000 | 7259.000000 | 5.972500 | |
10 | 20428.111111 | 7128.333333 | 6.126667 | |
... | ... | ... | ... | ... |
TV | 243 | 2222.000000 | 5330.000000 | 7.280000 |
260 | 17280.000000 | 89.000000 | 6.730000 | |
263 | 17601.000000 | 95.000000 | 6.840000 | |
276 | 527.000000 | 229157.000000 | 7.430000 | |
283 | 20077.000000 | 75.000000 | 6.750000 | |
291 | 813.000000 | 375662.000000 | 8.320000 | |
296 | 2721.500000 | 15076.500000 | 7.040000 | |
300 | 3683.000000 | 240.000000 | 6.550000 | |
305 | 8127.000000 | 562.500000 | 6.460000 | |
312 | 14212.000000 | 150.500000 | 6.395000 | |
330 | 3006.000000 | 1524.000000 | 7.390000 | |
331 | 6509.000000 | 1083.000000 | 6.630000 | |
358 | 516.000000 | 31632.000000 | 7.930000 | |
365 | 14566.000000 | 841.500000 | 6.195000 | |
366 | 269.000000 | 624055.000000 | 7.950000 | |
373 | 3545.000000 | 4734.000000 | 7.820000 | |
475 | 31517.000000 | 146.000000 | 5.500000 | |
510 | 7981.000000 | 161.000000 | 6.540000 | |
526 | 11595.000000 | 447.000000 | 6.340000 | |
694 | 4936.000000 | 2116.000000 | 6.920000 | |
726 | 9768.000000 | 237.000000 | 6.250000 | |
773 | 20117.000000 | 110.000000 | 5.630000 | |
1006 | 10241.000000 | 153.000000 | 5.920000 | |
1274 | 22221.000000 | 112.000000 | 6.800000 | |
1306 | 32448.000000 | 59.000000 | 6.430000 | |
1428 | 8213.000000 | 194.000000 | 6.050000 | |
1471 | 6277.000000 | 406.000000 | 6.480000 | |
1565 | 23349.000000 | 116.000000 | 5.560000 | |
1787 | 2471.000000 | 14233.000000 | 7.760000 | |
1818 | 12393.000000 | 160.000000 | 6.180000 |
291 rows × 3 columns
# リスト3.8.8:genre列を分割する前処理
# genres列のカンマ区切りのデータを分割
genres = df["genre"].map(lambda x: x.split(","))
# numpy.arrayにして2次元から1次元のデータに変換
ser = pd.Series(np.hstack(genres.values))
# ユニークにする
unique_genres = ser.str.strip().unique()
unique_genres.sort()
unique_genres
array(['Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Fantasy', 'Game', 'Harem', 'Historical', 'Horror', 'Josei', 'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music', 'Mystery', 'Parody', 'Police', 'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shoujo Ai', 'Shounen', 'Shounen Ai', 'Slice of Life', 'Space', 'Sports', 'Super Power', 'Supernatural', 'Thriller', 'Vampire'], dtype=object)
# リスト3.8.10:genreごとのDataFrameを結合する前処理
# 指定したジャンル名をDataFrameから抽出
def filter_df_by_genre(df, genre):
genre_df = df.loc[df["genre"].map(lambda x: genre in x)].copy()
genre_df["genre"] = genre
return genre_df
# 上記の関数をすべてのジャンルに対して実行
genre_df_list = [filter_df_by_genre(df, genre) for genre in unique_genres]
# 上記dataを結合
df2 = pd.concat(genre_df_list)
# name列でソート
df2.sort_values("name", inplace=True)
# メンバ数が多いジャンルトップ10
top10 = df2.groupby("genre")["members"].sum().sort_values(ascending=False).index[:10]
# top10からデータを抽出
df2 = df2[df2["genre"].isin(top10)]
# リスト3.8.11: 「君の名は。 」の抽出
df2.loc[df2["name"] == "Kimi no Na wa."]
anime_id | name | genre | type | episodes | rating | members | |
---|---|---|---|---|---|---|---|
0 | 32281 | Kimi no Na wa. | Supernatural | Movie | 1 | 9.37 | 200630 |
0 | 32281 | Kimi no Na wa. | Drama | Movie | 1 | 9.37 | 200630 |
0 | 32281 | Kimi no Na wa. | Romance | Movie | 1 | 9.37 | 200630 |
0 | 32281 | Kimi no Na wa. | School | Movie | 1 | 9.37 | 200630 |
# リスト3.8.12:genre列とtype列のクロス集計
df2.pivot_table(
index="genre", columns="type", values=["members"], aggfunc=np.sum
).head()
members | ||||||
---|---|---|---|---|---|---|
type | Movie | Music | ONA | OVA | Special | TV |
genre | ||||||
Action | 10224960.0 | 77054.0 | 524907.0 | 5793680.0 | 3412689.0 | 63364032.0 |
Adventure | 9485223.0 | 42829.0 | 70431.0 | 2373765.0 | 2052024.0 | 27529975.0 |
Comedy | 7293127.0 | 20860.0 | 1477266.0 | 5614758.0 | 6659293.0 | 65420862.0 |
Drama | 9034099.0 | 100734.0 | 188427.0 | 3043374.0 | 1915578.0 | 41011557.0 |
Fantasy | 8019406.0 | 43962.0 | 188937.0 | 2754224.0 | 2504131.0 | 34932563.0 |