import pandas as pd
# http://archive.ics.uci.edu/ml/datasets/Online+Retail# のデータを利用
data_dtype = {'InvoiceNo':'str', 'StockCode': 'str', 'Description': 'str', 'Quantity': 'int', 'InvoiceDate': 'datetime64', 'UnitPrice': 'float', 'CustomerID': 'str', 'Country': 'str'}
df = pd.read_excel("./Online Retail.xlsx", dtype = data_dtype, keep_default_na=False)
# 541909 entriesと表示されて、エクセルで開いた時のレコード数と一致するか確認する
# エクセルでは列名を含むので、541910行ある
# nullという欠損データとして扱われるデータがないことも確認する。
# 欠損がある場合は削除や調査による穴埋め、統計処理により代入などの追加処理が必要になる
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 541909 entries, 0 to 541908 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 InvoiceNo 541909 non-null object 1 StockCode 541909 non-null object 2 Description 541909 non-null object 3 Quantity 541909 non-null int64 4 InvoiceDate 541909 non-null datetime64[ns] 5 UnitPrice 541909 non-null float64 6 CustomerID 541909 non-null object 7 Country 541909 non-null object dtypes: datetime64[ns](1), float64(1), int64(1), object(5) memory usage: 33.1+ MB
# 先頭5行を表示
df.head()
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
---|---|---|---|---|---|---|---|---|
0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850 | United Kingdom |
1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850 | United Kingdom |
2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850 | United Kingdom |
3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850 | United Kingdom |
4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850 | United Kingdom |
# 数値データの集計
# 購入量 Quantityにマイナスが含まれている。ログデータを見ると返品時の処理と推測される
# UnitPriceがマイナスの場合については判明せず。業務の場合は取引先にヒアリングを実施
df.describe()
Quantity | UnitPrice | |
---|---|---|
count | 541909.000000 | 541909.000000 |
mean | 9.552250 | 4.611114 |
std | 218.081158 | 96.759853 |
min | -80995.000000 | -11062.060000 |
25% | 1.000000 | 1.250000 |
50% | 3.000000 | 2.080000 |
75% | 10.000000 | 4.130000 |
max | 80995.000000 | 38970.000000 |
# 会員に限定して分析
df = df.loc[df["CustomerID"] != "", :]
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 406829 entries, 0 to 541908 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 InvoiceNo 406829 non-null object 1 StockCode 406829 non-null object 2 Description 406829 non-null object 3 Quantity 406829 non-null int64 4 InvoiceDate 406829 non-null datetime64[ns] 5 UnitPrice 406829 non-null float64 6 CustomerID 406829 non-null object 7 Country 406829 non-null object dtypes: datetime64[ns](1), float64(1), int64(1), object(5) memory usage: 27.9+ MB
# 2011年10月末までを来店の集計期間にする
train_df = df.loc[df["InvoiceDate"] < "2011-11-01",]
import datetime
def transform(data_df):
days = max(data_df.loc[:, "InvoiceDate"]) - min(data_df.loc[:, "InvoiceDate"])
# Recency
recency = (datetime.datetime(2011, 11, 1) - data_df.groupby("CustomerID")["InvoiceDate"].max()).apply(lambda x: x.days)
recency.name = "recency"
# Frequency
frequency = data_df.groupby("CustomerID")["InvoiceNo"].nunique() / days.days
frequency.name = "frequency"
# Monetary
data_process_df = data_df.copy()
data_process_df.loc[:, "total_price"] = data_process_df.loc[:, "Quantity"] * data_process_df.loc[:, "UnitPrice"]
monetary = data_process_df.groupby("CustomerID")["total_price"].sum()
monetary.name = "monetary"
return pd.merge(recency, frequency, left_index=True, right_index=True).merge(monetary, left_index=True, right_index=True)
def is_visit(data_df, visitors):
data_prcess_df = data_df.copy()
data_prcess_df.loc[:, "is_visit"] = data_prcess_df.loc[:, "CustomerID"].apply(lambda x: x in visitors)
return data_prcess_df.groupby("CustomerID")["is_visit"].max()
# RFM分析
df_rfm = transform(train_df)
df_rfm
recency | frequency | monetary | |
---|---|---|---|
CustomerID | |||
12346 | 286 | 0.005988 | 0.00 |
12347 | 0 | 0.017964 | 4085.18 |
12348 | 36 | 0.011976 | 1797.24 |
12350 | 271 | 0.002994 | 334.40 |
12352 | 33 | 0.029940 | 1233.68 |
... | ... | ... | ... |
18280 | 238 | 0.002994 | 180.60 |
18281 | 141 | 0.002994 | 80.82 |
18282 | 83 | 0.005988 | 98.76 |
18283 | 4 | 0.032934 | 1235.32 |
18287 | 3 | 0.008982 | 1837.28 |
4010 rows × 3 columns
# 11月に購入実績があるかの判定フラグをたてる
# 11月に購入実績のあるCustomerIDを取得
nov_customers = df.loc[(df["InvoiceDate"] >= "2011-11-01") & (df["InvoiceDate"] < "2011-12-01"), "CustomerID"].unique()
# 10月末までのCustomerで11月に購入実績があるかを判定
visit_df = is_visit(train_df, nov_customers)
# RFM分析の結果と結合する
all_df = pd.merge(df_rfm, visit_df, left_index=True, right_index=True).reset_index()
all_df
CustomerID | recency | frequency | monetary | is_visit | |
---|---|---|---|---|---|
0 | 12346 | 286 | 0.005988 | 0.00 | False |
1 | 12347 | 0 | 0.017964 | 4085.18 | False |
2 | 12348 | 36 | 0.011976 | 1797.24 | False |
3 | 12350 | 271 | 0.002994 | 334.40 | False |
4 | 12352 | 33 | 0.029940 | 1233.68 | True |
... | ... | ... | ... | ... | ... |
4005 | 18280 | 238 | 0.002994 | 180.60 | False |
4006 | 18281 | 141 | 0.002994 | 80.82 | False |
4007 | 18282 | 83 | 0.005988 | 98.76 | False |
4008 | 18283 | 4 | 0.032934 | 1235.32 | True |
4009 | 18287 | 3 | 0.008982 | 1837.28 | False |
4010 rows × 5 columns