import pandas as pd import numpy as np !ls ../in !ls ../in/sales*.xlsx import glob glob.glob("../in/sales*.xlsx") all_data = pd.DataFrame() for f in glob.glob("../in/sales*.xlsx"): df = pd.read_excel(f) all_data = all_data.append(df,ignore_index=True) all_data.describe() all_data.head() all_data['date'] = pd.to_datetime(all_data['date']) status = pd.read_excel("../in/customer-status.xlsx") status all_data_st = pd.merge(all_data, status, how='left') all_data_st.head() all_data_st[all_data_st["account number"]==737550].head() all_data_st['status'].fillna('bronze',inplace=True) all_data_st.head() all_data_st[all_data_st["account number"]==737550].head() pd.__version__ all_data_st["status"] = all_data_st["status"].astype("category") all_data_st.head() all_data_st.dtypes all_data_st.sort(columns=["status"]).head() all_data_st["status"].cat.set_categories([ "gold","silver","bronze"],inplace=True) all_data_st.sort(columns=["status"]).head() all_data_st["status"].describe() all_data_st.groupby(["status"])["quantity","unit price","ext price"].mean() all_data_st.groupby(["status"])["quantity","unit price","ext price"].agg([np.sum,np.mean, np.std]) all_data_st.drop_duplicates(subset=["account number","name"]).ix[:,[0,1,7]].groupby(["status"])["name"].count()