# Exploratory analysis of the beneficiary summary CSV files:
#   1. load every file under "benefit_summary/" into one DataFrame,
#   2. show record counts by sex code and by race code,
#   3. plot the per-group aggregate of each SP_* condition flag,
#   4. compute and plot the pairwise correlation matrix of the SP_* flags.
#
# Bare expressions (e.g. `df.shape`) are kept on purpose: in a notebook they
# render as cell output; as a plain script they are harmless no-ops.
import operator
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Enable inline figures when running under IPython/Jupyter. `get_ipython`
# only exists there, so a plain `python` run simply skips this step instead
# of failing on the `%matplotlib` magic syntax.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass

# Read every entry of the data directory and stack them into one frame.
dfs = [
    pd.read_csv(os.path.join("benefit_summary", csvfile))
    for csvfile in os.listdir("benefit_summary")
]
df = pd.concat(dfs)
df.shape

# find the distribution of records by gender
# `.size()` counts rows per group; `.count()` on a frame that contains only
# the grouping column leaves no value columns to count.
df.groupby("BENE_SEX_IDENT_CD").size()

# find distribution of records by race
df.groupby("BENE_RACE_CD").size()

# Condition flag columns used by both incidence plots below.
disease_cols = [
    "SP_ALZHDMTA",
    "SP_CHF",
    "SP_CHRNKIDN",
    "SP_CNCR",
    "SP_COPD",
    "SP_DEPRESSN",
    "SP_DIABETES",
    "SP_ISCHMCHT",
    "SP_OSTEOPRS",
    "SP_RA_OA",
    "SP_STRKETIA",
]

# visualize incidence of diseases by gender
# NOTE(review): `x - 1` assumes the flags are coded 1/2 so that the mean is a
# fraction in [0, 1]; which code means "condition present" is not visible
# here -- TODO confirm against the data dictionary.
df_trunc = df[["BENE_SEX_IDENT_CD"] + disease_cols]
df_grouped = df_trunc.groupby("BENE_SEX_IDENT_CD").agg(
    lambda x: 100.0 * np.mean(x - 1)
)
df_grouped.transpose()
df_grouped.plot(kind="bar", legend=False)

# visualize incidence of diseases by race
df_trunc = df[["BENE_RACE_CD"] + disease_cols]
df_grouped = df_trunc.groupby("BENE_RACE_CD").agg(
    lambda x: 100.0 * np.mean(x - 1)
)
df_grouped.transpose()
df_grouped.plot(kind="bar", legend=False)

# Pairwise correlation between all SP_* columns except SP_*STATE_CODE.
colnames = [
    x for x in df.columns
    if x.startswith("SP_") and not x.endswith("STATE_CODE")
]
colidxs = {name: idx for idx, name in enumerate(colnames)}

# Identity start: each column correlates 1.0 with itself, so only the
# off-diagonal entries need to be computed.
CM = np.eye(len(colnames))
comorbidities = []
for i, colname1 in enumerate(colnames):
    # Visit each unordered pair once (j > i) and mirror it, since the
    # correlation matrix is symmetric.
    for j in range(i + 1, len(colnames)):
        colname2 = colnames[j]
        # Plain column selection replaces `df.ix[:, col]`; the `.ix` indexer
        # was removed from pandas.
        CM[i, j] = np.corrcoef(df[colname1], df[colname2])[0, 1]
        CM[j, i] = CM[i, j]
        comorbidities.append((colname1, colname2, CM[i, j]))

# Ten most strongly positively correlated condition pairs.
sorted(comorbidities, key=operator.itemgetter(2), reverse=True)[0:10]

# Heat map of the full correlation matrix.
fig = plt.figure()
plt.hot()
plt.pcolormesh(CM)
plt.colorbar()