import pandas as pd
import numpy as np
from copy import deepcopy
from clustergrammer2 import net
df = {}
clustergrammer2 backend version 0.2.9
import clustergrammer_groupby as cby
# load json to dict
def load_to_dict(filename):
    """Read a JSON file and return the decoded object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file (a dict for the
        files used in this notebook).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file does not contain valid JSON.
    """
    import json

    # The context manager closes the handle even when decoding fails.
    with open(filename, 'r') as f:
        return json.load(f)
# save dict to json
def save_to_json(inst_dict, filename, indent=True):
    """Serialize ``inst_dict`` as JSON and write it to ``filename``.

    Args:
        inst_dict: JSON-serializable object to save.
        filename: Destination path; an existing file is overwritten.
        indent: When truthy the output is pretty-printed with a
            two-space indent, otherwise it is written compactly.

    Raises:
        TypeError: If ``inst_dict`` contains values ``json`` cannot encode.
        OSError: If the destination cannot be opened for writing.
    """
    import json

    # Serialize before opening the file: opening with 'w' truncates it, so
    # encoding first keeps an existing file intact when encoding fails.
    if indent:
        text = json.dumps(inst_dict, indent=2)
    else:
        text = json.dumps(inst_dict)

    # The context manager closes the handle even if the write fails.
    with open(filename, 'w') as fw:
        fw.write(text)
# Load the address lookup from disk. Later code indexes it by the
# stringified row index and splits each value on ', '.
address_dict = load_to_dict('../data/address_dict.json')
# Display how many keys the lookup holds.
len(list(address_dict.keys()))
3456
# Raw challenge table; each row is later labelled 'P-<row index>'.
df['ini'] = pd.read_csv('../challenge_data/dvs_challenge_1_membership_time_space.csv')

# Lookup tables keyed by the stringified row index of df['ini'].
country_dict = {}
city_dict = {}
lat_dict = {}
lng_dict = {}

for inst_row in df['ini'].index.tolist():
    row_key = str(inst_row)

    # Fetch the row once instead of once per coordinate.
    row = df['ini'].loc[inst_row]
    lat_dict[row_key] = row['lat']
    lng_dict[row_key] = row['long']

    # Rows without an address entry keep the 'N.A.' placeholders.
    inst_country = 'N.A.'
    inst_city = 'N.A.'

    if row_key in address_dict:
        inst_address = address_dict[row_key].split(', ')
        # The last comma-separated component is used as the country.
        inst_country = inst_address[-1]
        # The fourth component from the end is used as the city; addresses
        # with fewer than four components keep the 'N.A.' placeholder.
        if len(inst_address) >= 4:
            inst_city = inst_address[-4]

    country_dict[row_key] = inst_country
    city_dict[row_key] = inst_city
df['ini'].head()
lat | long | data | visualization | society | date_with_hour | date | hour | |
---|---|---|---|---|---|---|---|---|
0 | 19.075984 | 72.877656 | 3.666667 | 3.333333 | 2.666667 | 2/20/2019 12 | 2/20/2019 | 12 |
1 | 43.653226 | -79.383184 | 3.333333 | 3.000000 | 3.333333 | 2/20/2019 12 | 2/20/2019 | 12 |
2 | 39.739236 | -104.990251 | 3.000000 | 1.666667 | 1.666667 | 2/20/2019 12 | 2/20/2019 | 12 |
3 | 60.169856 | 24.938379 | 2.000000 | 3.666667 | 2.333333 | 2/20/2019 12 | 2/20/2019 | 12 |
4 | 38.907192 | -77.036871 | 2.333333 | 4.000000 | 2.666667 | 2/20/2019 12 | 2/20/2019 | 12 |
# Work on an independent copy of the raw table and strip the location and
# date columns, leaving the three score columns plus 'hour'.
non_feature_columns = ['lat', 'long', 'date_with_hour', 'date']
df['clean'] = deepcopy(df['ini']).drop(non_feature_columns, axis=1)
df['clean'].shape
(3515, 4)
df['clean'].head()
data | visualization | society | hour | |
---|---|---|---|---|
0 | 3.666667 | 3.333333 | 2.666667 | 12 |
1 | 3.333333 | 3.000000 | 3.333333 | 12 |
2 | 3.000000 | 1.666667 | 1.666667 | 12 |
3 | 2.000000 | 3.666667 | 2.333333 | 12 |
4 | 2.333333 | 4.000000 | 2.666667 | 12 |
# Transpose so that each original row becomes a column.
df['cat'] = deepcopy(df['clean'].transpose())

# Replace each column label with a tuple: a 'P-<index>' name followed by the
# Country / City / Lat / Long category strings built from the lookup dicts.
cols = df['cat'].columns.tolist()
new_cols = [('P-' + str(x),
             'Country: ' + country_dict[str(x)],
             'City: ' + city_dict[str(x)],
             'Lat: ' + str(lat_dict[str(x)]),
             'Long: ' + str(lng_dict[str(x)])
             ) for x in cols]
df['cat'].columns = new_cols

# Rebuild the frame from a float matrix. `.values` replaces
# DataFrame.get_values(), which was deprecated in pandas 0.25 and removed
# in pandas 1.0.
rows = df['cat'].index.tolist()
mat = df['cat'].values.astype('float')
df['proc'] = pd.DataFrame(columns=new_cols, index=rows, data=mat)

# Keep the columns whose country label does not contain the 'N.A.' placeholder.
cols = df['proc'].columns.tolist()
keep_cols = [x for x in cols if 'N.A.' not in x[1]]
print(len(cols), len(keep_cols))
df['prot'] = df['proc'][keep_cols]
3515 3456
df['proc'].head()
(P-0, Country: India, City: Mumbai Suburban, Lat: 19.0759837, Long: 72.8776559) | (P-1, Country: Canada, City: Toronto, Lat: 43.653226, Long: -79.3831843) | (P-2, Country: USA, City: Denver County, Lat: 39.7392358, Long: -104.990251) | (P-3, Country: Finland, City: Southern Finland, Lat: 60.1698557, Long: 24.9383791) | (P-4, Country: USA, City: Washington, Lat: 38.9071923, Long: -77.0368707) | (P-5, Country: Brazil, City: Rio Grande do Sul, Lat: -30.0346471, Long: -51.2176584) | (P-6, Country: USA, City: Cook County, Lat: 41.8781136, Long: -87.6297982) | (P-7, Country: USA, City: Washington, Lat: 38.9071923, Long: -77.0368707) | (P-8, Country: United Kingdom, City: London, Lat: 51.5073509, Long: -0.1277583) | (P-9, Country: India, City: Bangalore Urban, Lat: 12.9715987, Long: 77.5945627) | ... | (P-3505, Country: Canada, City: Halifax County, Lat: 44.6487635, Long: -63.5752387) | (P-3506, Country: Chile, City: Provincia de Marga Marga, Lat: -33.0482707, Long: -71.4408752) | (P-3507, Country: USA, City: Fulton County, Lat: 33.7489954, Long: -84.3879824) | (P-3508, Country: USA, City: Los Angeles County, Lat: 34.0966764, Long: -117.7197785) | (P-3509, Country: USA, City: Cook County, Lat: 41.8781136, Long: -87.6297982) | (P-3510, Country: Luxembourg, City: Esch-sur-Alzette, Lat: 49.5008805, Long: 5.9860925) | (P-3511, Country: USA, City: Washington, Lat: 38.9071923, Long: -77.0368707) | (P-3512, Country: USA, City: Washington, Lat: 38.9071923, Long: -77.0368707) | (P-3513, Country: USA, City: Harris County, Lat: 29.7604267, Long: -95.3698028) | (P-3514, Country: USA, City: New Haven County, Lat: 41.308274, Long: -72.9278835) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
data | 3.666667 | 3.333333 | 3.000000 | 2.000000 | 2.333333 | 4.000000 | 4.000000 | 3.0 | 0.666667 | 3.000000 | ... | 3.0 | 2.666667 | 1.666667 | 1.666667 | 3.666667 | 2.000000 | 3.333333 | 1.333333 | 2.000000 | 1.666667 |
visualization | 3.333333 | 3.000000 | 1.666667 | 3.666667 | 4.000000 | 2.000000 | 2.666667 | 4.0 | 1.666667 | 3.666667 | ... | 2.0 | 2.000000 | 4.333333 | 3.000000 | 3.333333 | 2.000000 | 1.000000 | 2.333333 | 4.000000 | 1.000000 |
society | 2.666667 | 3.333333 | 1.666667 | 2.333333 | 2.666667 | 3.333333 | 3.000000 | 4.0 | 2.666667 | 1.666667 | ... | 0.0 | 1.666667 | 4.000000 | 2.333333 | 2.000000 | 2.333333 | 3.333333 | 2.666667 | 0.666667 | 0.000000 |
hour | 12.000000 | 12.000000 | 12.000000 | 12.000000 | 12.000000 | 12.000000 | 12.000000 | 12.0 | 12.000000 | 12.000000 | ... | 9.0 | 10.000000 | 11.000000 | 11.000000 | 12.000000 | 12.000000 | 12.000000 | 12.000000 | 12.000000 | 12.000000 |
4 rows × 3515 columns
# Load the full category-labelled matrix into the clustergrammer network.
net.load_df(df['proc'])
# Assign fixed colours to five country labels. cat_index=1 is used for the
# 'Country: ...' entry of each column tuple.
net.set_cat_color(axis='col', cat_index=1, cat_name='Country: USA', inst_color='blue')
net.set_cat_color(axis='col', cat_index=1, cat_name='Country: United Kingdom', inst_color='white')
net.set_cat_color(axis='col', cat_index=1, cat_name='Country: Canada', inst_color='red')
net.set_cat_color(axis='col', cat_index=1, cat_name='Country: India', inst_color='green')
net.set_cat_color(axis='col', cat_index=1, cat_name='Country: Australia', inst_color='black')
# NOTE(review): the same frame is loaded a second time here; confirm whether
# load_df preserves the colours set above or whether this reload is redundant.
net.load_df(df['proc'])
net.swap_nan_for_zero()
# Z-score each row before rendering the widget.
net.normalize(axis='row', norm_type='zscore')
net.widget()
ExampleWidget(network='{"row_nodes": [{"name": "data", "ini": 4, "clust": 1, "rank": 2, "rankvar": 2, "group":…
# Build signatures from the full matrix at the 'Country' category level.
# NOTE(review): presumably one signature column per country; confirm against
# the clustergrammer_groupby documentation. Only df_sig is used afterwards.
df_sig, keep_genes_dict, df_gene_pval, all_fold_info = cby.generate_signatures(df['proc'], category_level='Country')
/Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/numpy/core/fromnumeric.py:3146: RuntimeWarning: Degrees of freedom <= 0 for slice **kwargs) /Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/numpy/core/_methods.py:125: RuntimeWarning: invalid value encountered in true_divide ret, rcount, out=ret, casting='unsafe', subok=False) /Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in greater return (self.a < x) & (x < self.b) /Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in less return (self.a < x) & (x < self.b) /Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:1821: RuntimeWarning: invalid value encountered in less_equal cond2 = cond0 & (x <= self.a)
# Visualize the country-level signatures, z-scored per row.
net.load_df(df_sig)
net.normalize(axis='row', norm_type='zscore')
net.widget()
ExampleWidget(network='{"row_nodes": [{"name": "data", "ini": 4, "clust": 2, "rank": 2, "rankvar": 3, "group":…
# Assign fixed colours to selected category labels. Column tuples are
# (name, Country, City, Lat, Long), so 'Country: ...' labels use cat_index=1
# and 'City: ...' labels use cat_index=2. The San Francisco and Washington
# calls previously passed cat_index=1, the Country slot.
net.set_cat_color(axis='col', cat_index=2, cat_name='City: New York City', inst_color='blue')
net.set_cat_color(axis='col', cat_index=2, cat_name='City: San Francisco and County', inst_color='white')
net.set_cat_color(axis='col', cat_index=2, cat_name='City: Washington', inst_color='red')
net.set_cat_color(axis='col', cat_index=1, cat_name='Country: India', inst_color='green')

# Reload the full matrix and keep only the columns labelled 'Country: USA'.
net.load_df(df['proc'])
net.filter_cat(axis='col', cat_index=1, cat_name='Country: USA')

# Export the filtered matrix before normalizing, so df['usa'] holds the
# un-normalized values.
df['usa'] = net.export_df()

# Z-score each row and render the widget.
net.normalize(axis='row', norm_type='zscore')
net.widget()
ExampleWidget(network='{"row_nodes": [{"name": "data", "ini": 4, "clust": 1, "rank": 1, "rankvar": 3, "group":…
# Build signatures from the USA-only matrix at the 'City' category level.
# NOTE(review): presumably one signature column per city; confirm against
# the clustergrammer_groupby documentation. This overwrites the earlier
# country-level df_sig and the other three returned names.
df_sig, keep_genes_dict, df_gene_pval, all_fold_info = cby.generate_signatures(df['usa'], category_level='City')
/Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/numpy/core/fromnumeric.py:3146: RuntimeWarning: Degrees of freedom <= 0 for slice **kwargs) /Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/numpy/core/_methods.py:125: RuntimeWarning: invalid value encountered in true_divide ret, rcount, out=ret, casting='unsafe', subok=False) /Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in greater return (self.a < x) & (x < self.b) /Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in less return (self.a < x) & (x < self.b) /Users/nickfernandez/anaconda3/envs/py36lab/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:1821: RuntimeWarning: invalid value encountered in less_equal cond2 = cond0 & (x <= self.a)
df_sig.shape
(4, 256)
# Visualize the city-level signatures, z-scored per row.
net.load_df(df_sig)
net.normalize(axis='row', norm_type='zscore')
net.widget()
ExampleWidget(network='{"row_nodes": [{"name": "data", "ini": 4, "clust": 1, "rank": 1, "rankvar": 0, "group":…
df['proc'].shape
(4, 3515)
df['proc'].columns.tolist()[0]
('P-0', 'Country: India', 'City: Mumbai Suburban', 'Lat: 19.0759837', 'Long: 72.8776559')
# Keep only the column labels whose latitude entry (tuple position 3) does
# not contain the text 'nan'.
all_labels = df['proc'].columns.tolist()
keep_cols = [label for label in all_labels if 'nan' not in label[3]]
len(keep_cols)
3460
# Take an independent copy of the processed matrix and restrict it to the
# columns selected in keep_cols.
proc_copy = deepcopy(df['proc'])
df['def-pos'] = proc_copy[keep_cols]
df['def-pos'].shape
(4, 3460)
# Write the position-filtered matrix to disk as a tab-separated file.
df['def-pos'].to_csv('../data/members_with_positions.txt', sep='\t')