from __future__ import print_function
import numpy as np
import pandas as pd
from collections import OrderedDict #sorting participant df dict before pd.concat()
import matplotlib.pylab as plt
%matplotlib inline
pd.options.display.mpl_style = 'default'
import cPickle as pickle
non_english_fluent = ['023', '031', '045', '050', '070', '106',]
left_handed = ['042', '088',]
pro_inst_skill = ['026', '037']
excluded_all_tasks = non_english_fluent + left_handed + pro_inst_skill
def col_matches(df, regex):
'returns a list of columns in a df that match a regex string.'
import re
cols = list(enumerate(df.columns))
matches = [c for (i, c) in cols
if re.findall(regex, c)]
return matches
def compare_transformations(df, columns, functions, **kwargs):
print('raw')
df[columns].hist(**kwargs)
plt.show()
for name, func in functions.items():
print(name)
df[columns].apply(func).hist(**kwargs)
plt.show()
def quickcompare(r, df, size=(15,7)):
inverse = lambda x: 1.0/x
return compare_transformations(df, col_matches(df, r),
{'inverse': inverse,
'log1p': np.log1p,
'sqrt': np.sqrt,
'log': np.log,
},
figsize=size)
# using this for inline documentation so that it's clear
# that the printing statement isn't part of the necessary
# transformation code.
def html_print(df):
try:
out = df.to_html()
except AttributeError:
out = pd.DataFrame(df).to_html()
return HTML(out)
def htmljoin(df_list, delimiter=''):
return HTML(delimiter.join([x.to_html() for x in df_list]))
def col_matches(df, regex):
import re
cols = list(enumerate(df.columns))
matches = [c for (i, c) in cols
if re.findall(regex, c)]
return matches
def concat_matches(df, *args):
assert all([len(r) for r in args])
import re
col_match_lists = [col_matches(df, regex) for regex in args]
col_set = [df[matches] for matches in col_match_lists]
if len(col_set) == 0:
return None
elif len(col_set) == 1:
return col_set[0]
else:
return pd.concat(col_set, axis=1)
def show_frames(frame_list, delimiter=''):
from IPython.display import HTML
if len(frame_list) == len(delimiter):
html_out = ""
item_template = '<p><strong>{}</strong></p>{}<br>'
for i, tup in enumerate(zip(frame_list, delimiter)):
frame = tup[0]
tag = tup[1]
html_out += item_template.format(tag, frame.to_html())
return HTML(html_out)
else:
html_out = [df.to_html() for df in frame_list]
return HTML(delimiter.join(html_out))
pfilenames = "c:/db_pickles/pickle - dfo-{measure} - {updated}.pickle"
updated_scales = '2014-10-29a'
updated_isip = '2014-10-12b'
updated_sms = '2014-10-20a'
sms_df = pd.read_pickle(pfilenames.format(measure = 'sms', updated = updated_sms))
scales_df = pd.read_pickle(pfilenames.format(measure = 'scales',
updated = updated_scales))
isip_measures = ['isip5_lag2',
'isip5_avgprev4',
'isip8_lag2',
'isip8_avgprev4',
]
isip_frames = {m: pd.read_pickle(pfilenames.format(measure = m,
updated = updated_isip))
for m in isip_measures}
sms_frames = {'sms': sms_df}
scales_frames = {'scales': scales_df}
from itertools import chain
task_output = OrderedDict(chain(scales_frames.items(),
sms_frames.items(),
isip_frames.items(),
))
dfo = pd.concat(task_output.values(),
axis=1, #defaults
join='outer',
keys=task_output.keys(),
names=['set'],
)
to_drop = set(dfo.index).intersection(excluded_all_tasks)
if to_drop:
print("Dropping: {}".format(list(to_drop)))
dfo = dfo.drop(to_drop)
assert len(set(dfo.index).intersection(excluded_all_tasks))==0
Dropping: ['026', '037']
full_updated = '2014-10-29a'
pfilenames = "c:/db_pickles/pickle - dfo-{measure} - {updated}.pickle"
output_file = pfilenames.format(measure='full', updated=full_updated)
pickle.dump(dfo, open(output_file, "wb"))
# To use this outside of pandas (R) we'll want to make a flat table
# where the outer index is converted to a prefix for the variable name.
top_level_label = {'scales': 'SCAL',
'sms': 'SMSR',
'isip8_lag2': 'I8L2',
'isip5_lag2': 'I5L2',
'isip8_avgprev4': 'I8P4',
'isip5_avgprev4': 'I5P4',
}
dfo_flat_subset = dfo.copy().xs(top_level_label.keys(), axis=1)
dfo_flat_subset.columns = ['_'.join([top_level_label[outer], inner])
for (outer, inner) in dfo_flat_subset.columns]
#shorten variable name for export
dfo_flat_subset.columns = [c.replace('avgprev4sq_', "") for c in dfo_flat_subset.columns]
full_updated = '2014-10-29a'
pfilenames = "c:/db_pickles/pickle - dfo-{measure} - {updated}.{ext}"
output_file_csv = pfilenames.format(measure='flat', updated=full_updated, ext="csv")
output_file_pickle = pfilenames.format(measure='flat', updated=full_updated, ext="pickle")
pickle.dump(dfo_flat_subset, open(output_file_pickle, "wb"))
# Re-code NaNs for external analyses
dfo_coded_nan = dfo_flat_subset.replace(np.nan, 77777)
dfo_coded_nan.to_csv(output_file_csv)
dfo_flat_subset.T[::15]
015 | 016 | 017 | 018 | 019 | 020 | 021 | 022 | 024 | 025 | ... | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SCAL_session_day | 20140228 | 20140303 | 20140303 | 20140303 | 20140303 | 20140303 | 20140304 | 20140304 | 20140304 | 20140304 | ... | 20140508 | 20140508 | 20140508 | 20140508 | 20140509 | 20140509 | 20140509 | 20140509 | 20140509 | 20140509 |
SCAL_calc_bfi_openness | 2.8 | 3.7 | 3.7 | 3.6 | 3.3 | 2.5 | 2.7 | 4.7 | 4.2 | 3.2 | ... | 4.3 | 3.6 | 3.6 | 4.3 | 4.4 | 3 | 3.9 | 4.3 | 4.9 | 4.2 |
SCAL_notes_bfi | ... | ||||||||||||||||||||
SCAL_wasivocab_itemscore02_shovel | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
SCAL_wasivocab_itemscore17_decade | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | 2 | ... | 1 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 2 |
SCAL_wasimatrix_itemscore01 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
SCAL_wasimatrix_itemscore16 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | ... | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
SCAL_qbasic_isfemale | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | ... | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
SCAL_qbasic_relationshipyears | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 1 | 2 | 0 | 3 | 0 | 1 | 0 | 1 | 0 | 0 |
SCAL_qbasic_exerciseyn | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
SCAL_qmusic_gamehoursall | 5 | 0 | 0 | 0 | 10 | 5 | 0 | 0 | 10 | 28 | ... | 3 | 2 | 8 | 0 | 1 | 8 | 0 | 10 | 0 | 2 |
SCAL_bfi_item02 | 2 | 3 | 4 | 2 | 2 | 1 | 3 | 4 | 1 | 3 | ... | 1 | 3 | 2 | 3 | 4 | 2 | 5 | 3 | 3 | 3 |
SCAL_bfi_item17 | 1 | 5 | 2 | 4 | 4 | 5 | 4 | 3 | 5 | 3 | ... | 5 | 5 | 5 | 4 | 4 | 4 | 5 | 5 | 4 | 3 |
SCAL_bfi_item32 | 3 | 5 | 3 | 5 | 5 | 4 | 4 | 4 | 5 | 2 | ... | 5 | 5 | 5 | 2 | 4 | 5 | 5 | 5 | 3 | 4 |
SCAL_qmusic_dancehours_nonzero | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | ... | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 |
SCAL_orders_phase | 2 | 2 | 2 | 0 | 2 | 0 | 2 | 0 | 2 | 1 | ... | 1 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 2 | 0 |
SCAL_order_lin8j | 9 | 12 | 10 | 14 | 10 | 11 | 11 | 11 | 12 | 13 | ... | 13 | 13 | 13 | 13 | 10 | 12 | 14 | 12 | 11 | 12 |
SMSR_iso8t1_DPm | -3.198754 | -2.827224 | -13.31115 | -0.5532028 | -2.3272 | -3.092538 | -1.458121 | 0.9686097 | -2.741571 | 0.3546706 | ... | -5.737524 | 0.2056869 | -0.8263522 | -2.226755 | -1.372491 | -1.247007 | 1.162443 | -0.06084488 | -9.056157 | -3.304496 |
SMSR_iso5t2_ints_mean | 497.7625 | 500.7283 | 499.561 | 500.0639 | 500.2271 | 500.4319 | 499.6852 | 499.952 | 499.9137 | 502.1116 | ... | 499.2338 | 500.8816 | 500.1932 | 499.4348 | 500.5344 | 499.8039 | 500.2396 | 499.958 | 500.086 | 499.497 |
SMSR_lin5t_dev_mean | -56.08282 | -29.30826 | -54.49151 | -32.42488 | -17.69265 | -50.66523 | -50.73567 | -19.21868 | -15.79687 | -5.572805 | ... | -90.36823 | -7.757573 | -73.42938 | -27.87748 | -21.62065 | -38.02459 | -12.0379 | -29.42216 | -41.91985 | -51.97161 |
SMSR_phase5t_DPm | -8.176227 | -2.727189 | -4.037511 | -3.222491 | -0.05654191 | -3.009335 | -4.691215 | -0.8777716 | -1.028469 | 2.607912 | ... | -7.511362 | 2.75177 | -4.86865 | -0.9072752 | -1.493659 | -7.582498 | 3.159855 | 2.719712 | -6.585747 | -2.178178 |
SMSR_phase8t_ints_mean | 854.141 | 799.6398 | 799.1124 | 800.6056 | 800.2841 | 799.3914 | 800.748 | 799.8787 | 799.9338 | 805.9211 | ... | 800.5021 | 801.6799 | 800.7783 | 800.6417 | 800.7478 | 801.5609 | 800.0165 | 801.3236 | 799.9278 | 808.1808 |
SMSR_iso8j_dev_mean | -88.09475 | -49.75272 | -80.51963 | -39.90446 | -77.28071 | -63.80611 | -71.43951 | 13.19332 | -55.40579 | -17.78908 | ... | -96.03697 | -17.07073 | -95.44311 | -47.41177 | -47.15015 | -37.05709 | 0.4706038 | -41.18913 | -59.59362 | -85.35649 |
SMSR_lin8j_DPm | -2.533303 | -2.792467 | 2.636454 | -0.01542999 | 4.258314 | -5.878196 | 2.467047 | 0.5887284 | -2.485666 | 0.5191878 | ... | -9.6238 | 3.214722 | -10.07283 | -1.967341 | -0.026768 | -9.894439 | 0.8668867 | 1.388157 | -2.93977 | -2.640397 |
SMSR_phase5j_ints_mean | 506.8288 | 499.9173 | 502.1424 | 499.1505 | 500.969 | 499.8378 | 499.8912 | 499.5976 | 500.2107 | 499.647 | ... | 500.6447 | 499.4388 | 500.236 | 500.3357 | 500.1026 | 502.6973 | 500.2303 | 500.3979 | 500.0319 | 499.8732 |
SMSR_phase8j_mpk_dev_mean | -79.62756 | -36.40628 | -81.11475 | -2.749477 | -13.86751 | -91.65142 | -78.06689 | -7.354462 | -42.08086 | -4.122462 | ... | -95.79649 | -33.15391 | -99.04375 | -64.691 | -65.41613 | -17.4071 | -28.808 | -28.49723 | -78.43963 | -94.47895 |
SMSR_phase8tp_mpk_DPm | -7.140225 | -3.235273 | -11.71376 | 4.041789 | -0.5309412 | -2.577912 | -2.118466 | -0.9703458 | -2.596259 | 1.407291 | ... | -9.201369 | 0.6187495 | -5.925623 | -2.968792 | -2.300928 | 0.5214794 | 0.9848873 | -0.6199092 | -4.715839 | -2.282429 |
SMSR_phase8t_mpr_ints_mean | 860.2441 | 798.0705 | 794.8977 | 792.7024 | 802.1488 | 803.5191 | 800.7006 | 792.727 | 796.321 | 811.0014 | ... | 800.3062 | 802.3016 | 799.1497 | 797.8899 | 798.3463 | 802.1358 | 800.9553 | 801.6578 | 795.8792 | 805.1593 |
SMSR_phase5j_mpr_dev_mean | -50.26178 | -14.76985 | -8.999873 | -32.08286 | -28.65655 | -42.44111 | -27.44129 | -48.19151 | -10.88 | -24.22806 | ... | -57.69003 | 19.43637 | -51.56131 | -16.02781 | -24.93797 | -17.4806 | -29.83729 | -19.30714 | -46.16006 | -8.326523 |
SMSR_phase5t_mpr_DPm | -12.23243 | -3.513054 | -5.154859 | -2.50175 | 0.2565294 | -5.129334 | -5.860247 | -1.669478 | -3.13251 | 1.938041 | ... | -5.24474 | 1.745637 | -3.206258 | -2.324636 | -1.114924 | -8.193405 | 8.807288 | 2.862179 | -6.574524 | -5.232282 |
SMSR_phase8j_psk_ints_mean | 784.9812 | 799.4197 | 805.6947 | 802.441 | 544.9198 | 713.3414 | 805.3557 | 810.4893 | 799.2237 | 586.9541 | ... | 583.3398 | 803.537 | 671.0698 | 805.7653 | 784.7367 | 678.0739 | 791.1003 | 797.979 | 809.8003 | 748.8567 |
SMSR_phase8tp_psk_dev_mean | -50.91143 | -39.09818 | -50.33416 | 58.50181 | -6.125333 | -44.83719 | -0.1602748 | -5.275 | -20.88967 | -5.535 | ... | -43.32022 | -15.734 | -55.33533 | -2.387 | -19.805 | -43.359 | 30.537 | -9.380667 | -35.2106 | -22.77448 |
SMSR_phase5j_psk_DPm | -9.344384 | -1.153378 | -3.226146 | -1.629408 | -6.722491 | -10.10705 | -4.234703 | -6.802409 | -0.8209128 | -4.411202 | ... | -11.40426 | 1.375 | -10.93867 | 0.682929 | -4.114753 | -4.736637 | -3.420041 | -1.031487 | -8.463048 | -2.845095 |
SMSR_phase5j_psr_ints_mean | 424.6862 | 496.433 | 370.9547 | 495.8177 | 451.8123 | 347.7508 | 500.6287 | 456.572 | 459.4063 | 462.9783 | ... | 404.6119 | 382.4583 | 421.5708 | 461.0163 | 492.9083 | 505.074 | 505.492 | 507.864 | 502.129 | 501.7723 |
SMSR_phase5t_psr_dev_mean | -10.77273 | -23.94733 | -16.24867 | 22.54767 | 10.64733 | -8.098464 | -17.88367 | 2.382921 | -5.860746 | 7.635087 | ... | -36.13908 | 7.519333 | -17.80067 | -15.92233 | 0.6186667 | -24.97372 | 37.26721 | 12.706 | -22.69108 | 5.455534 |
SMSR_phase8t_nrm_DPm | -5.078323 | -3.202527 | -11.17975 | 0.8494247 | -0.6688233 | -2.777164 | -2.903466 | -0.2268467 | -2.632887 | 0.8131526 | ... | -5.441476 | 1.11521 | -7.397457 | -3.748128 | -1.774755 | -1.543361 | 2.506108 | 0.04823645 | -5.629138 | -3.905582 |
SMSR_phase5j_nrm_ints_mean | 501.0488 | 500.4317 | 497.811 | 497.976 | 499.3087 | 501.4147 | 500.0498 | 498.73 | 499.6319 | 499.6477 | ... | 501.6538 | 500.1294 | 499.322 | 498.9119 | 500.7787 | 499.4885 | 500.1757 | 498.786 | 498.9888 | 499.419 |
SMSR_lint_610690_dev_mean | -43.56383 | -23.69982 | -34.07529 | -13.36782 | -6.537333 | -41.06354 | -27.31262 | -7.483244 | -16.6308 | 9.663867 | ... | -95.22391 | 2.222044 | -41.00091 | -16.31708 | -13.41098 | -23.47564 | 7.974311 | -17.76238 | -25.82191 | -17.1446 |
SMSR_lint_700800_DPm | -3.457716 | -2.836999 | -5.324347 | -2.588808 | -0.5348895 | -3.920463 | -2.249709 | -1.423945 | -3.538202 | 2.053367 | ... | -6.206299 | -0.2037628 | -2.421917 | -3.037278 | -3.049772 | -3.185119 | 0.8063814 | -1.259571 | -4.405488 | -6.258277 |
SMSR_lint_500600_ints_mean | 589.4669 | 556.8507 | 559.6688 | 556.4262 | 556.9577 | 557.5166 | 557.2973 | 557.415 | 555.6666 | 563.1187 | ... | 565.8739 | 563.6617 | 557.8447 | 558.8867 | 559.3468 | 562.4424 | 560.3478 | 558.3712 | 556.7481 | 559.5265 |
SMSR_linj_500600_dev_mean | -53.02957 | -24.77545 | -38.33945 | NaN | -42.62447 | -52.70923 | -26.50805 | -32.03333 | -20.17362 | -20.1902 | ... | -96.46904 | -5.09497 | -85.95285 | -29.80226 | -12.3015 | -68.68218 | -14.45013 | -9.628119 | -23.82299 | -18.08722 |
I5P4_lagdev_local | 2.918599 | 2.692996 | 2.764646 | 2.080451 | 1.397983 | 3.357435 | 3.454273 | 2.634599 | 2.834259 | 2.568266 | ... | 2.629085 | 1.89882 | 4.873864 | 2.944964 | 2.272517 | 2.884701 | 3.168178 | 1.649012 | 2.400801 | 2.143469 |
I8L2_ints_variance | 2173.496 | 887.7761 | 1740.214 | 1034.252 | 676.3646 | 2114.371 | 1388.896 | 511.8083 | 903.3189 | 2234.78 | ... | 1722.752 | 830.9859 | 1941.435 | 492.8784 | 431.2057 | 1463.46 | 1311.735 | 550.8182 | 666.7421 | 1675.055 |
I5L2_lag2devsq_count | 114 | 105 | 111 | 115 | 114 | 106 | 115 | 116 | 113 | 104 | ... | 111 | 113 | 113 | 109 | 111 | 106 | 111 | 106 | 118 | 113 |
44 rows × 97 columns
#quickcompare('DPsd$', df=dfo_flat_subset)
# demonstrates that the inverse transform works really well here (or did before I worked on earlier stages...)
quickcompare('nrm_DPsd', df=dfo_flat_subset) #looks good
quickcompare('psk_DPsd', df=dfo_flat_subset)
raw
inverse
log1p
log
sqrt
raw
inverse
log1p
log
sqrt
scales = dfo.xs('scales', axis=1)
#scales
females = scales[scales.session_isfemale==True]
males = scales[scales.session_isfemale==False]
len(males)
allstats = scales.describe()[1:3]
malestats = males.describe()[1:3]
femalestats = females.describe()[1:3]
allstats.T
mean | std | |
---|---|---|
session_day | 20140411.371134 | 74.619032 |
session_isfemale | 0.608247 | 0.490678 |
sex_femalezero | 0.391753 | 0.490678 |
participant_age | 20.979381 | 5.055895 |
calc_wasivocab_totalrawscore | 38.412371 | 5.448151 |
calc_wasivocab_tscore | 52.360825 | 9.084311 |
calc_bfi_extraversion | 3.518041 | 0.877227 |
calc_bfi_agreeableness | 3.914089 | 0.626388 |
calc_bfi_conscientiousness | 3.663230 | 0.640588 |
calc_bfi_neuroticism | 2.854566 | 0.735095 |
calc_bfi_openness | 3.791753 | 0.604440 |
calc_qmusic_socialimportance | 4.295533 | 1.464769 |
order_500ms_first | 0.556701 | 0.499355 |
order_rhythmfirst | 0.515464 | 0.502357 |
wasivocab_itemscore01_fish | 1.000000 | 0.000000 |
wasivocab_itemscore02_shovel | 1.000000 | 0.000000 |
wasivocab_itemscore03_shell | 1.000000 | 0.000000 |
wasivocab_itemscore04_shirt | 2.000000 | 0.000000 |
wasivocab_itemscore05_car | 1.958763 | 0.285684 |
wasivocab_itemscore06_lamp | 1.989691 | 0.101535 |
wasivocab_itemscore07_bird | 1.927835 | 0.297469 |
wasivocab_itemscore08_tongue | 1.783505 | 0.413995 |
wasivocab_itemscore09_pet | 1.742268 | 0.462744 |
wasivocab_itemscore10_lunch | 1.876289 | 0.414772 |
wasivocab_itemscore11_bell | 1.731959 | 0.489801 |
wasivocab_itemscore12_calendar | 1.793814 | 0.431522 |
wasivocab_itemscore13_alligator | 1.896907 | 0.338025 |
wasivocab_itemscore14_dance | 1.711340 | 0.499140 |
wasivocab_itemscore15_summer | 1.567010 | 0.720347 |
wasivocab_itemscore16_reveal | 1.876289 | 0.388848 |
... | ... | ... |
qmusic_gamehoursall_nonzero | 0.484536 | 0.502357 |
qmusic_gamehoursdrumsticks_nonzero | 0.195876 | 0.398935 |
qmusic_behaviors_08_otherprs_ln1p | 1.846726 | 0.954699 |
qmusic_behaviors_09_danceprv_ln1p | 0.938295 | 0.989376 |
qmusic_dancelevel_ln1p | 0.625159 | 0.645580 |
qmusic_dancelevel_tophalf | 0.402062 | 0.492861 |
orders_500 | 0.443299 | 0.499355 |
orders_800 | 0.556701 | 0.499355 |
orders_iso | 0.989691 | 0.810026 |
orders_phase | 0.969072 | 0.883337 |
orders_linear | 1.041237 | 0.762637 |
order_iso5t1 | 1.443299 | 0.499355 |
order_iso8t1 | 1.556701 | 0.499355 |
order_iso5t2 | 5.422680 | 1.676026 |
order_iso8t2 | 5.536082 | 1.714290 |
order_psh5t | 5.381443 | 1.862141 |
order_psh8t | 5.494845 | 1.809257 |
order_lin5t | 5.525773 | 1.594867 |
order_lin8t | 5.639175 | 1.614940 |
order_iso5j | 11.422680 | 1.676026 |
order_iso8j | 11.536082 | 1.714290 |
order_psh5j | 11.381443 | 1.862141 |
order_psh8j | 11.494845 | 1.809257 |
order_lin5j | 11.525773 | 1.594867 |
order_lin8j | 11.639175 | 1.614940 |
order_isip5 | 15.443299 | 0.499355 |
order_isip8 | 15.556701 | 0.499355 |
orderc_iso_before_lin | 0.484536 | 0.502357 |
orderc_iso_before_phase | 0.525773 | 0.501929 |
orderc_phase_before_lin | 0.556701 | 0.499355 |
162 rows × 2 columns
search= "ethnicity"
cols=concat_matches(scales, search)
cols.to_csv('ethnicity.csv')
concat_matches(scales, 'hours').describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
qmusic_instrumenthours | 97 | 7.927835 | 27.719107 | 0 | 0 | 0 | 4 | 250 |
qmusic_gamehoursall | 97 | 5.569072 | 25.235418 | 0 | 0 | 0 | 3 | 240 |
qmusic_gamehoursdrumsticks | 97 | 0.432990 | 1.396385 | 0 | 0 | 0 | 0 | 10 |
qmusic_singinghours_nonzero | 97 | 0.206186 | 0.406667 | 0 | 0 | 0 | 0 | 1 |
qmusic_dancehours_nonzero | 97 | 0.432990 | 0.498063 | 0 | 0 | 0 | 1 | 1 |
qmusic_instrumenthours_nonzero | 97 | 0.422680 | 0.496552 | 0 | 0 | 0 | 1 | 1 |
qmusic_drumhours_nonzero | 97 | 0.061856 | 0.242145 | 0 | 0 | 0 | 0 | 1 |
qmusic_gamehoursall_nonzero | 97 | 0.484536 | 0.502357 | 0 | 0 | 0 | 1 | 1 |
qmusic_gamehoursdrumsticks_nonzero | 97 | 0.195876 | 0.398935 | 0 | 0 | 0 | 0 | 1 |
len(scales.qmusic_dancelevel[scales.qmusic_dancelevel==0])
46
search= "yn$"
cols=concat_matches(scales, search)
#cols.to_csv('qualifiers_yn.csv')
print(search+'\n------')
cols.apply(pd.value_counts).T
dfo.to_csv('csv_dfo_929.csv')
x = dfo.xs('isip5_avgprev3', axis=1).lagdev_avgprev3sq_local
y = dfo.xs('isip8_avgprev3', axis=1).lagdev_avgprev3sq_local
x.corr(y)
0.64749740485689034
dfscatter = pd.concat([x, y], keys=['isip500local', 'isip800local'], axis=1)
#dfscatter_inverse = 1.0 / dfscatter
dfscatter.plot(x=0, y=1, kind='scatter')
#print((1 / x).corr(1 / y))
#(1 / dfscatter).plot(x=0, y=1, kind='scatter')
dfscatter.sort(columns='isip500local')
isip500local | isip800local | |
---|---|---|
019 | 1.458044 | 2.409286 |
075 | 1.512254 | 2.097847 |
040 | 1.520219 | 2.595587 |
032 | 1.713234 | 1.580831 |
119 | 1.736570 | 2.134176 |
030 | 1.882929 | 2.266210 |
054 | 1.924084 | 2.853508 |
110 | 1.988980 | 2.310470 |
113 | 2.020436 | 1.798715 |
057 | 2.115470 | 2.445437 |
048 | 2.126246 | 1.826718 |
018 | 2.180362 | 3.365396 |
081 | 2.205461 | 2.434559 |
078 | 2.206225 | 2.207061 |
013 | 2.230631 | 1.679067 |
121 | 2.288308 | 3.344639 |
038 | 2.323221 | 3.113442 |
041 | 2.336903 | 2.582472 |
066 | 2.344056 | 2.374577 |
059 | 2.374586 | 3.266615 |
034 | 2.412615 | 2.204436 |
116 | 2.422717 | 2.186702 |
120 | 2.472802 | 2.712842 |
098 | 2.520819 | 2.484190 |
027 | 2.545177 | 2.032936 |
112 | 2.548912 | 3.962077 |
107 | 2.553538 | 3.174529 |
011 | 2.561063 | 2.819736 |
091 | 2.575460 | 2.433766 |
061 | 2.584169 | 2.327133 |
... | ... | ... |
108 | 3.266065 | 2.781837 |
102 | 3.287046 | 1.881973 |
092 | 3.349463 | 2.871268 |
026 | 3.356011 | 1.784634 |
118 | 3.365157 | 3.464296 |
085 | 3.379907 | 3.044101 |
020 | 3.411905 | 4.007090 |
097 | 3.421431 | 2.735617 |
062 | 3.432983 | 4.354812 |
086 | 3.493859 | 3.497730 |
037 | 3.518156 | 2.877774 |
055 | 3.520752 | 2.811153 |
090 | 3.530500 | 2.866428 |
044 | 3.543828 | 3.340074 |
046 | 3.574300 | 3.018440 |
021 | 3.592341 | 2.720564 |
087 | 3.655366 | 2.490506 |
012 | 3.720961 | 2.964895 |
079 | 3.901361 | 5.243465 |
094 | 4.091857 | 2.603532 |
071 | 4.112043 | 4.450198 |
072 | 4.185346 | 3.492831 |
067 | 4.214285 | 3.106370 |
073 | 4.296935 | 4.520292 |
065 | 4.406004 | 4.210819 |
036 | 4.641768 | 5.366874 |
114 | 4.898744 | 4.989202 |
069 | 4.905948 | 4.665402 |
077 | 4.959826 | 4.180945 |
049 | 5.224991 | 7.989958 |
102 rows × 2 columns
lookup = lambda text: [c for c in dfo_flat.columns
if text in c]
lookup2 = lambda tup: set(lookup(tup[0])).intersection(set(lookup(tup[1])))
lookup2(('isip5', 'ints_count'))
{'isip5_avgprev2_x_ints_count', 'isip5_avgprev3_x_ints_count', 'isip5_avgprev4_x_ints_count', 'isip5_avgprev_12_x_ints_count', 'isip5_lag2_x_ints_count'}
compare = pd.concat([scales, sms_jitter, sms_ticks, dfo_isip8], axis=1)
#scales.append(sms_jitter) #['tick_index'] = sms_ticks_index
#scales.append(sms_ticks)
#scales['jit_index'] = sms_jitter_index
compare
session_day | session_time | session_isfemale | exclusion_jitterlinearmissing | exclusion_rhythmadminerror | sex_femalezero | participant_age | calc_wasivocab_totalrawscore | calc_wasimatrix_totalscore | calc_agecolumn | ... | ints_count | ints_mean | ints_variance | ints_stdev | lag2devsq_sum | lag2devsq_count | lag2devsq_mean | lag2devsq_local_sq_abs | lag2devsq_local | lag2devsq_drift | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
011 | 20140226 | 2:40pm | 1 | 1 | 0 | 0 | 19 | 40 | 24 | 2 | ... | 128 | 776.994000 | 883.209799 | 29.718846 | 208781.623552 | 126 | 1656.997012 | 828.498506 | 3.704488 | 0.951965 |
012 | 20140226 | 4:00pm | 1 | 1 | 0 | 0 | 57 | 33 | 18 | 8 | ... | 120 | 831.791733 | 986.297632 | 31.405376 | 176171.441216 | 118 | 1492.978315 | 746.489158 | 3.284711 | 1.861734 |
013 | 20140227 | 8:00am | 0 | 1 | 1 | 1 | 21 | 42 | 20 | 3 | ... | 114 | 834.087930 | 338.276056 | 18.392283 | 60632.582384 | 106 | 572.005494 | 286.002747 | 2.027558 | 0.866818 |
015 | 20140228 | 1:00pm | 1 | 0 | 0 | 0 | 21 | 37 | 15 | 3 | ... | 137 | 702.533956 | 2430.437702 | 49.299470 | 302872.981344 | 131 | 2312.007491 | 1156.003746 | 4.839632 | 5.081493 |
016 | 20140303 | 9:10am | 0 | 0 | 0 | 1 | 19 | 48 | 22 | 2 | ... | 112 | 811.894107 | 887.776090 | 29.795572 | 169532.034352 | 108 | 1569.741059 | 784.870529 | 3.450640 | 1.249453 |
017 | 20140303 | 10:30am | 0 | 0 | 0 | 1 | 23 | 41 | 23 | 3 | ... | 109 | 844.185541 | 1740.213950 | 41.715872 | 222698.851904 | 107 | 2081.297681 | 1040.648841 | 3.821327 | 3.133114 |
018 | 20140303 | 1:00pm | 0 | 0 | 0 | 1 | 19 | 37 | 22 | 2 | ... | 114 | 807.715930 | 1034.252108 | 32.159790 | 260203.336288 | 112 | 2323.244074 | 1161.622037 | 4.219624 | NaN |
019 | 20140303 | 2:20pm | 0 | 0 | 0 | 1 | 19 | 40 | 23 | 2 | ... | 116 | 794.750414 | 676.364586 | 26.007010 | 129798.102080 | 114 | 1138.579843 | 569.289921 | 3.002175 | 1.302005 |
020 | 20140303 | 3:37pm | 0 | 0 | 0 | 1 | 18 | 41 | 20 | 2 | ... | 111 | 819.063459 | 2114.370996 | 45.982290 | 308034.627472 | 107 | 2878.828294 | 1439.414147 | 4.632072 | 3.171907 |
021 | 20140304 | 9:40am | 1 | 0 | 0 | 0 | 19 | 39 | 17 | 2 | ... | 121 | 761.102909 | 1388.895981 | 37.267895 | 140437.019936 | 119 | 1180.143025 | 590.071512 | 3.191604 | 3.713490 |
022 | 20140304 | 12:30nn | 1 | 0 | 0 | 0 | 23 | 42 | 19 | 3 | ... | 109 | 823.375670 | 511.808289 | 22.623180 | 66120.177264 | 105 | 629.715974 | 314.857987 | 2.155060 | 1.704434 |
024 | 20140304 | 3:20pm | 1 | 0 | 0 | 0 | 18 | 33 | 23 | 2 | ... | 124 | 741.422387 | 903.318885 | 30.055264 | 147698.985472 | 122 | 1210.647422 | 605.323711 | 3.318396 | 2.328300 |
025 | 20140304 | 4:50pm | 0 | 0 | 0 | 1 | 21 | 36 | 20 | 3 | ... | 105 | 873.708838 | 2234.780431 | 47.273464 | 125167.630608 | 103 | 1215.219715 | 607.609857 | 2.821277 | 4.616893 |
026 | 20140305 | 8:00am | 0 | 0 | 0 | 1 | 20 | 35 | 23 | 3 | ... | 117 | 787.017641 | 326.880811 | 18.079845 | 59249.180048 | 115 | 515.210261 | 257.605131 | 2.039355 | 1.057562 |
027 | 20140305 | 9:10am | 0 | 0 | 0 | 1 | 19 | 39 | 22 | 2 | ... | 112 | 822.994571 | 522.639055 | 22.861300 | 97856.858672 | 110 | 889.607806 | 444.803903 | 2.562638 | 1.071990 |
028 | 20140305 | 3:40pm | 0 | 0 | 0 | 1 | 18 | 44 | 23 | 2 | ... | 122 | 753.348820 | 667.002049 | 25.826383 | 133482.370400 | 120 | 1112.353087 | 556.176543 | 3.130475 | 1.397410 |
029 | 20140306 | 8:20am | 1 | 0 | 0 | 0 | 20 | 39 | 24 | 3 | ... | 115 | 804.279235 | 1867.368872 | 43.213064 | 394605.813712 | 113 | 3492.086847 | 1746.043424 | 5.195420 | 1.369522 |
030 | 20140306 | 12:40nn | 1 | 0 | 0 | 0 | 43 | 46 | 23 | 6 | ... | 108 | 853.777815 | 806.482565 | 28.398637 | 120379.428128 | 106 | 1135.654982 | 567.827491 | 2.791023 | 1.809425 |
032 | 20140306 | 3:30pm | 1 | 0 | 0 | 0 | 18 | 41 | 23 | 2 | ... | 117 | 792.519453 | 280.617415 | 16.751639 | 40211.747744 | 115 | 349.667372 | 174.833686 | 1.668409 | 1.297775 |
033 | 20140307 | 1:00pm | 0 | 0 | 0 | 1 | 19 | 43 | 24 | 2 | ... | 116 | 795.213966 | 677.040070 | 26.019994 | 141114.364880 | 114 | 1237.845306 | 618.922653 | 3.128486 | 0.958670 |
034 | 20140307 | 2:20pm | 0 | 0 | 0 | 1 | 19 | 39 | 21 | 2 | ... | 114 | 812.306596 | 513.101841 | 22.651751 | 98116.291088 | 112 | 876.038313 | 438.019157 | 2.576479 | 1.066719 |
035 | 20140310 | 10:30am | 1 | 0 | 0 | 0 | 20 | 42 | 22 | 3 | ... | 116 | 778.160310 | 1199.769973 | 34.637696 | 236568.727616 | 112 | 2112.220782 | 1056.110391 | 4.176241 | 1.540275 |
036 | 20140310 | 2:20pm | 1 | 0 | 0 | 0 | 19 | 32 | 19 | 2 | ... | 129 | 703.468155 | 2774.862029 | 52.676959 | 571834.011472 | 125 | 4574.672092 | 2287.336046 | 6.798617 | 3.138733 |
037 | 20140311 | 9:40am | 1 | 0 | 0 | 0 | 18 | 41 | 28 | 2 | ... | 119 | 775.521647 | 1463.407121 | 38.254505 | 139151.014144 | 117 | 1189.324907 | 594.662454 | 3.144426 | 3.800600 |
038 | 20140311 | 3:10pm | 0 | 0 | 0 | 1 | 18 | 41 | 22 | 2 | ... | 112 | 807.224500 | 981.703129 | 31.332142 | 141240.152480 | 108 | 1307.779190 | 653.889595 | 3.167801 | 2.242947 |
039 | 20140312 | 12:00nn | 0 | 0 | 0 | 1 | 21 | 27 | 25 | 3 | ... | 120 | 763.926033 | 950.985383 | 30.838051 | 212786.534608 | 118 | 1803.275717 | 901.637859 | 3.930653 | 0.919563 |
040 | 20140325 | 5:10pm | 0 | 0 | 0 | 1 | 21 | 39 | 26 | 3 | ... | 118 | 765.104644 | 914.416946 | 30.239328 | 109097.011056 | 114 | 956.991325 | 478.495663 | 2.859026 | 2.728872 |
041 | 20140326 | 1:00pm | 1 | 0 | 0 | 0 | 23 | 40 | 23 | 3 | ... | 119 | 770.669849 | 579.816556 | 24.079380 | 102163.465424 | 117 | 873.192012 | 436.596006 | 2.711262 | 1.552867 |
043 | 20140328 | 11:00am | 1 | 0 | 0 | 0 | 19 | 29 | 15 | 2 | ... | 132 | 701.174636 | 1999.840545 | 44.719577 | 188007.440288 | 130 | 1446.211079 | 723.105540 | 3.835082 | 5.095939 |
044 | 20140331 | 10:30am | 1 | 0 | 0 | 0 | 18 | 40 | 22 | 2 | ... | 111 | 827.880505 | 1191.182247 | 34.513508 | 164717.853136 | 109 | 1511.172965 | 755.586482 | 3.320278 | 2.521008 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
091 | 20140430 | 10:30am | 1 | 0 | 0 | 0 | 23 | 40 | 23 | 3 | ... | 131 | 704.692244 | 760.246930 | 27.572576 | 120765.580960 | 129 | 936.167294 | 468.083647 | 3.070169 | 2.425567 |
092 | 20140430 | 12:00nn | 1 | 0 | 0 | 0 | 20 | 36 | 21 | 3 | ... | 120 | 764.673100 | 1090.989115 | 33.030124 | 146268.311696 | 118 | 1239.561964 | 619.780982 | 3.255692 | 2.838772 |
093 | 20140430 | 1:30pm | 1 | 0 | 0 | 0 | 19 | 34 | 20 | 2 | ... | 117 | 779.269573 | 886.952485 | 29.781748 | 135531.019856 | 113 | 1199.389556 | 599.694778 | 3.142515 | 2.174944 |
094 | 20140501 | 7:35am | 1 | 0 | 0 | 0 | 22 | 35 | 20 | 3 | ... | 81 | 945.213432 | 1126.350208 | 33.561141 | 103098.920160 | 60 | 1718.315336 | 859.157668 | 3.101034 | 1.729348 |
095 | 20140501 | 9:00am | 0 | 0 | 0 | 1 | 19 | 43 | 21 | 2 | ... | 115 | 795.117287 | 774.746157 | 27.834262 | 103241.385440 | 113 | 913.640579 | 456.820290 | 2.688076 | 2.242496 |
096 | 20140501 | 10:35am | 1 | 0 | 0 | 0 | 22 | 40 | 26 | 3 | ... | 105 | 874.146171 | 1060.140879 | 32.559805 | 154800.927216 | 103 | 1502.921623 | 751.460812 | 3.135950 | 2.009880 |
097 | 20140502 | 9:00am | 0 | 0 | 0 | 1 | 19 | 39 | 21 | 2 | ... | 115 | 800.441391 | 824.802565 | 28.719376 | 168980.032944 | 113 | 1495.398522 | 747.699261 | 3.416126 | 1.097001 |
098 | 20140502 | 10:25am | 0 | 0 | 0 | 1 | 21 | 38 | 26 | 3 | ... | 118 | 777.291593 | 567.512488 | 23.822521 | 130337.320464 | 116 | 1123.597590 | 561.798795 | 3.049344 | 0.307521 |
099 | 20140502 | 1:00pm | 0 | 0 | 0 | 1 | 19 | 36 | 18 | 2 | ... | 124 | 741.676419 | 2017.520730 | 44.916820 | 287594.460320 | 122 | 2357.331642 | 1178.665821 | 4.628933 | 3.905071 |
100 | 20140502 | 2:50pm | 1 | 0 | 0 | 0 | 18 | 39 | 25 | 2 | ... | 118 | 775.535661 | 1054.056078 | 32.466230 | 173694.763488 | 116 | 1497.368651 | 748.684325 | 3.528155 | 2.253267 |
101 | 20140502 | 5:35pm | 0 | 0 | 0 | 1 | 19 | 40 | 19 | 2 | ... | 126 | 726.387333 | 1340.031487 | 36.606441 | 134605.189536 | 124 | 1085.525722 | 542.762861 | 3.207279 | 3.887175 |
102 | 20140503 | 9:30am | 1 | 0 | 0 | 0 | 18 | 41 | 21 | 2 | ... | 114 | 804.779053 | 531.380717 | 23.051697 | 72300.804720 | 112 | 645.542899 | 322.771450 | 2.232394 | 1.794693 |
103 | 20140503 | 10:50am | 0 | 0 | 0 | 1 | 21 | 41 | 20 | 3 | ... | 116 | 790.648517 | 869.395279 | 29.485510 | 160461.065344 | 114 | 1407.553205 | 703.776602 | 3.355320 | 1.627688 |
104 | 20140505 | 1:30pm | 1 | 0 | 0 | 0 | 21 | 29 | 21 | 3 | ... | 113 | 799.264850 | 774.593421 | 27.831518 | 117943.966224 | 109 | 1082.054736 | 541.027368 | 2.910174 | 1.912116 |
105 | 20140505 | 5:50pm | 1 | 0 | 0 | 0 | 19 | 37 | 22 | 2 | ... | 123 | 744.315415 | 1875.171194 | 43.303247 | 273473.501568 | 121 | 2260.111583 | 1130.055792 | 4.516406 | 3.667370 |
107 | 20140506 | 12:00nn | 0 | 0 | 0 | 1 | 20 | 43 | 21 | 3 | ... | 113 | 810.179221 | 935.669667 | 30.588718 | 192996.431600 | 111 | 1738.706591 | 869.353295 | 3.639293 | 1.005146 |
108 | 20140507 | 12:00nn | 1 | 0 | 0 | 0 | 19 | 37 | 19 | 2 | ... | 118 | 779.401288 | 886.402550 | 29.772513 | 160667.758368 | 116 | 1385.066882 | 692.533441 | 3.376442 | 1.786460 |
109 | 20140507 | 1:30pm | 1 | 0 | 0 | 0 | 21 | 34 | 22 | 3 | ... | 122 | 748.960230 | 1078.115928 | 32.834676 | 232252.546624 | 120 | 1935.437889 | 967.718944 | 4.153516 | 1.402878 |
110 | 20140507 | 3:00pm | 0 | 0 | 0 | 1 | 19 | 47 | 21 | 2 | ... | 114 | 807.640421 | 530.966500 | 23.042710 | 113958.267760 | 112 | 1017.484534 | 508.742267 | 2.792742 | 0.583708 |
111 | 20140507 | 5:00pm | 1 | 0 | 0 | 0 | 20 | 41 | 26 | 3 | ... | 103 | 896.778214 | 1346.015472 | 36.688084 | 180413.440000 | 101 | 1786.271683 | 893.135842 | 3.332527 | 2.373047 |
112 | 20140508 | 12:20nn | 1 | 0 | 0 | 0 | 20 | 30 | 19 | 3 | ... | 125 | 730.919936 | 2441.423644 | 49.410764 | 332315.308624 | 123 | 2701.750477 | 1350.875238 | 5.028493 | 4.518067 |
113 | 20140508 | 2:00pm | 0 | 0 | 0 | 1 | 19 | 33 | 20 | 2 | ... | 114 | 807.840737 | 830.985874 | 28.826826 | 59596.038976 | 112 | 532.107491 | 266.053745 | 2.019105 | 2.942201 |
114 | 20140508 | 5:00pm | 1 | 0 | 0 | 0 | 19 | 44 | 25 | 2 | ... | 117 | 760.652444 | 1941.435481 | 44.061724 | 309333.439024 | 111 | 2786.787739 | 1393.393869 | 4.907391 | 3.077659 |
115 | 20140508 | 6:30pm | 1 | 0 | 0 | 0 | 19 | 47 | 22 | 2 | ... | 119 | 777.876975 | 492.878399 | 22.200865 | 108679.512016 | 117 | 928.884718 | 464.442359 | 2.770480 | 0.685526 |
116 | 20140509 | 7:30am | 0 | 0 | 0 | 1 | 20 | 39 | 27 | 3 | ... | 113 | 794.877381 | 431.205669 | 20.765492 | 115885.327472 | 109 | 1063.168142 | 531.584071 | 2.900587 | NaN |
117 | 20140509 | 10:30am | 1 | 0 | 0 | 0 | 19 | 34 | 20 | 2 | ... | 78 | 931.637744 | 1463.459871 | 38.255194 | 127423.589056 | 56 | 2275.421233 | 1137.710617 | 3.620502 | 1.937290 |
118 | 20140509 | 12:00nn | 0 | 0 | 0 | 1 | 20 | 42 | 22 | 3 | ... | 109 | 831.885431 | 1311.735249 | 36.217886 | 243292.114656 | 105 | 2317.067759 | 1158.533879 | 4.091578 | 1.487880 |
119 | 20140509 | 3:00pm | 1 | 0 | 0 | 0 | 19 | 38 | 23 | 2 | ... | 111 | 832.191856 | 550.818229 | 23.469517 | 99282.943600 | 109 | 910.852694 | 455.426347 | 2.564399 | 1.173633 |
120 | 20140509 | 4:30pm | 0 | 0 | 0 | 1 | 24 | 33 | 24 | 3 | ... | 121 | 758.141686 | 666.742129 | 25.821350 | 148368.920576 | 119 | 1246.797652 | 623.398826 | 3.293310 | 0.868382 |
121 | 20140509 | 7:30pm | 1 | 0 | 0 | 0 | 18 | 36 | 20 | 2 | ... | 105 | 870.896114 | 1675.054895 | 40.927435 | 267651.276272 | 103 | 2598.556080 | 1299.278040 | 4.138898 | 2.225864 |
102 rows × 235 columns