#!/usr/bin/env python
# coding: utf-8
# In[104]:
#from __future__ import print_function
import numpy as np
import pandas as pd
from collections import OrderedDict #sorting participant df dict before pd.concat()
import matplotlib.pylab as plt
get_ipython().run_line_magic('matplotlib', 'inline')
pd.options.display.mpl_style = 'default'
import cPickle as pickle
# Participant IDs excluded from every analysis, grouped by exclusion reason.
pilot_data = ['010', '011', '012', '013', '014',]
non_english_fluent = ['023', '031', '045', '050', '070', '106',]
left_handed = ['042', '088',]
pro_inst_skill = ['026', '037']  # professional-level instrument skill
# combined list used below to drop rows from the main DataFrame
excluded_all_tasks = pilot_data + non_english_fluent + left_handed + pro_inst_skill
# ## Utility functions
# In[105]:
def col_matches(df, regex):
'returns a list of columns in a df that match a regex string.'
import re
cols = list(enumerate(df.columns))
matches = [c for (i, c) in cols
if re.findall(regex, c)]
return matches
def compare_transformations(df, columns, functions, **kwargs):
    """Histogram the raw columns, then each transformed version of them.

    functions maps a display name to a transformation applied element-wise;
    kwargs are forwarded to DataFrame.hist().
    """
    subset = df[columns]
    print('raw')
    subset.hist(**kwargs)
    plt.show()
    for label, transform in functions.items():
        print(label)
        subset.apply(transform).hist(**kwargs)
        plt.show()
def quickcompare(r, df, size=(15, 7)):
    """Shorthand: histogram-compare inverse/log1p/sqrt transforms of the
    columns of df whose names match regex r."""
    transforms = {
        'inverse': lambda x: 1.0 / x,
        'log1p': np.log1p,
        'sqrt': np.sqrt,
    }
    return compare_transformations(df, col_matches(df, r), transforms,
                                   figsize=size)
# using this for inline documentation so that it's clear
# that the printing statement isn't part of the necessary
# transformation code.
def html_print(df):
    """Render df as an IPython HTML object; coerce to DataFrame if needed."""
    from IPython.display import HTML
    try:
        markup = df.to_html()
    except AttributeError:
        # not a DataFrame (e.g. a Series) -- wrap it first
        markup = pd.DataFrame(df).to_html()
    return HTML(markup)
def htmljoin(df_list, delimiter=''):
    """Concatenate the HTML renderings of several DataFrames into one HTML object."""
    from IPython.display import HTML
    pieces = [frame.to_html() for frame in df_list]
    return HTML(delimiter.join(pieces))
# NOTE(review): byte-identical duplicate of the col_matches defined earlier
# in this file; this redefinition shadows the first. Consider deleting one.
def col_matches(df, regex):
    # Return the labels of the df columns whose names match the regex.
    import re
    cols = list(enumerate(df.columns))
    matches = [c for (i, c) in cols
               if re.findall(regex, c)]
    return matches
def concat_matches(df, *args):
    """Return a DataFrame of all df columns matching any of the given regexes.

    Returns None when no regexes are given, the single matching sub-frame
    when one is, and the column-wise concat otherwise.
    """
    # guard against an accidental empty regex, which would match everything
    assert all(len(r) for r in args)
    # (unused `import re` removed -- the matching happens in col_matches)
    col_match_lists = [col_matches(df, regex) for regex in args]
    col_set = [df[matches] for matches in col_match_lists]
    if not col_set:
        return None
    if len(col_set) == 1:
        return col_set[0]
    return pd.concat(col_set, axis=1)
def show_frames(frame_list, delimiter=''):
    """Render several DataFrames as a single IPython HTML object.

    When delimiter has exactly one element per frame, each element is used
    as a tag printed above its frame; otherwise delimiter is used as a
    plain separator string between the rendered tables.

    FIX(review): the per-item template was a bare multi-line single-quoted
    string (a SyntaxError as exported); reconstructed from its literal
    newlines as tag-then-table. Confirm any HTML wrapper that the original
    notebook had around the two placeholders.
    """
    from IPython.display import HTML
    if len(frame_list) == len(delimiter):
        item_template = '\n{}\n{}\n'
        html_out = ""
        for frame, tag in zip(frame_list, delimiter):
            html_out += item_template.format(tag, frame.to_html())
        return HTML(html_out)
    else:
        html_out = [df.to_html() for df in frame_list]
        return HTML(delimiter.join(html_out))
def hist_all(df, *args, **kwargs):
    """Print each column name and show its histogram, one figure per column.

    Asks for confirmation first when there are more than 30 columns
    (raw_input: this module is Python 2).
    """
    numcols = len(df.columns)
    if numcols > 30:
        yn = raw_input(str(numcols) + " columns. Proceed?")
        if 'n' in yn:
            return None
    for c in df:
        print(c)
        # FIX: narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; only real plotting failures are skipped.
        try:
            plt.hist(df[c])
            plt.show()
        except Exception:
            print("\t(can't histogram this)\n")
def scatter_all(df, print_max=None, *args, **kwargs):
    """Scatter-plot every pair of columns in df.

    When print_max is truthy, also print the first print_max rows of each
    pair sorted by x and then by y (to inspect the extreme cases).
    Asks for confirmation above 6 columns (raw_input: Python 2).
    """
    from itertools import combinations
    numcols = len(df.columns)
    if numcols > 6:
        yn = raw_input(str(numcols) + " columns. Proceed?")
        if 'n' in yn:
            return None
    for c in combinations(df.columns, 2):
        print(c)
        dfc = pd.concat([df[c[0]], df[c[1]]], axis=1)
        # FIX: narrowed from a bare `except:`
        try:
            dfc.plot(kind='scatter', x=0, y=1)
            plt.show()
        except Exception:
            print("can't plot")
        if print_max:
            # FIX: sorts hoisted under the print_max branch -- they were
            # computed for every pair even when never printed.
            # (df.sort(columns=...) is the pre-0.17 pandas API used file-wide.)
            print(dfc.sort(columns=dfc.columns[0], inplace=False).head(print_max))
            print(dfc.sort(columns=dfc.columns[1], inplace=False).head(print_max))
# ## Importing data, exporting partial info to CSV for SPSS
# In[106]:
# Load the flattened per-participant DataFrame pickled by an upstream notebook.
pfilenames = "c:/db_pickles/pickle - dfo-{measure} - {updated}.pickle"
full_updated = '2014-10-13a'
#pfile = pfilenames.format(measure='full', updated=full_updated)
pfile = pfilenames.format(measure='flat', updated=full_updated)
print(pfile)
# NOTE(review): text-mode open works with cPickle on Python 2 only;
# Python 3 would need open(pfile, 'rb').
with open(pfile) as f:
    dfo = pickle.load(f)
#for quick searches later
match = lambda x: concat_matches(dfo, x)
# 77777 is this project's missing-data sentinel (both numeric and string forms)
dfo = dfo.replace(77777, np.nan)
dfo = dfo.replace('77777', np.nan)
#task_pids = {k: sorted(set(v.index.get_level_values('pid')))
#             for (k, v) in task_frames.items()}
# drop the globally excluded participants (pilot / language / handedness / pro skill)
to_drop = set(dfo.index).intersection(excluded_all_tasks)
dfo = dfo.drop(to_drop)
# In[107]:
dfo.count()
# In[108]:
match('order').T
# In[109]:
pasted_scales = '''
# the only 'order' variable needed when just looking at ISIP tasks
SCAL_order_500ms_first
SCAL_sex_femalezero
SCAL_orders_iso
SCAL_orders_phase
SCAL_orders_linear
SCAL_calc_wasivocab_tscore
SCAL_calc_wasimatrix_tscore
SCAL_calc_wasi_tscore_total
SCAL_calc_fsiq2
SCAL_calc_bfi_extraversion
SCAL_calc_bfi_agreeableness
SCAL_calc_bfi_conscientiousness
SCAL_calc_bfi_neuroticism
SCAL_calc_bfi_openness
# compare with usefulness of constructed index
SCAL_qmusic_dancelevel
SCAL_qmusic_instrumentlevel
SCAL_qmusic_drumlevel
SCAL_qmusic_behaviors_12_friendstaste # comment
SCAL_qmusic_behaviors_13_sharingint
SCAL_qmusic_behaviors_14_getinterest
'''
pasted_isip = '''
#from: list(match('local$|drift$').columns)
I5P4_lagdev_local
I8P4_lagdev_local
I8P4_lagdev_drift
I5P4_lagdev_drift
I8L2_lag2devsq_local
I5L2_lag2devsq_local
I8L2_lag2devsq_drift
I5L2_lag2devsq_drift
#needed for filtering out a P that didn't do many taps
I8P4_ints_count
I5P4_ints_count
I8L2_ints_count
I5L2_ints_count
'''
pasted_sms = '''
'''
def clean_pasted_vars(pstring):
    """Parse a pasted block of variable names into a clean list of names.

    Drops '#' comments (whole-line or trailing), strips whitespace, and
    removes blank lines.

    FIX: returns a real list. The original returned filter(), which on
    Python 3 is a lazy iterator and breaks the dfo[...] column indexing
    done with the result; on Python 2 filter() already returned a list,
    so this is backward compatible.
    """
    cleaned = (line.split('#')[0].strip() for line in pstring.split('\n'))
    return [name for name in cleaned if name]
# Select the pasted scale and ISIP variables, then shorten the ISIP names.
df_scales = dfo[clean_pasted_vars(pasted_scales)]
df_isip = dfo[clean_pasted_vars(pasted_isip)]
df_isip = df_isip.rename(columns=lambda x: x.replace('lagdev_',""))
df_isip = df_isip.rename(columns=lambda x: x.replace('lag2devsq_',""))
# Express deviations as a percentage of the target inter-stimulus interval.
for c in ['I8P4_drift', 'I8P4_local']:
    ISI = 800  # ms (the I8* tasks)
    df_isip[c + 'perc'] = df_isip[c] * 100. / ISI
for c in ['I5P4_drift', 'I5P4_local']:
    ISI = 500  # ms (the I5* tasks)
    df_isip[c + 'perc'] = df_isip[c] * 100. / ISI
# In[110]:
df_isip.T
# In[111]:
# (missing values propagate in pandas arithmetic operations)
total_hours = (dfo.SCAL_qmusic_drumhours +
dfo.SCAL_qmusic_instrumenthours +
dfo.SCAL_qmusic_dancehours)
any_hours = (total_hours > 0).astype(int)
#skipna = False: if any missing values, produce a missing-value result
max_skill_level = pd.concat([dfo.SCAL_qmusic_dancelevel,
dfo.SCAL_qmusic_instrumentlevel,
dfo.SCAL_qmusic_drumlevel], axis=1).T.max(skipna=False)
sum_skill_level = pd.concat([dfo.SCAL_qmusic_dancelevel,
dfo.SCAL_qmusic_instrumentlevel,
dfo.SCAL_qmusic_drumlevel], axis=1).T.sum(skipna=False)
social_importance = pd.concat([dfo.SCAL_qmusic_behaviors_12_friendstaste,
dfo.SCAL_qmusic_behaviors_13_sharingint,
dfo.SCAL_qmusic_behaviors_14_getinterest,], axis=1).T.sum(skipna=False)
# (there are no missing values for these three vars)
df_constructed = pd.concat(axis=1,
objs=[any_hours,
max_skill_level,
sum_skill_level,
social_importance],
keys=['qmusic_calc_anyhours',
'qmusic_calc_maxskill',
'qmusic_calc_sumskill',
'qmusic_calc_socialimp'])
# In[112]:
df_constructed[df_constructed.qmusic_calc_maxskill.isnull()==True]
# In[113]:
def truncate(s):
    """Truncate outliers of Series s to mean +/- 2.97 SD, reporting each one.

    For 'DPsd' summary columns, downward truncation is unexpected (an SD
    summary should stay non-negative), so it warns on that path and the
    final result is asserted to be >= 0.

    FIX: Python-2-only `print` statements rewritten as parenthesized
    single-argument calls -- identical output on Python 2, valid on Python 3.
    """
    z_limit = 2.97
    maxval = s.mean() + z_limit * s.std()
    minval = s.mean() - z_limit * s.std()
    print("\n" + s.name)
    print("limits: {}, {}".format(maxval, minval))
    assert minval < s.mean() < maxval

    def truncval(val):
        # clamp one value into [minval, maxval], reporting when it happens
        tstr = "truncated {} to {}."
        if val > maxval:
            print(tstr.format(val, maxval))
            return maxval
        elif val < minval:
            print(tstr.format(val, minval))
            if 'DPsd' in s.name:
                print("WARNING: summary data should not have to be truncated in this direction.")
            return minval
        else:
            return val

    out = s.apply(truncval)
    if 'DPsd' in s.name:
        #print('checking...')
        assert out.min() >= 0
    return out
def test_trunc(s):
    """Visual check: histogram Series s before and after truncation.

    FIX: Python-2-only `print` statements rewritten as parenthesized
    single-argument calls (same output on Python 2, valid on Python 3).
    """
    print("Original")
    s.hist()
    plt.show()
    print("Truncated")
    truncate(s).hist()
    plt.show()
test_trunc(df_isip.I5P4_drift)
# In[114]:
drifts = concat_matches(df_isip, 'P4_drift$|local$').apply(truncate)
drifts.head(3)
# In[115]:
drifts.plot(kind='scatter', x=0,y=1)
#Interesting issue with p. 55 (the outlier on IP54_drift).
# It appears legitimate: in general the local variation is very
# small-- but there's a lot of variability, because the subject
# drifted way down to around 400ms, then jumped up to around 550
# immediately-- so there were only a couple of intervals where
# there was a big change from one interval to the next.
# especially if smoothing across four intervals.....
# In[116]:
df_isip_out = pd.DataFrame(index = df_isip.index)
for c in df_isip.columns:
if 'ints_count' in c:
df_isip_out[c] = df_isip[c]
else:
df_isip_out[c + '_trunc'] = truncate(df_isip[c])
#del df_isip[c]
# In[117]:
df_isip_out.T
# In[120]:
#list(match('DPm$|DPsd$'))
match('DP').T
# In[121]:
df_sms = match('DP')
df_sms_out = pd.DataFrame(index = df_sms.index)
for c in df_sms.columns:
trimname = 's_' + c[5:]
if ("DPct" in c) or ("DPm" in c):
df_sms_out[trimname] = df_sms[c]
else:
df_sms_out[trimname + '_trunc'] = truncate(df_sms[c])
df_sms_out.T
# In[122]:
df_nonzero_transformed = match('nonzero')
df_log_transformed = match('ln1p')
isip_using = ['I5P4_local_trunc',
'I8P4_local_trunc',
'I5P4_drift_trunc',
'I8P4_drift_trunc',]
df_log_isips = np.log(df_isip_out[isip_using])
df_log_isips.columns = [c + "_log" for c in df_log_isips.columns]
to_log = [c for c in df_sms_out if "DPsd" in c]
df_log_sms = np.log(df_sms_out[to_log])
df_log_sms.columns = [c + "_log" for c in df_log_sms.columns]
df_log_sms
# In[123]:
df_to_analyze = pd.concat(axis=1,
objs=[df_scales,
df_constructed,
df_log_transformed,
df_nonzero_transformed,
df_isip_out,
df_log_isips,
df_sms_out,
df_log_sms,
])
# In[124]:
#concat_matches(df_to_analyze, 'I?P4_local_trunc|I?P4_drift_trunc').T
concat_matches(df_to_analyze, 'log$').T
# In[125]:
# TO DO:
# Calculate z scores for each DPsd
# Calculate the mean of the two z scores for each 500/800 pairing
# See if the value of this still correlates with the 500-first/800-first order variable
remove_unused = [c for c in df_to_analyze.columns
if ( '_psk_' in c
or 's_lint_' in c
or 's_linj_' in c)]
for c in remove_unused:
del df_to_analyze[c]
to_combine = concat_matches(df_to_analyze, 'DPsd_trunc')
#for p in list(to_combine.columns):
#print(p)
for c in ['I5P4_local_trunc',
'I8P4_local_trunc',
'I5P4_drift_trunc',
'I8P4_drift_trunc',]:
to_combine[c] = df_to_analyze[c]
z_to_combine = (to_combine.mean() - to_combine) / to_combine.std()
#proper column-wise z score output was confirmed
df_to_analyze['IP4_local_trunc_mz58'] = ( z_to_combine['I5P4_local_trunc']
+ z_to_combine['I8P4_local_trunc']) / 2
df_to_analyze['IP4_drift_trunc_mz58'] = ( z_to_combine['I5P4_drift_trunc']
+ z_to_combine['I8P4_drift_trunc']) / 2
df_to_analyze['iso_j_DPsd_trunc_mz58'] = ( z_to_combine['s_iso5j_DPsd_trunc']
+ z_to_combine['s_iso8j_DPsd_trunc']) / 2
df_to_analyze['iso_t1_DPsd_trunc_mz58'] = ( z_to_combine['s_iso5t1_DPsd_trunc']
+ z_to_combine['s_iso8t1_DPsd_trunc']) / 2
df_to_analyze['iso_t2_DPsd_trunc_mz58'] = ( z_to_combine['s_iso5t2_DPsd_trunc']
+ z_to_combine['s_iso8t2_DPsd_trunc']) / 2
df_to_analyze['lin_j_DPsd_trunc_mz58'] = ( z_to_combine['s_lin5j_DPsd_trunc']
+ z_to_combine['s_lin8j_DPsd_trunc']) / 2
df_to_analyze['lin_t_DPsd_trunc_mz58'] = ( z_to_combine['s_lin5t_DPsd_trunc']
+ z_to_combine['s_lin8t_DPsd_trunc']) / 2
df_to_analyze['phase_j_nrm_DPsd_trunc_mz58'] = ( z_to_combine['s_phase5j_nrm_DPsd_trunc']
+ z_to_combine['s_phase8j_nrm_DPsd_trunc']) / 2
df_to_analyze['phase_j_psr_DPsd_trunc_mz58'] = ( z_to_combine['s_phase5j_psr_DPsd_trunc']
+ z_to_combine['s_phase8j_psr_DPsd_trunc']) / 2
df_to_analyze['phase_t_nrm_DPsd_trunc_mz58'] = ( z_to_combine['s_phase5t_nrm_DPsd_trunc']
+ z_to_combine['s_phase8t_nrm_DPsd_trunc']) / 2
df_to_analyze['phase_t_psr_DPsd_trunc_mz58'] = ( z_to_combine['s_phase5t_psr_DPsd_trunc']
+ z_to_combine['s_phase8t_psr_DPsd_trunc']) / 2
#null values propagate to new measure (confirmed)
#df_to_analyze['IP4_drift_trunc_mz58'][20:]
# In[126]:
update = {'measure': 'subset_to_spss',
'updated': '2014-10-15c'}
pfilenames = "c:/db_pickles/pickle - dfo-{measure} - {updated}.{ext}"
output_file_csv = pfilenames.format(measure=update['measure'],
updated=update['updated'],
ext="csv")
output_file_pickle = pfilenames.format(measure=update['measure'],
updated=update['updated'],
ext="pickle")
df_to_analyze.to_pickle(output_file_pickle)
dfo_to_analyze_missing_coded = df_to_analyze.replace(np.nan, '77777')
dfo_to_analyze_missing_coded.to_csv(output_file_csv)
print("\nSAVED: {}\n".format(output_file_csv))
df_to_analyze.T
# In[22]:
df_to_analyze.count().to_csv('non-null counts 2014-10-15b.csv')
# In[95]:
dfa = df_to_analyze
for p in concat_matches(df_to_analyze, '_log'): print p
# In[103]:
paste_1 = ('''
s_iso5t2_DPsd_trunc_log
s_iso8t2_DPsd_trunc_log
s_iso5j_DPsd_trunc_log
s_iso8j_DPsd_trunc_log
s_phase8j_psr_DPsd_trunc_log
s_phase8t_psr_DPsd_trunc_log
s_phase5j_psr_DPsd_trunc_log
s_phase5t_psr_DPsd_trunc_log
''')
design_1 = clean_pasted_vars(paste_1)
scatter_all(dfa[design_1])
# In[98]:
design_2 = clean_pasted_vars('''
s_lin5t_DPsd_trunc_log
s_lin8t_DPsd_trunc_log
s_lin5j_DPsd_trunc_log
s_lin8j_DPsd_trunc_log
''')
scatter_all(dfa[design_2])
# In[99]:
design_3 = clean_pasted_vars('''
I5P4_local_trunc_log
I8P4_local_trunc_log
I5P4_drift_trunc_log
I8P4_drift_trunc_log
''')
scatter_all(dfa[design_3])
# In[101]:
match('drumlevel').sort(columns='SCAL_qmusic_drumlevel').tail(20)
# In[102]:
match('instrumentlevel').sort(columns='SCAL_qmusic_instrumentlevel').median()
# In[207]:
def stack_rm_case(case_series):
    """Reshape one participant's repeated-measures row into long format.

    case_series: a Series whose .name is the participant id and whose index
    holds the 12 DPsd varnames. Returns a 12-row frame with columns
    original_varname, caseid, casedata, tasktype, targetioi, targetstim.

    FIXES: the casedata column referenced an undefined global `acase`
    instead of the `case_series` parameter (NameError outside the original
    notebook session); the tasktype dict assignments were duplicated
    verbatim; the unused total_stacked_vars constant is gone.
    """
    caseid = case_series.name
    # task family: 1 = isochronous, 2 = phase (psr), 3 = phase (nrm)
    tasktype = {
        's_iso5t2_DPsd_trunc_log': 1,
        's_iso8t2_DPsd_trunc_log': 1,
        's_iso5j_DPsd_trunc_log': 1,
        's_iso8j_DPsd_trunc_log': 1,
        's_phase8j_psr_DPsd_trunc_log': 2,
        's_phase8t_psr_DPsd_trunc_log': 2,
        's_phase5j_psr_DPsd_trunc_log': 2,
        's_phase5t_psr_DPsd_trunc_log': 2,
        's_phase5t_nrm_DPsd_trunc_log': 3,
        's_phase8t_nrm_DPsd_trunc_log': 3,
        's_phase5j_nrm_DPsd_trunc_log': 3,
        's_phase8j_nrm_DPsd_trunc_log': 3,
    }
    # target inter-onset interval: 0 = 500 ms tasks, 1 = 800 ms tasks
    targetioi = {
        's_iso5t2_DPsd_trunc_log': 0,
        's_iso8t2_DPsd_trunc_log': 1,
        's_iso5j_DPsd_trunc_log': 0,
        's_iso8j_DPsd_trunc_log': 1,
        's_phase8j_psr_DPsd_trunc_log': 1,
        's_phase8t_psr_DPsd_trunc_log': 1,
        's_phase5j_psr_DPsd_trunc_log': 0,
        's_phase5t_psr_DPsd_trunc_log': 0,
        's_phase5t_nrm_DPsd_trunc_log': 0,
        's_phase8t_nrm_DPsd_trunc_log': 1,
        's_phase5j_nrm_DPsd_trunc_log': 0,
        's_phase8j_nrm_DPsd_trunc_log': 1,
    }
    # target stimulus code: follows the t/j suffix of the varname
    # (presumably 0 = tone, 1 = jitter -- TODO confirm the coding)
    targetstim = {
        's_iso5t2_DPsd_trunc_log': 0,
        's_iso8t2_DPsd_trunc_log': 0,
        's_iso5j_DPsd_trunc_log': 1,
        's_iso8j_DPsd_trunc_log': 1,
        's_phase8j_psr_DPsd_trunc_log': 1,
        's_phase8t_psr_DPsd_trunc_log': 0,
        's_phase5j_psr_DPsd_trunc_log': 1,
        's_phase5t_psr_DPsd_trunc_log': 0,
        's_phase5t_nrm_DPsd_trunc_log': 0,
        's_phase8t_nrm_DPsd_trunc_log': 0,
        's_phase5j_nrm_DPsd_trunc_log': 1,
        's_phase8j_nrm_DPsd_trunc_log': 1,
    }
    caseid_repeated = {k: caseid for k in tasktype}
    stackedvars = pd.DataFrame({'caseid': caseid_repeated,
                                'casedata': case_series,
                                'tasktype': tasktype,
                                'targetioi': targetioi,
                                'targetstim': targetstim,
                                })
    stackedvars.index.name = 'original_varname'
    case_out = stackedvars.reset_index('original_varname')
    return case_out
repmeas = concat_matches(df_to_analyze, 'psr.*log|nrm.*log|iso.t2.*log|iso.j.*log')
cases = [stack_rm_case(repmeas.loc[p]) for p in repmeas.index]
stacked = pd.concat(cases, axis=0)
stacked.index = range(len(stacked))
stacked.index.name = "st_row"
stacked = stacked.reset_index('st_row')
stacked = stacked.set_index('caseid')
# In[208]:
df_to_analyze['SCAL_calc_fsiq2']
# In[219]:
df_to_analyze.loc['015', staticvar]
# In[214]:
staticvar = 'SCAL_calc_fsiq2'
ids = sorted(set(stacked.index))
for caseid in ids:
print(caseid)
stacked[staticvar] = np.nan
stacked.loc[caseid, staticvar] = df_to_analyze.loc[caseid, staticvar]
#slc = stacked.loc[stacked.caseid=='015']
#slc.somevarname = 'the_value'
#stacked.to_csv('stacked_test.csv')
stacked
# In[171]:
df_to_analyze.loc[caseid, staticvar]
# In[313]:
print("NULL VALUES (INCLUDING REMOVED FOR INCOMPLETE TAP SETS):\n\n")
for c in df_to_analyze:
print(c)
s = df_to_analyze[c]
print(list(s[s.isnull()].index))
print('')
# # After exporting CSV: looking at distributions here...
# ## Descriptives (into manuscript)
# In[209]:
dfa = df_to_analyze
get = lambda r: (list(concat_matches(dfo, r).columns), concat_matches(dfo, r))
geta = lambda r: (list(concat_matches(df_to_analyze, r).columns), concat_matches(df_to_analyze, r))
firstcol = lambda df: df.T.iloc[0]
firstcol(match('participant_age')).describe()
# In[157]:
sex = firstcol(match('sex_femalezero'))
is_female = (sex==0)
is_male = (sex==1)
assert is_female[is_female==True].count() == 60
assert is_female[is_female==False].count() == 39
assert is_male[is_male==True].count() == 39
assert is_male[is_male==False].count() == 60
# In[159]:
var1 = firstcol(match('participant_age'))
print (" females")
print firstcol(match('participant_age'))[is_female].describe()
print
print (" males")
print firstcol(match('participant_age'))[is_male].describe()
# In[171]:
names, df = get('white')
print 'female'
print df[is_female].sum()
print df[is_female].count()
print
print 'male'
print df[is_male].sum()
print df[is_male].count()
# In[133]:
match('participant_age').columns
#dfo['SCAL_participant_age'].name
# In[217]:
names, df = get('I?P4_ints_count')
df.describe()
# In[228]:
names, df = geta('I?P4_drift_trunc$')
df.describe()
# In[261]:
names, df = geta('s_.*DPsd_trunc$')
dtable = df.describe().T[:14]
reformat = np.round(dtable[['mean', 'std', 'count']], 4)
reformat
# In[264]:
names, df = geta('nrm_DPsd_trunc$|psr_DPsd_trunc$')
dtable = df.describe().T #[14:-6]
reformat = dtable[['mean', 'std', 'count']]
reformat
# In[269]:
names, df = get('DPsd')
dtable = df.describe().T[:14]
reformat = np.round(dtable[['mean', 'std', 'count']], 4)
reformat
# In[203]:
names, df = get('I5P4_ints_count')
df.std()
# ## var1
# In[76]:
dfa = df_to_analyze
matcha = lambda x: concat_matches(dfa, x)
isips = matcha('P4_drift_trunc|P4_local_trunc')
isips
# In[82]:
dfa = df_to_analyze
matcha = lambda x: concat_matches(dfa, x)
#isips = matcha('P4_drift_trunc|P4_local_trunc')
smscols = matcha('^s_.*DPsd_trunc$')
#scatter_all(isips, print_max=3)
#scatter_all(np.log(isips), print_max=3)
smscols.T
# In[47]:
dft1 = df_to_analyze['s_phase5t_s4a_DPm_trunc']
dft2 = df_to_analyze['s_phase8j_s4a_DPm_trunc']
#dft2.corr(dft1)
dft1.corr(dft2)
# In[64]:
#mna = match('5._DPm|8._DPm|5.2_DPm|8.2_DPm')
#mna.to_csv('perc_negative_asynchrony_20141008.csv')
# In[18]:
phase_sections_means = match('a_DPm|b_DPm')
phase_sections_sd = match('a_DPsd|b_DPsd')
match('nonzero').T
# In[4]:
#for c in range(35):
# s = phase_sections_sd.ix[:,c]
# m = phase_sections_means.ix[:,c]
# print phase_sections_sd.columns[c]
# print phase_sections_means.columns[c]
# print s.corr(m)
# In[4]:
#matchq('behaviors_')
pasted = '''
SCAL_sex_femalezero
SCAL_calc_wasivocab_totalrawscore
SCAL_calc_wasimatrix_totalscore
SCAL_calc_wasivocab_tscore
SCAL_calc_wasimatrix_tscore
SCAL_calc_wasi_tscore_total
SCAL_calc_fsiq2
SCAL_calc_bfi_extraversion
SCAL_calc_bfi_agreeableness
SCAL_calc_bfi_conscientiousness
SCAL_calc_bfi_neuroticism
SCAL_calc_bfi_openness
SCAL_session_taskorder
SCAL_order_500ms_first
SCAL_order_rhythmfirst
SCAL_qbasic_hearingdeficityn
SCAL_qbasic_injuriesyn
SCAL_qbasic_exerciseyn
SCAL_qbasic_neurodisorderyn
SCAL_qmusic_singingyn
SCAL_qmusic_singinghours --> NONZERO
SCAL_qmusic_singingtimes --> NONZERO
SCAL_qmusic_dancelevel --> LN1P
SCAL_qmusic_instrumentlevel --> good
SCAL_qmusic_dancehours --> NONZERO
SCAL_qmusic_instrumenthours --> NONZERO
SCAL_qmusic_danceyn
SCAL_qmusic_instrumentyn
SCAL_qmusic_gameyn
SCAL_qmusic_drumsyn
SCAL_qmusic_gamenames --> string
SCAL_qmusic_gamehoursall --> NONZERO
SCAL_qmusic_gamehoursdrumsticks --> NONZERO
SCAL_qmusic_drumstyles --> string
SCAL_qmusic_drumhours --> NONZERO
SCAL_qmusic_drumlevel --> NONZERO
SCAL_qmusic_behaviors_07_yourself --> LN1P
SCAL_qmusic_behaviors_08_otherprs --> LN1P
SCAL_qmusic_behaviors_09_danceprv --> LN1P
SCAL_qmusic_behaviors_10_dancepub --> NONZERO
SCAL_qmusic_behaviors_11_urgemove --> NONZERO
SCAL_qmusic_behaviors_12_friendstaste --> good
SCAL_qmusic_behaviors_13_sharingint --> good
SCAL_qmusic_behaviors_14_getinterest --> good
'''
tolist = pasted.split('\n')
nonzero = filter(lambda i: i.split(" ")[-1] == "NONZERO", tolist)
nonzero = [i.split(" ")[0] for i in nonzero]
assert len(nonzero) == pasted.count('NONZERO')
LN1P = filter(lambda i: i.split(" ")[-1] == "LN1P", tolist)
LN1P = [i.split(" ")[0] for i in LN1P]
assert len(LN1P) == pasted.count('LN1P')
tolist = [i.replace("--> good", "") for i in tolist]
tolist = filter(lambda i: "-->" not in i, tolist)
tolist = [i.strip() for i in tolist]
tolist = filter(lambda i: i != "", tolist)
LN1P
# In[5]:
match = lambda x: concat_matches(dfo, x)
df_q = match('SCAL_qbasic|SCAL_qmusic')
matchq = lambda x: concat_matches(df_q, x)
rnot = lambda r: '^((?!' + r + ').)*$'
#scales = concat_matches(scales, '^((?!notes).)*$') #hacky "does not contain 'notes' matcher
# In[6]:
scales_keep = dfo[['SCAL_qmusic_instrumentlevel',
'SCAL_qmusic_behaviors_12_friendstaste',
]]
plist = lambda l: '\n'.join(l)
print plist(match('SCAL_').columns)
#print('\n'.join(list(match('SCAL_').columns)))
# In[7]:
dfo['SCAL_orders_psh_first'] = (dfo.SCAL_orders_phase==0).astype(int)
dfo['SCAL_orders_lin_first'] = (dfo.SCAL_orders_linear==0).astype(int)
dfo['SCAL_orders_iso_first'] = (dfo.SCAL_orders_iso==0).astype(int)
match('orders').head(4).T
# In[8]:
dff = dfo[tolist]
dff.T
# In[9]:
match('order').T
# In[ ]:
hrs = dfo.SCAL_qmusic_danceyn
hrs[hrs > 0].count()
total = dfo.SCAL_qmusic_drumhours + dfo.SCAL_qmusic_instrumenthours + dfo.SCAL_qmusic_dancehours
def filter_outliers(series):
    """Drop values whose magnitude exceeds 3.29 SD (Tabachnik & Fidell's
    removable-outlier criterion).

    NOTE: compares abs(value) -- not the deviation from the mean -- against
    3.29 * std, matching the original behavior.
    """
    within_limit = np.abs(series) <= 3.29 * series.std()
    return series[within_limit]
trunc_count = 0
def truncate_outliers(series):
    """Clamp values beyond mean +/- 3.29 SD (Tabachnik & Fidell) and report.

    FIX: the original incremented a plain local `trunc_count` inside the
    nested trunc(), which raises UnboundLocalError the moment any value is
    actually truncated. Counted through a one-element list instead (works
    on both Python 2 and Python 3, unlike `nonlocal`).
    """
    maxval = series.mean() + 3.29 * series.std()
    minval = series.mean() - 3.29 * series.std()
    n_truncated = [0]  # mutable cell so trunc() can update the count

    def trunc(val):
        if val > maxval:
            n_truncated[0] += 1
            return maxval
        elif val < minval:
            n_truncated[0] += 1
            return minval
        else:
            return val

    s = series.apply(trunc)
    print('truncated {} of {} cases.'.format(n_truncated[0], len(s)))
    return s
truncate_outliers(total).hist()
# In[11]:
dfo
# In[62]:
#dfo.scales.bfi_item39.hist()
dfo.sms.phase5t_DPsd.apply(lambda x: 1/x).hist()
# ## SPSS SYNTAX GENERATION
# In[183]:
def variable_labels_syntax(varlist):
    """Build an SPSS VARIABLE LABELS command from (name, label) pairs."""
    entries = [" {var} '{label}'\n".format(var=name, label=label)
               for name, label in varlist]
    return "VARIABLE LABELS \n{vlist}.".format(vlist='\n'.join(entries))
#testing
print variable_labels_syntax(varlist = [("fff", "sssss")])
# In[165]:
# BFI (Big Five Inventory) scoring key: item numbers per factor;
# a trailing 'R' marks a reverse-scored item.
bfi={}
bfi['E'] = ['1', '6R', '11', '16', '21R', '26', '31R', '36']
bfi['A'] = ['2R', '7', '12R', '17', '22', '27R', '32', '37R', '42']
bfi['C'] = ['3', '8R', '13', '18R', '23R', '28', '33', '38', '43R']
bfi['N'] = ['4', '9R', '14', '19', '24R', '29', '34R', '39']
bfi['O'] = ['5', '10', '15', '20', '25', '30', '35R', '40', '41R', '44']
# Invert into {item_number: {'factor': ..., 'reverse_scored': ...}}.
bfi_score = {}
for k, v in bfi.items():
    for i in v:
        reverse_scored = 'R' in i
        if reverse_scored:
            i = i[:-1]  # strip the trailing 'R' before int conversion
        item = int(i)
        bfi_score[item] = {'factor': k,
                           'reverse_scored': reverse_scored}
bfi_score
# In[131]:
print('ALTER TYPE')
print(' (F8.2)\n'.join(others) + ' (F8.2)')
print('.')
# Oops - these aren't the values in the dfo_flat output. Need to do this there instead,
# or import from the CSV I made there.
# In[190]:
# Build (name, label) pairs for the BFI items (input for SPSS variable labels).
# FIX(review): the original loop body ended with a bare `return`, a
# SyntaxError at module level that also discarded the computed name/label;
# collecting the pairs into varlist appears to be the intent -- confirm.
varlist = []
for k, v in bfi_score.items():
    name = "SCAL_bfi_item" + str(k)
    factor = v['factor']
    label = "BFI item {n} ({f})".format(n=k, f=factor)
    varlist.append((name, label))
#bfi_vars