#!/usr/bin/env python
# coding: utf-8
# In[104]:
#from __future__ import print_function
import numpy as np
import pandas as pd
from collections import OrderedDict #sorting participant df dict before pd.concat()
import matplotlib.pylab as plt
get_ipython().run_line_magic('matplotlib', 'inline')
pd.options.display.mpl_style = 'default'
import cPickle as pickle
# Participant IDs excluded from every analysis, grouped by exclusion reason.
pilot_data = ['010', '011', '012', '013', '014',]
non_english_fluent = ['023', '031', '045', '050', '070', '106',]
left_handed = ['042', '088',]
pro_inst_skill = ['026', '037']  # professional-level instrument skill
# combined list used below to drop rows from the main DataFrame
excluded_all_tasks = pilot_data + non_english_fluent + left_handed + pro_inst_skill
# ## Utility functions
# In[105]:
def col_matches(df, regex):
'returns a list of columns in a df that match a regex string.'
import re
cols = list(enumerate(df.columns))
matches = [c for (i, c) in cols
if re.findall(regex, c)]
return matches
def compare_transformations(df, columns, functions, **kwargs):
    """Histogram the raw columns, then each transformed version of them.

    functions maps a display name to a transformation applied element-wise;
    kwargs are forwarded to DataFrame.hist().
    """
    subset = df[columns]
    print('raw')
    subset.hist(**kwargs)
    plt.show()
    for label, transform in functions.items():
        print(label)
        subset.apply(transform).hist(**kwargs)
        plt.show()
def quickcompare(r, df, size=(15, 7)):
    """Shorthand: histogram-compare inverse/log1p/sqrt transforms of the
    columns of df whose names match regex r."""
    transforms = {
        'inverse': lambda x: 1.0 / x,
        'log1p': np.log1p,
        'sqrt': np.sqrt,
    }
    return compare_transformations(df, col_matches(df, r), transforms,
                                   figsize=size)
# using this for inline documentation so that it's clear
# that the printing statement isn't part of the necessary
# transformation code.
def html_print(df):
    """Render df as an IPython HTML object; coerce to DataFrame if needed."""
    from IPython.display import HTML
    try:
        markup = df.to_html()
    except AttributeError:
        # not a DataFrame (e.g. a Series) -- wrap it first
        markup = pd.DataFrame(df).to_html()
    return HTML(markup)
def htmljoin(df_list, delimiter=''):
    """Concatenate the HTML renderings of several DataFrames into one HTML object."""
    from IPython.display import HTML
    pieces = [frame.to_html() for frame in df_list]
    return HTML(delimiter.join(pieces))
# NOTE(review): byte-identical duplicate of the col_matches defined earlier
# in this file; this redefinition shadows the first. Consider deleting one.
def col_matches(df, regex):
    # Return the labels of the df columns whose names match the regex.
    import re
    cols = list(enumerate(df.columns))
    matches = [c for (i, c) in cols
               if re.findall(regex, c)]
    return matches
def concat_matches(df, *args):
    """Return a DataFrame of all df columns matching any of the given regexes.

    Returns None when no regexes are given, the single matching sub-frame
    when one is, and the column-wise concat otherwise.
    """
    # guard against an accidental empty regex, which would match everything
    assert all(len(r) for r in args)
    # (unused `import re` removed -- the matching happens in col_matches)
    col_match_lists = [col_matches(df, regex) for regex in args]
    col_set = [df[matches] for matches in col_match_lists]
    if not col_set:
        return None
    if len(col_set) == 1:
        return col_set[0]
    return pd.concat(col_set, axis=1)
def show_frames(frame_list, delimiter=''):
    """Render several DataFrames as a single IPython HTML object.

    When delimiter has exactly one element per frame, each element is used
    as a tag printed above its frame; otherwise delimiter is used as a
    plain separator string between the rendered tables.

    FIX(review): the per-item template was a bare multi-line single-quoted
    string (a SyntaxError as exported); reconstructed from its literal
    newlines as tag-then-table. Confirm any HTML wrapper that the original
    notebook had around the two placeholders.
    """
    from IPython.display import HTML
    if len(frame_list) == len(delimiter):
        item_template = '\n{}\n{}\n'
        html_out = ""
        for frame, tag in zip(frame_list, delimiter):
            html_out += item_template.format(tag, frame.to_html())
        return HTML(html_out)
    else:
        html_out = [df.to_html() for df in frame_list]
        return HTML(delimiter.join(html_out))
def hist_all(df, *args, **kwargs):
    """Print each column name and show its histogram, one figure per column.

    Asks for confirmation first when there are more than 30 columns
    (raw_input: this module is Python 2).
    """
    numcols = len(df.columns)
    if numcols > 30:
        yn = raw_input(str(numcols) + " columns. Proceed?")
        if 'n' in yn:
            return None
    for c in df:
        print(c)
        # FIX: narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; only real plotting failures are skipped.
        try:
            plt.hist(df[c])
            plt.show()
        except Exception:
            print("\t(can't histogram this)\n")
def scatter_all(df, print_max=None, *args, **kwargs):
    """Scatter-plot every pair of columns in df.

    When print_max is truthy, also print the first print_max rows of each
    pair sorted by x and then by y (to inspect the extreme cases).
    Asks for confirmation above 6 columns (raw_input: Python 2).
    """
    from itertools import combinations
    numcols = len(df.columns)
    if numcols > 6:
        yn = raw_input(str(numcols) + " columns. Proceed?")
        if 'n' in yn:
            return None
    for c in combinations(df.columns, 2):
        print(c)
        dfc = pd.concat([df[c[0]], df[c[1]]], axis=1)
        # FIX: narrowed from a bare `except:`
        try:
            dfc.plot(kind='scatter', x=0, y=1)
            plt.show()
        except Exception:
            print("can't plot")
        if print_max:
            # FIX: sorts hoisted under the print_max branch -- they were
            # computed for every pair even when never printed.
            # (df.sort(columns=...) is the pre-0.17 pandas API used file-wide.)
            print(dfc.sort(columns=dfc.columns[0], inplace=False).head(print_max))
            print(dfc.sort(columns=dfc.columns[1], inplace=False).head(print_max))
# ## Importing data, exporting partial info to CSV for SPSS
# In[106]:
# Load the flattened per-participant DataFrame pickled by an upstream notebook.
pfilenames = "c:/db_pickles/pickle - dfo-{measure} - {updated}.pickle"
full_updated = '2014-10-13a'
#pfile = pfilenames.format(measure='full', updated=full_updated)
pfile = pfilenames.format(measure='flat', updated=full_updated)
print(pfile)
# NOTE(review): text-mode open works with cPickle on Python 2 only;
# Python 3 would need open(pfile, 'rb').
with open(pfile) as f:
    dfo = pickle.load(f)
#for quick searches later
match = lambda x: concat_matches(dfo, x)
# 77777 is this project's missing-data sentinel (both numeric and string forms)
dfo = dfo.replace(77777, np.nan)
dfo = dfo.replace('77777', np.nan)
#task_pids = {k: sorted(set(v.index.get_level_values('pid')))
#             for (k, v) in task_frames.items()}
# drop the globally excluded participants (pilot / language / handedness / pro skill)
to_drop = set(dfo.index).intersection(excluded_all_tasks)
dfo = dfo.drop(to_drop)
# In[107]:
dfo.count()
# In[108]:
match('order').T
# In[109]:
pasted_scales = '''
# the only 'order' variable needed when just looking at ISIP tasks
SCAL_order_500ms_first
SCAL_sex_femalezero
SCAL_orders_iso
SCAL_orders_phase
SCAL_orders_linear
SCAL_calc_wasivocab_tscore
SCAL_calc_wasimatrix_tscore
SCAL_calc_wasi_tscore_total
SCAL_calc_fsiq2
SCAL_calc_bfi_extraversion
SCAL_calc_bfi_agreeableness
SCAL_calc_bfi_conscientiousness
SCAL_calc_bfi_neuroticism
SCAL_calc_bfi_openness
# compare with usefulness of constructed index
SCAL_qmusic_dancelevel
SCAL_qmusic_instrumentlevel
SCAL_qmusic_drumlevel
SCAL_qmusic_behaviors_12_friendstaste # comment
SCAL_qmusic_behaviors_13_sharingint
SCAL_qmusic_behaviors_14_getinterest
'''
pasted_isip = '''
#from: list(match('local$|drift$').columns)
I5P4_lagdev_local
I8P4_lagdev_local
I8P4_lagdev_drift
I5P4_lagdev_drift
I8L2_lag2devsq_local
I5L2_lag2devsq_local
I8L2_lag2devsq_drift
I5L2_lag2devsq_drift
#needed for filtering out a P that didn't do many taps
I8P4_ints_count
I5P4_ints_count
I8L2_ints_count
I5L2_ints_count
'''
pasted_sms = '''
'''
def clean_pasted_vars(pstring):
    """Parse a pasted block of variable names into a clean list of names.

    Drops '#' comments (whole-line or trailing), strips whitespace, and
    removes blank lines.

    FIX: returns a real list. The original returned filter(), which on
    Python 3 is a lazy iterator and breaks the dfo[...] column indexing
    done with the result; on Python 2 filter() already returned a list,
    so this is backward compatible.
    """
    cleaned = (line.split('#')[0].strip() for line in pstring.split('\n'))
    return [name for name in cleaned if name]
# Select the pasted scale and ISIP variables, then shorten the ISIP names.
df_scales = dfo[clean_pasted_vars(pasted_scales)]
df_isip = dfo[clean_pasted_vars(pasted_isip)]
df_isip = df_isip.rename(columns=lambda x: x.replace('lagdev_',""))
df_isip = df_isip.rename(columns=lambda x: x.replace('lag2devsq_',""))
# Express deviations as a percentage of the target inter-stimulus interval.
for c in ['I8P4_drift', 'I8P4_local']:
    ISI = 800  # ms (the I8* tasks)
    df_isip[c + 'perc'] = df_isip[c] * 100. / ISI
for c in ['I5P4_drift', 'I5P4_local']:
    ISI = 500  # ms (the I5* tasks)
    df_isip[c + 'perc'] = df_isip[c] * 100. / ISI
# In[110]:
df_isip.T
# In[111]:
# (missing values propagate in pandas arithmetic operations)
total_hours = (dfo.SCAL_qmusic_drumhours +
dfo.SCAL_qmusic_instrumenthours +
dfo.SCAL_qmusic_dancehours)
any_hours = (total_hours > 0).astype(int)
#skipna = False: if any missing values, produce a missing-value result
max_skill_level = pd.concat([dfo.SCAL_qmusic_dancelevel,
dfo.SCAL_qmusic_instrumentlevel,
dfo.SCAL_qmusic_drumlevel], axis=1).T.max(skipna=False)
sum_skill_level = pd.concat([dfo.SCAL_qmusic_dancelevel,
dfo.SCAL_qmusic_instrumentlevel,
dfo.SCAL_qmusic_drumlevel], axis=1).T.sum(skipna=False)
social_importance = pd.concat([dfo.SCAL_qmusic_behaviors_12_friendstaste,
dfo.SCAL_qmusic_behaviors_13_sharingint,
dfo.SCAL_qmusic_behaviors_14_getinterest,], axis=1).T.sum(skipna=False)
# (there are no missing values for these three vars)
df_constructed = pd.concat(axis=1,
objs=[any_hours,
max_skill_level,
sum_skill_level,
social_importance],
keys=['qmusic_calc_anyhours',
'qmusic_calc_maxskill',
'qmusic_calc_sumskill',
'qmusic_calc_socialimp'])
# In[112]:
df_constructed[df_constructed.qmusic_calc_maxskill.isnull()==True]
# In[113]:
def truncate(s):
    """Truncate outliers of Series s to mean +/- 2.97 SD, reporting each one.

    For 'DPsd' summary columns, downward truncation is unexpected (an SD
    summary should stay non-negative), so it warns on that path and the
    final result is asserted to be >= 0.

    FIX: Python-2-only `print` statements rewritten as parenthesized
    single-argument calls -- identical output on Python 2, valid on Python 3.
    """
    z_limit = 2.97
    maxval = s.mean() + z_limit * s.std()
    minval = s.mean() - z_limit * s.std()
    print("\n" + s.name)
    print("limits: {}, {}".format(maxval, minval))
    assert minval < s.mean() < maxval

    def truncval(val):
        # clamp one value into [minval, maxval], reporting when it happens
        tstr = "truncated {} to {}."
        if val > maxval:
            print(tstr.format(val, maxval))
            return maxval
        elif val < minval:
            print(tstr.format(val, minval))
            if 'DPsd' in s.name:
                print("WARNING: summary data should not have to be truncated in this direction.")
            return minval
        else:
            return val

    out = s.apply(truncval)
    if 'DPsd' in s.name:
        #print('checking...')
        assert out.min() >= 0
    return out
def test_trunc(s):
    """Visual check: histogram Series s before and after truncation.

    FIX: Python-2-only `print` statements rewritten as parenthesized
    single-argument calls (same output on Python 2, valid on Python 3).
    """
    print("Original")
    s.hist()
    plt.show()
    print("Truncated")
    truncate(s).hist()
    plt.show()
test_trunc(df_isip.I5P4_drift)
# In[114]:
drifts = concat_matches(df_isip, 'P4_drift$|local$').apply(truncate)
drifts.head(3)
# In[115]:
drifts.plot(kind='scatter', x=0,y=1)
#Interesting issue with p. 55 (the outlier on IP54_drift).
# It appears legitimate: in general the local variation is very
# small-- but there's a lot of variability, because the subject
# drifted way down to around 400ms, then jumped up to around 550
# immediately-- so there were only a couple of intervals where
# there was a big change from one interval to the next.
# especially if smoothing across four intervals.....
# In[116]:
df_isip_out = pd.DataFrame(index = df_isip.index)
for c in df_isip.columns:
if 'ints_count' in c:
df_isip_out[c] = df_isip[c]
else:
df_isip_out[c + '_trunc'] = truncate(df_isip[c])
#del df_isip[c]
# In[117]:
df_isip_out.T
# In[120]:
#list(match('DPm$|DPsd$'))
match('DP').T
# In[121]:
df_sms = match('DP')
df_sms_out = pd.DataFrame(index = df_sms.index)
for c in df_sms.columns:
trimname = 's_' + c[5:]
if ("DPct" in c) or ("DPm" in c):
df_sms_out[trimname] = df_sms[c]
else:
df_sms_out[trimname + '_trunc'] = truncate(df_sms[c])
df_sms_out.T
# In[122]:
df_nonzero_transformed = match('nonzero')
df_log_transformed = match('ln1p')
isip_using = ['I5P4_local_trunc',
'I8P4_local_trunc',
'I5P4_drift_trunc',
'I8P4_drift_trunc',]
df_log_isips = np.log(df_isip_out[isip_using])
df_log_isips.columns = [c + "_log" for c in df_log_isips.columns]
to_log = [c for c in df_sms_out if "DPsd" in c]
df_log_sms = np.log(df_sms_out[to_log])
df_log_sms.columns = [c + "_log" for c in df_log_sms.columns]
df_log_sms
# In[123]:
df_to_analyze = pd.concat(axis=1,
objs=[df_scales,
df_constructed,
df_log_transformed,
df_nonzero_transformed,
df_isip_out,
df_log_isips,
df_sms_out,
df_log_sms,
])
# In[124]:
#concat_matches(df_to_analyze, 'I?P4_local_trunc|I?P4_drift_trunc').T
concat_matches(df_to_analyze, 'log$').T
# In[125]:
# TO DO:
# Calculate z scores for each DPsd
# Calculate the mean of the two z scores for each 500/800 pairing
# See if the value of this still correlates with the 500-first/800-first order variable
remove_unused = [c for c in df_to_analyze.columns
if ( '_psk_' in c
or 's_lint_' in c
or 's_linj_' in c)]
for c in remove_unused:
del df_to_analyze[c]
to_combine = concat_matches(df_to_analyze, 'DPsd_trunc')
#for p in list(to_combine.columns):
#print(p)
for c in ['I5P4_local_trunc',
'I8P4_local_trunc',
'I5P4_drift_trunc',
'I8P4_drift_trunc',]:
to_combine[c] = df_to_analyze[c]
z_to_combine = (to_combine.mean() - to_combine) / to_combine.std()
#proper column-wise z score output was confirmed
df_to_analyze['IP4_local_trunc_mz58'] = ( z_to_combine['I5P4_local_trunc']
+ z_to_combine['I8P4_local_trunc']) / 2
df_to_analyze['IP4_drift_trunc_mz58'] = ( z_to_combine['I5P4_drift_trunc']
+ z_to_combine['I8P4_drift_trunc']) / 2
df_to_analyze['iso_j_DPsd_trunc_mz58'] = ( z_to_combine['s_iso5j_DPsd_trunc']
+ z_to_combine['s_iso8j_DPsd_trunc']) / 2
df_to_analyze['iso_t1_DPsd_trunc_mz58'] = ( z_to_combine['s_iso5t1_DPsd_trunc']
+ z_to_combine['s_iso8t1_DPsd_trunc']) / 2
df_to_analyze['iso_t2_DPsd_trunc_mz58'] = ( z_to_combine['s_iso5t2_DPsd_trunc']
+ z_to_combine['s_iso8t2_DPsd_trunc']) / 2
df_to_analyze['lin_j_DPsd_trunc_mz58'] = ( z_to_combine['s_lin5j_DPsd_trunc']
+ z_to_combine['s_lin8j_DPsd_trunc']) / 2
df_to_analyze['lin_t_DPsd_trunc_mz58'] = ( z_to_combine['s_lin5t_DPsd_trunc']
+ z_to_combine['s_lin8t_DPsd_trunc']) / 2
df_to_analyze['phase_j_nrm_DPsd_trunc_mz58'] = ( z_to_combine['s_phase5j_nrm_DPsd_trunc']
+ z_to_combine['s_phase8j_nrm_DPsd_trunc']) / 2
df_to_analyze['phase_j_psr_DPsd_trunc_mz58'] = ( z_to_combine['s_phase5j_psr_DPsd_trunc']
+ z_to_combine['s_phase8j_psr_DPsd_trunc']) / 2
df_to_analyze['phase_t_nrm_DPsd_trunc_mz58'] = ( z_to_combine['s_phase5t_nrm_DPsd_trunc']
+ z_to_combine['s_phase8t_nrm_DPsd_trunc']) / 2
df_to_analyze['phase_t_psr_DPsd_trunc_mz58'] = ( z_to_combine['s_phase5t_psr_DPsd_trunc']
+ z_to_combine['s_phase8t_psr_DPsd_trunc']) / 2
#null values propagate to new measure (confirmed)
#df_to_analyze['IP4_drift_trunc_mz58'][20:]
# In[126]:
update = {'measure': 'subset_to_spss',
'updated': '2014-10-15c'}
pfilenames = "c:/db_pickles/pickle - dfo-{measure} - {updated}.{ext}"
output_file_csv = pfilenames.format(measure=update['measure'],
updated=update['updated'],
ext="csv")
output_file_pickle = pfilenames.format(measure=update['measure'],
updated=update['updated'],
ext="pickle")
df_to_analyze.to_pickle(output_file_pickle)
dfo_to_analyze_missing_coded = df_to_analyze.replace(np.nan, '77777')
dfo_to_analyze_missing_coded.to_csv(output_file_csv)
print("\nSAVED: {}\n".format(output_file_csv))
df_to_analyze.T
# In[22]:
df_to_analyze.count().to_csv('non-null counts 2014-10-15b.csv')
# In[95]:
dfa = df_to_analyze
for p in concat_matches(df_to_analyze, '_log'): print p
# In[103]:
paste_1 = ('''
s_iso5t2_DPsd_trunc_log
s_iso8t2_DPsd_trunc_log
s_iso5j_DPsd_trunc_log
s_iso8j_DPsd_trunc_log
s_phase8j_psr_DPsd_trunc_log
s_phase8t_psr_DPsd_trunc_log
s_phase5j_psr_DPsd_trunc_log
s_phase5t_psr_DPsd_trunc_log
''')
design_1 = clean_pasted_vars(paste_1)
scatter_all(dfa[design_1])
# In[98]:
design_2 = clean_pasted_vars('''
s_lin5t_DPsd_trunc_log
s_lin8t_DPsd_trunc_log
s_lin5j_DPsd_trunc_log
s_lin8j_DPsd_trunc_log
''')
scatter_all(dfa[design_2])
# In[99]:
design_3 = clean_pasted_vars('''
I5P4_local_trunc_log
I8P4_local_trunc_log
I5P4_drift_trunc_log
I8P4_drift_trunc_log
''')
scatter_all(dfa[design_3])
# In[101]:
match('drumlevel').sort(columns='SCAL_qmusic_drumlevel').tail(20)
# In[102]:
match('instrumentlevel').sort(columns='SCAL_qmusic_instrumentlevel').median()
# In[207]:
def stack_rm_case(case_series):
    """Reshape one participant's repeated-measures row into long format.

    case_series: a Series whose .name is the participant id and whose index
    holds the 12 DPsd varnames. Returns a 12-row frame with columns
    original_varname, caseid, casedata, tasktype, targetioi, targetstim.

    FIXES: the casedata column referenced an undefined global `acase`
    instead of the `case_series` parameter (NameError outside the original
    notebook session); the tasktype dict assignments were duplicated
    verbatim; the unused total_stacked_vars constant is gone.
    """
    caseid = case_series.name
    # task family: 1 = isochronous, 2 = phase (psr), 3 = phase (nrm)
    tasktype = {
        's_iso5t2_DPsd_trunc_log': 1,
        's_iso8t2_DPsd_trunc_log': 1,
        's_iso5j_DPsd_trunc_log': 1,
        's_iso8j_DPsd_trunc_log': 1,
        's_phase8j_psr_DPsd_trunc_log': 2,
        's_phase8t_psr_DPsd_trunc_log': 2,
        's_phase5j_psr_DPsd_trunc_log': 2,
        's_phase5t_psr_DPsd_trunc_log': 2,
        's_phase5t_nrm_DPsd_trunc_log': 3,
        's_phase8t_nrm_DPsd_trunc_log': 3,
        's_phase5j_nrm_DPsd_trunc_log': 3,
        's_phase8j_nrm_DPsd_trunc_log': 3,
    }
    # target inter-onset interval: 0 = 500 ms tasks, 1 = 800 ms tasks
    targetioi = {
        's_iso5t2_DPsd_trunc_log': 0,
        's_iso8t2_DPsd_trunc_log': 1,
        's_iso5j_DPsd_trunc_log': 0,
        's_iso8j_DPsd_trunc_log': 1,
        's_phase8j_psr_DPsd_trunc_log': 1,
        's_phase8t_psr_DPsd_trunc_log': 1,
        's_phase5j_psr_DPsd_trunc_log': 0,
        's_phase5t_psr_DPsd_trunc_log': 0,
        's_phase5t_nrm_DPsd_trunc_log': 0,
        's_phase8t_nrm_DPsd_trunc_log': 1,
        's_phase5j_nrm_DPsd_trunc_log': 0,
        's_phase8j_nrm_DPsd_trunc_log': 1,
    }
    # target stimulus code: follows the t/j suffix of the varname
    # (presumably 0 = tone, 1 = jitter -- TODO confirm the coding)
    targetstim = {
        's_iso5t2_DPsd_trunc_log': 0,
        's_iso8t2_DPsd_trunc_log': 0,
        's_iso5j_DPsd_trunc_log': 1,
        's_iso8j_DPsd_trunc_log': 1,
        's_phase8j_psr_DPsd_trunc_log': 1,
        's_phase8t_psr_DPsd_trunc_log': 0,
        's_phase5j_psr_DPsd_trunc_log': 1,
        's_phase5t_psr_DPsd_trunc_log': 0,
        's_phase5t_nrm_DPsd_trunc_log': 0,
        's_phase8t_nrm_DPsd_trunc_log': 0,
        's_phase5j_nrm_DPsd_trunc_log': 1,
        's_phase8j_nrm_DPsd_trunc_log': 1,
    }
    caseid_repeated = {k: caseid for k in tasktype}
    stackedvars = pd.DataFrame({'caseid': caseid_repeated,
                                'casedata': case_series,
                                'tasktype': tasktype,
                                'targetioi': targetioi,
                                'targetstim': targetstim,
                                })
    stackedvars.index.name = 'original_varname'
    case_out = stackedvars.reset_index('original_varname')
    return case_out
repmeas = concat_matches(df_to_analyze, 'psr.*log|nrm.*log|iso.t2.*log|iso.j.*log')
cases = [stack_rm_case(repmeas.loc[p]) for p in repmeas.index]
stacked = pd.concat(cases, axis=0)
stacked.index = range(len(stacked))
stacked.index.name = "st_row"
stacked = stacked.reset_index('st_row')
stacked = stacked.set_index('caseid')
# In[208]:
df_to_analyze['SCAL_calc_fsiq2']
# In[219]:
df_to_analyze.loc['015', staticvar]
# In[214]:
staticvar = 'SCAL_calc_fsiq2'
ids = sorted(set(stacked.index))
for caseid in ids:
print(caseid)
stacked[staticvar] = np.nan
stacked.loc[caseid, staticvar] = df_to_analyze.loc[caseid, staticvar]
#slc = stacked.loc[stacked.caseid=='015']
#slc.somevarname = 'the_value'
#stacked.to_csv('stacked_test.csv')
stacked
# In[171]:
df_to_analyze.loc[caseid, staticvar]
# In[313]:
print("NULL VALUES (INCLUDING REMOVED FOR INCOMPLETE TAP SETS):\n\n")
for c in df_to_analyze:
print(c)
s = df_to_analyze[c]
print(list(s[s.isnull()].index))
print('')
# # After exporting CSV: looking at distributions here...
# ## Descriptives (into manuscript)
# In[209]:
dfa = df_to_analyze
get = lambda r: (list(concat_matches(dfo, r).columns), concat_matches(dfo, r))
geta = lambda r: (list(concat_matches(df_to_analyze, r).columns), concat_matches(df_to_analyze, r))
firstcol = lambda df: df.T.iloc[0]
firstcol(match('participant_age')).describe()
# In[157]:
sex = firstcol(match('sex_femalezero'))
is_female = (sex==0)
is_male = (sex==1)
assert is_female[is_female==True].count() == 60
assert is_female[is_female==False].count() == 39
assert is_male[is_male==True].count() == 39
assert is_male[is_male==False].count() == 60
# In[159]:
var1 = firstcol(match('participant_age'))
print (" females")
print firstcol(match('participant_age'))[is_female].describe()
print
print (" males")
print firstcol(match('participant_age'))[is_male].describe()
# In[171]:
names, df = get('white')
print 'female'
print df[is_female].sum()
print df[is_female].count()
print
print 'male'
print df[is_male].sum()
print df[is_male].count()
# In[133]:
match('participant_age').columns
#dfo['SCAL_participant_age'].name
# In[217]:
names, df = get('I?P4_ints_count')
df.describe()
# In[228]:
names, df = geta('I?P4_drift_trunc$')
df.describe()
# In[261]:
names, df = geta('s_.*DPsd_trunc$')
dtable = df.describe().T[:14]
reformat = np.round(dtable[['mean', 'std', 'count']], 4)
reformat
# In[264]:
names, df = geta('nrm_DPsd_trunc$|psr_DPsd_trunc$')
dtable = df.describe().T #[14:-6]
reformat = dtable[['mean', 'std', 'count']]
reformat
# In[269]:
names, df = get('DPsd')
dtable = df.describe().T[:14]
reformat = np.round(dtable[['mean', 'std', 'count']], 4)
reformat
# In[203]:
names, df = get('I5P4_ints_count')
df.std()
# ## var1
# In[76]:
dfa = df_to_analyze
matcha = lambda x: concat_matches(dfa, x)
isips = matcha('P4_drift_trunc|P4_local_trunc')
isips
# In[82]:
dfa = df_to_analyze
matcha = lambda x: concat_matches(dfa, x)
#isips = matcha('P4_drift_trunc|P4_local_trunc')
smscols = matcha('^s_.*DPsd_trunc$')
#scatter_all(isips, print_max=3)
#scatter_all(np.log(isips), print_max=3)
smscols.T
# In[47]:
dft1 = df_to_analyze['s_phase5t_s4a_DPm_trunc']
dft2 = df_to_analyze['s_phase8j_s4a_DPm_trunc']
#dft2.corr(dft1)
dft1.corr(dft2)
# In[64]:
#mna = match('5._DPm|8._DPm|5.2_DPm|8.2_DPm')
#mna.to_csv('perc_negative_asynchrony_20141008.csv')
# In[18]:
phase_sections_means = match('a_DPm|b_DPm')
phase_sections_sd = match('a_DPsd|b_DPsd')
match('nonzero').T
# In[4]:
#for c in range(35):
# s = phase_sections_sd.ix[:,c]
# m = phase_sections_means.ix[:,c]
# print phase_sections_sd.columns[c]
# print phase_sections_means.columns[c]
# print s.corr(m)
# In[4]:
#matchq('behaviors_')
pasted = '''
SCAL_sex_femalezero
SCAL_calc_wasivocab_totalrawscore
SCAL_calc_wasimatrix_totalscore
SCAL_calc_wasivocab_tscore
SCAL_calc_wasimatrix_tscore
SCAL_calc_wasi_tscore_total
SCAL_calc_fsiq2
SCAL_calc_bfi_extraversion
SCAL_calc_bfi_agreeableness
SCAL_calc_bfi_conscientiousness
SCAL_calc_bfi_neuroticism
SCAL_calc_bfi_openness
SCAL_session_taskorder
SCAL_order_500ms_first
SCAL_order_rhythmfirst
SCAL_qbasic_hearingdeficityn
SCAL_qbasic_injuriesyn
SCAL_qbasic_exerciseyn
SCAL_qbasic_neurodisorderyn
SCAL_qmusic_singingyn
SCAL_qmusic_singinghours --> NONZERO
SCAL_qmusic_singingtimes --> NONZERO
SCAL_qmusic_dancelevel --> LN1P
SCAL_qmusic_instrumentlevel --> good
SCAL_qmusic_dancehours --> NONZERO
SCAL_qmusic_instrumenthours --> NONZERO
SCAL_qmusic_danceyn
SCAL_qmusic_instrumentyn
SCAL_qmusic_gameyn
SCAL_qmusic_drumsyn
SCAL_qmusic_gamenames --> string
SCAL_qmusic_gamehoursall --> NONZERO
SCAL_qmusic_gamehoursdrumsticks --> NONZERO
SCAL_qmusic_drumstyles --> string
SCAL_qmusic_drumhours --> NONZERO
SCAL_qmusic_drumlevel --> NONZERO
SCAL_qmusic_behaviors_07_yourself --> LN1P
SCAL_qmusic_behaviors_08_otherprs --> LN1P
SCAL_qmusic_behaviors_09_danceprv --> LN1P
SCAL_qmusic_behaviors_10_dancepub --> NONZERO
SCAL_qmusic_behaviors_11_urgemove --> NONZERO
SCAL_qmusic_behaviors_12_friendstaste --> good
SCAL_qmusic_behaviors_13_sharingint --> good
SCAL_qmusic_behaviors_14_getinterest --> good
'''
tolist = pasted.split('\n')
nonzero = filter(lambda i: i.split(" ")[-1] == "NONZERO", tolist)
nonzero = [i.split(" ")[0] for i in nonzero]
assert len(nonzero) == pasted.count('NONZERO')
LN1P = filter(lambda i: i.split(" ")[-1] == "LN1P", tolist)
LN1P = [i.split(" ")[0] for i in LN1P]
assert len(LN1P) == pasted.count('LN1P')
tolist = [i.replace("--> good", "") for i in tolist]
tolist = filter(lambda i: "-->" not in i, tolist)
tolist = [i.strip() for i in tolist]
tolist = filter(lambda i: i != "", tolist)
LN1P
# In[5]:
match = lambda x: concat_matches(dfo, x)
df_q = match('SCAL_qbasic|SCAL_qmusic')
matchq = lambda x: concat_matches(df_q, x)
rnot = lambda r: '^((?!' + r + ').)*$'
#scales = concat_matches(scales, '^((?!notes).)*$') #hacky "does not contain 'notes' matcher
# In[6]:
scales_keep = dfo[['SCAL_qmusic_instrumentlevel',
'SCAL_qmusic_behaviors_12_friendstaste',
]]
plist = lambda l: '\n'.join(l)
print plist(match('SCAL_').columns)
#print('\n'.join(list(match('SCAL_').columns)))
# In[7]:
dfo['SCAL_orders_psh_first'] = (dfo.SCAL_orders_phase==0).astype(int)
dfo['SCAL_orders_lin_first'] = (dfo.SCAL_orders_linear==0).astype(int)
dfo['SCAL_orders_iso_first'] = (dfo.SCAL_orders_iso==0).astype(int)
match('orders').head(4).T
# In[8]:
dff = dfo[tolist]
dff.T
# In[9]:
match('order').T
# In[ ]:
hrs = dfo.SCAL_qmusic_danceyn
hrs[hrs > 0].count()
total = dfo.SCAL_qmusic_drumhours + dfo.SCAL_qmusic_instrumenthours + dfo.SCAL_qmusic_dancehours
def filter_outliers(series):
    """Drop values whose magnitude exceeds 3.29 SD (Tabachnik & Fidell's
    removable-outlier criterion).

    NOTE: compares abs(value) -- not the deviation from the mean -- against
    3.29 * std, matching the original behavior.
    """
    within_limit = np.abs(series) <= 3.29 * series.std()
    return series[within_limit]
trunc_count = 0
def truncate_outliers(series):
    """Clamp values beyond mean +/- 3.29 SD (Tabachnik & Fidell) and report.

    FIX: the original incremented a plain local `trunc_count` inside the
    nested trunc(), which raises UnboundLocalError the moment any value is
    actually truncated. Counted through a one-element list instead (works
    on both Python 2 and Python 3, unlike `nonlocal`).
    """
    maxval = series.mean() + 3.29 * series.std()
    minval = series.mean() - 3.29 * series.std()
    n_truncated = [0]  # mutable cell so trunc() can update the count

    def trunc(val):
        if val > maxval:
            n_truncated[0] += 1
            return maxval
        elif val < minval:
            n_truncated[0] += 1
            return minval
        else:
            return val

    s = series.apply(trunc)
    print('truncated {} of {} cases.'.format(n_truncated[0], len(s)))
    return s
truncate_outliers(total).hist()
# In[11]:
dfo
# In[62]:
#dfo.scales.bfi_item39.hist()
dfo.sms.phase5t_DPsd.apply(lambda x: 1/x).hist()
# ## SPSS SYNTAX GENERATION
# In[183]:
def variable_labels_syntax(varlist):
    """Build an SPSS VARIABLE LABELS command from (name, label) pairs."""
    entries = [" {var} '{label}'\n".format(var=name, label=label)
               for name, label in varlist]
    return "VARIABLE LABELS \n{vlist}.".format(vlist='\n'.join(entries))
#testing
print variable_labels_syntax(varlist = [("fff", "sssss")])
# In[165]:
# BFI (Big Five Inventory) scoring key: item numbers per factor;
# a trailing 'R' marks a reverse-scored item.
bfi={}
bfi['E'] = ['1', '6R', '11', '16', '21R', '26', '31R', '36']
bfi['A'] = ['2R', '7', '12R', '17', '22', '27R', '32', '37R', '42']
bfi['C'] = ['3', '8R', '13', '18R', '23R', '28', '33', '38', '43R']
bfi['N'] = ['4', '9R', '14', '19', '24R', '29', '34R', '39']
bfi['O'] = ['5', '10', '15', '20', '25', '30', '35R', '40', '41R', '44']
# Invert into {item_number: {'factor': ..., 'reverse_scored': ...}}.
bfi_score = {}
for k, v in bfi.items():
    for i in v:
        reverse_scored = 'R' in i
        if reverse_scored:
            i = i[:-1]  # strip the trailing 'R' before int conversion
        item = int(i)
        bfi_score[item] = {'factor': k,
                           'reverse_scored': reverse_scored}
bfi_score
# In[131]:
print('ALTER TYPE')
print(' (F8.2)\n'.join(others) + ' (F8.2)')
print('.')
# Oops - these aren't the values in the dfo_flat output. Need to do this there instead,
# or import from the CSV I made there.
# In[190]:
# Build (name, label) pairs for the BFI items (input for SPSS variable labels).
# FIX(review): the original loop body ended with a bare `return`, a
# SyntaxError at module level that also discarded the computed name/label;
# collecting the pairs into varlist appears to be the intent -- confirm.
varlist = []
for k, v in bfi_score.items():
    name = "SCAL_bfi_item" + str(k)
    factor = v['factor']
    label = "BFI item {n} ({f})".format(n=k, f=factor)
    varlist.append((name, label))
#bfi_vars