#!/usr/bin/env python # coding: utf-8 # In[1]: import os from genomics.popgen.plink.convert import to_eigen from genomics.popgen.pca import plot, smart get_ipython().run_line_magic('matplotlib', 'inline') # ## Meta-data load # In[2]: f = open('relationships_w_pops_121708.txt') ind_pop = {} f.readline() # header for l in f: toks = l.rstrip().split('\t') fam_id = toks[0] ind_id = toks[1] pop = toks[-1] ind_pop['/'.join([fam_id, ind_id])] = pop f.close() ind_pop['2469/NA20281'] = ind_pop['2805/NA20281'] # ## Requires plink from data preparation # In[3]: to_eigen('hapmap10_auto_noofs_ld_12', 'hapmap10_auto_noofs_ld_12') # ## Running smartpca # In[4]: ctrl = smart.SmartPCAController('hapmap10_auto_noofs_ld_12') ctrl.run() # In[5]: wei, wei_perc, ind_comp = smart.parse_evec('hapmap10_auto_noofs_ld_12.evec', 'hapmap10_auto_noofs_ld_12.eval') # In[6]: plot.render_pca(ind_comp, 1, 2, cluster=ind_pop) #put weights # In[7]: plot.render_pca_eight(ind_comp, cluster=ind_pop) # In[8]: markers = { 'CHB': '*', 'CHD': '*', 'JPT': '*', 'GIH': '*', 'CEU': 'v', 'TSI': 'v', 'MEX': 'v', 'ASW': 'o', 'LWK': 'o', 'YRI': 'o', 'MKK': 'o' } # In[9]: #See also, the hapmap paper for the PCA #Paper on smartpca # ## With scikit-learn # In[10]: from sklearn.decomposition import PCA import numpy as np # In[11]: f = open('hapmap10_auto_noofs_ld_12.ped') ninds = 0 ind_order = [] for line in f: ninds += 1 toks = line[:100].replace(' ', '\t').split('\t') # for speed fam_id = toks[0] ind_id = toks[1] ind_order.append('%s/%s' % (fam_id, ind_id)) nsnps = (len(line.replace(' ', '\t').split('\t')) - 6) // 2 print (nsnps) f.close() # In[12]: pca_array = np.empty((ninds, nsnps), dtype=int) print pca_array.shape f = open('hapmap10_auto_noofs_ld_12.ped') for ind, line in enumerate(f): snps = line.replace(' ', '\t').split('\t')[6:] for pos in range(len(snps) // 2): a1 = int(snps[2 * pos]) a2 = int(snps[2 * pos]) my_code = a1 + a2 - 2 pca_array[ind, pos] = my_code f.close() #slow # In[13]: my_pca = PCA(n_components=8) my_pca.fit(pca_array) trans = my_pca.transform(pca_array) #Memory required # In[14]: sc_ind_comp = {} for i, ind_pca in enumerate(trans): sc_ind_comp[ind_order[i]] = ind_pca plot.render_pca_eight(sc_ind_comp, cluster=ind_pop) # In[ ]: