#!/usr/bin/env python # coding: utf-8 # In[23]: import numpy as np import os.path import urllib import gzip import itertools from sklearn import preprocessing from sklearn.decomposition import PCA from sklearn.cluster import KMeans from sklearn.mixture import GaussianMixture get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib import matplotlib.pyplot as plt fig_size = [8, 8] plt.rcParams["figure.figsize"] = fig_size # In[6]: def fasta_reader(filename): from Bio.SeqIO.FastaIO import FastaIterator with open(filename) as handle: for record in FastaIterator(handle): yield record # In[13]: for entry in fasta_reader("sequence.fasta"): genome = str(entry.seq) #This is sequence of specific fasta entry # In[25]: GENOMESIZE = len(genome) letters=list(set(genome)) # In[15]: BLOCKSIZE=300 WORDSIZE=[1,2,3,4] NBLOCKS=GENOMESIZE//BLOCKSIZE # In[26]: def getFeatures(genome, NBLOCKS, WORDSIZE): features={x : np.zeros((NBLOCKS, 4**x)) for x in WORDSIZE} for ws in WORDSIZE: lookUp = { y : x for x,y in enumerate([''.join(l) for l in itertools.product(*[letters]*ws)]) } for b in range(NBLOCKS): block = genome[b*BLOCKSIZE:(b+1)*BLOCKSIZE] for i in range(BLOCKSIZE//ws): word = block[i*ws:(i+1)*ws] features[ws][b,lookUp[word]] += 1 return features features = getFeatures(genome, NBLOCKS, WORDSIZE) # In[27]: def standardize(features, WORDSIZE): for ws in WORDSIZE: std_scale = preprocessing.StandardScaler().fit(features[ws]) features[ws] = std_scale.transform(features[ws]) return features features = standardize(features, WORDSIZE) # In[28]: def runPCA(features, WORDSIZE, n_components=2): featuresPCA={} for ws in WORDSIZE: pca = PCA(n_components=n_components).fit(features[ws]) featuresPCA[ws] = pca.transform(features[ws]) return featuresPCA featuresPCA = runPCA(features, WORDSIZE) # In[29]: fig, axes = plt.subplots(2,2) axes[0,0].scatter(featuresPCA[1].T[0], featuresPCA[1].T[1], s=0.25) axes[0,1].scatter(featuresPCA[2].T[0], featuresPCA[2].T[1], s=0.25) axes[1,0].scatter(featuresPCA[3].T[0], featuresPCA[3].T[1], s=0.25) axes[1,1].scatter(featuresPCA[4].T[0], featuresPCA[4].T[1], s=0.25) # In[33]: kmeans = KMeans(n_clusters=7, random_state=0).fit(features[3]) # In[34]: fig, ax = plt.subplots(1,1) ax.scatter(featuresPCA[3].T[0], featuresPCA[3].T[1], s=0.25, c=kmeans.labels_)