import numpy as np
import os.path
import urllib
import gzip
import itertools
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
fig_size = [8, 8]
plt.rcParams["figure.figsize"] = fig_size
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/matplotlib/font_manager.py:279: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment. 'Matplotlib is building the font cache using fc-list. '
def fasta_reader(filename):
from Bio.SeqIO.FastaIO import FastaIterator
with open(filename) as handle:
for record in FastaIterator(handle):
yield record
for entry in fasta_reader("sequence.fasta"):
genome = str(entry.seq) #This is sequence of specific fasta entry
GENOMESIZE = len(genome)
letters=list(set(genome))
BLOCKSIZE=300
WORDSIZE=[1,2,3,4]
NBLOCKS=GENOMESIZE//BLOCKSIZE
def getFeatures(genome, NBLOCKS, WORDSIZE):
features={x : np.zeros((NBLOCKS, 4**x)) for x in WORDSIZE}
for ws in WORDSIZE:
lookUp = { y : x for x,y in enumerate([''.join(l) for l in itertools.product(*[letters]*ws)]) }
for b in range(NBLOCKS):
block = genome[b*BLOCKSIZE:(b+1)*BLOCKSIZE]
for i in range(BLOCKSIZE//ws):
word = block[i*ws:(i+1)*ws]
features[ws][b,lookUp[word]] += 1
return features
features = getFeatures(genome, NBLOCKS, WORDSIZE)
def standardize(features, WORDSIZE):
for ws in WORDSIZE:
std_scale = preprocessing.StandardScaler().fit(features[ws])
features[ws] = std_scale.transform(features[ws])
return features
features = standardize(features, WORDSIZE)
def runPCA(features, WORDSIZE, n_components=2):
featuresPCA={}
for ws in WORDSIZE:
pca = PCA(n_components=n_components).fit(features[ws])
featuresPCA[ws] = pca.transform(features[ws])
return featuresPCA
featuresPCA = runPCA(features, WORDSIZE)
fig, axes = plt.subplots(2,2)
axes[0,0].scatter(featuresPCA[1].T[0], featuresPCA[1].T[1], s=0.25)
axes[0,1].scatter(featuresPCA[2].T[0], featuresPCA[2].T[1], s=0.25)
axes[1,0].scatter(featuresPCA[3].T[0], featuresPCA[3].T[1], s=0.25)
axes[1,1].scatter(featuresPCA[4].T[0], featuresPCA[4].T[1], s=0.25)
<matplotlib.collections.PathCollection at 0x10fb23978>
kmeans = KMeans(n_clusters=7, random_state=0).fit(features[3])
fig, ax = plt.subplots(1,1)
ax.scatter(featuresPCA[3].T[0], featuresPCA[3].T[1], s=0.25, c=kmeans.labels_)
<matplotlib.collections.PathCollection at 0x10fca92e8>