Use CAP3 to cluster neoblast transcripts:
cap3 aboobaker.graveley.pearson.rajewsky.renamed.fa > cap3.out 2> cap3.err
import gzip, os, sys
# Put /home/damian/ on the module search path so the custom dk_ipython helper
# module (functions for displaying tables and running bash commands) can be
# imported below.
sys.path.append(os.path.abspath("/home/damian/"))
from dk_ipython import *
from IPython.display import HTML
# Render the markup returned by addToggle() in the notebook. addToggle is not
# defined in this file; presumably it comes from dk_ipython and adds a
# show/hide-code toggle -- TODO confirm.
HTML(addToggle())
from collections import defaultdict

# Count how many transcripts each lab contributed to the combined FASTA file
# and print a per-source summary.
prefix = '/home/share/projects/smed_neoblast/cap3/'

# Maps the first character of a FASTA record name (the character right after
# '>') to the name of the source it came from.
sourceKey = {'a':'Aboobaker','P':'Pearson','R':'Rajewsky','G':'Graveley'}

# source name -> number of FASTA header lines seen for that source
sources = defaultdict(int)

# The context manager guarantees the file handle is released as soon as the
# headers have been counted, even if a lookup below raises.
with open(prefix + 'aboobaker.graveley.pearson.rajewsky.renamed.fa') as faFile:
    for line in faFile:
        # Only header lines start a new record; sequence lines are ignored.
        if line[0] == '>':
            # Raises KeyError if the record name starts with a character
            # that is not a key of sourceKey.
            source = sourceKey[line[1]]
            sources[source] += 1

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print('Data sources used:')
for source, count in sources.items():
    print(' ' + source + ' - ' + commas(count) + ' transcripts')
Data sources used: Pearson - 2,147 transcripts Graveley - 4,056 transcripts Rajewsky - 7,696 transcripts Aboobaker - 823 transcripts
# Parse the contig summary section of the CAP3 output.
#
# Results (module-level names read by later cells):
#   clusters -- one tuple per contig holding the distinct source names of its
#               member transcripts, in sorted order
#   used     -- source name -> number of member transcripts found in contigs
clusters = []
block = []
used = defaultdict(int)


def _flush_contig(members):
    """Record one finished contig in `used` and `clusters`.

    `members` is the list of source names, one entry per member line of the
    contig.  Contigs with fewer than two members are ignored.

    NOTE(review): the indentation of the original cell was lost, so it is
    ambiguous whether single-member contigs were meant to be excluded from
    `clusters` as well as from `used`; they are excluded from both here --
    confirm.
    """
    if len(members) > 1:
        for member in members:
            used[member] += 1
        # Sorting the distinct names gives every contig with the same source
        # composition the same tuple, so they collapse to one Counter key.
        clusters.append(tuple(sorted(set(members))))


# The context manager closes cap3.out once the summary has been read.
with open('/home/share/projects/smed_neoblast/cap3/cap3.out') as inFile:
    # Discard everything up to and including the first contig header line
    # (a line containing a run of asterisks).
    for line in inFile:
        if '*******************' in line:
            break

    for line in inFile:
        # The first blank line ends the summary section.
        if line.strip() == '':
            break
        if '*******************' in line:
            # A new header closes the contig collected so far.
            _flush_contig(block)
            block = []
        else:
            # Member line: the first character of its first
            # whitespace-separated token identifies the source.
            name = sourceKey[line.strip().split()[0][0]]
            block.append(name)

    # The final contig is followed by a blank line (or end of file) rather
    # than another header, so it has to be recorded explicitly; otherwise it
    # would be silently dropped from the counts.
    _flush_contig(block)
    block = []
from collections import Counter

# Tabulate how many clusters exist for each combination of sources.

# List of (composition tuple, number of clusters) pairs ordered by decreasing
# count; most_common() with no argument returns every entry.
clusterCount = Counter(clusters).most_common()

comboTable = ListTable()
comboHeader = ['Pearson','Graveley','Rajewsky','Aboobaker']
comboTable.append(comboHeader + ['Number of clusters'])
for combo, count in clusterCount:
    # One column per source: 'X' if the composition includes it, '-' if not.
    row = ['X' if c in combo else '-' for c in comboHeader]
    row.append(count)
    comboTable.append(row)

# Number of clusters whose composition covers every source in comboHeader;
# computed from the data so the example sentence below cannot go stale.
allSourceCount = sum(
    count for combo, count in clusterCount
    if all(c in combo for c in comboHeader)
)

# Each print takes a single pre-joined string so the output is the same on
# Python 2 and Python 3.
print('Cluster compositions:')
# len(clusters) counts every recorded cluster, including those whose members
# all come from a single source (they appear as single-'X' rows in the table).
print('*Number of clusters total (members may come from one or more sources): '
      + commas(len(clusters)))
print('*Number of clusters with the composition described in the row')
print('*For example, there are ' + commas(allSourceCount)
      + ' clusters containing at least one transcript from all '
      + str(len(comboHeader)) + ' sources')
comboTable
Cluster compositions: *Number of clusters total with members from at least 2 or more sources: 2,577 *Number of clusters with the composition described in the row *For example, there are 81 clusters containing at least one transcript from all 4 sources
Pearson | Graveley | Rajewsky | Aboobaker | Number of clusters |
X | - | X | - | 698 |
- | X | X | - | 661 |
- | - | X | - | 321 |
X | - | X | X | 212 |
X | X | X | - | 170 |
- | - | X | X | 131 |
- | X | - | - | 105 |
X | X | X | X | 81 |
X | X | - | - | 63 |
- | X | X | X | 57 |
- | X | - | X | 34 |
X | - | - | X | 30 |
X | X | - | X | 10 |
X | - | - | - | 4 |
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Bar chart comparing, per source, the transcript count with a tally derived
# from the multi-source cluster compositions.
#
# sourceCluster: source name -> total number of clusters whose composition
# contains that source together with at least one other source.
sourceCluster = defaultdict(int)
for combo, count in clusterCount:
    # Compositions made of a single source do not contribute to the tally.
    if len(combo) <= 1:
        continue
    for item in combo:
        sourceCluster[item] += count

# NOTE(review): the second series counts clusters, while its column label
# below says "transcripts clustered"; the per-source transcript tally lives
# in `used`. Confirm which one this plot is meant to show.
transcriptSeries = pd.Series(sources)
clusterSeries = pd.Series(sourceCluster)
df = pd.concat([transcriptSeries, clusterSeries], axis=1)
df.columns = [
    'Number of neoblast transcripts',
    'Number of neoblast transcripts clustered',
]
df.plot(kind='bar', figsize=[12,6])
<matplotlib.axes.AxesSubplot at 0x8abc210>
# Per-source table: transcripts available, transcripts that ended up in a
# cluster, and the clustered share as a percentage.
usedTable = ListTable()
usedTable.append(['Source','Number of neoblast transcripts','Number of transcripts clustered','% clustered'])
for c in comboHeader:
    total = sources[c]
    clustered = used[c]
    if total:
        # Round to one decimal with a format spec. Slicing str(float) to four
        # characters truncates instead of rounding and yields malformed
        # text for values below 10, at 100, or in scientific notation.
        percent = '%.1f' % (100.0 * clustered / total)
    else:
        # A source with no transcripts has no meaningful percentage; avoid
        # the ZeroDivisionError.
        percent = 'n/a'
    usedTable.append([c, commas(total), commas(clustered), percent])
usedTable
Source | Number of neoblast transcripts | Number of transcripts clustered | % clustered |
Pearson | 2,147 | 1,383 | 64.4 |
Graveley | 4,056 | 1,460 | 35.9 |
Rajewsky | 7,696 | 3,172 | 41.2 |
Aboobaker | 823 | 575 | 69.8 |