Cluster neoblast transcripts with CAP3

Use CAP3 to cluster neoblast transcripts:

cap3 aboobaker.graveley.pearson.rajewsky.renamed.fa > cap3.out 2> cap3.err
In [1]:
import gzip, os, sys
#import custom functions for displaying tables, bash commands
sys.path.append(os.path.abspath("/home/damian/"))
from dk_ipython import *
from IPython.display import HTML
HTML(addToggle())
Out[1]:
The raw code for this IPython notebook is by default hidden for easier reading. To toggle on/off the raw code, click here.
In [7]:
prefix = '/home/share/projects/smed_neoblast/cap3/'

faFile = open(prefix + 'aboobaker.graveley.pearson.rajewsky.renamed.fa')

from collections import defaultdict

sourceKey = {'a':'Aboobaker','P':'Pearson','R':'Rajewsky','G':'Graveley'}
sources = defaultdict(int)
for line in faFile:
    if line[0] == '>':
        source = sourceKey[line[1]]
        sources[source] += 1

print 'Data sources used:'
for source, count in sources.items():
    print '  ' + source + ' - ' + commas(count) + ' transcripts'
Data sources used:
  Pearson - 2,147 transcripts
  Graveley - 4,056 transcripts
  Rajewsky - 7,696 transcripts
  Aboobaker - 823 transcripts
In [62]:
inFile = open('/home/share/projects/smed_neoblast/cap3/cap3.out')

clusters = []
block = []
used = defaultdict(int)
for line in inFile:
    if line.find('*******************') != -1:
        break
        
for line in inFile:
    if line.strip() == '':
        break
    if line.find('*******************') != -1:
        if len(block) != 0:
            block.sort()
            aBlock = tuple(set(block))
            if len(block) > 1:
                for x in block:
                    used[x] += 1
                clusters.append(tuple(set(block)))
        
        block = []
    else:
        name = sourceKey[line.strip().split()[0][0]]
        block.append(name)
In [98]:
from collections import Counter

clusterCount = Counter(clusters).items()
clusterCount.sort(key = lambda x : x[1], reverse = True)

comboTable = ListTable()
comboHeader = ['Pearson','Graveley','Rajewsky','Aboobaker']
comboTable.append(comboHeader + ['Number of clusters'])
for combo, count in clusterCount:
    row = []
    for c in comboHeader:
        if c in combo:
            row.append('X')
        else:
            row.append('-')
    row.append(count)
    comboTable.append(row)
    
print 'Cluster compositions:'
print '*Number of clusters total with members from at least 2 or more sources:', commas(len(clusters))
print '*Number of clusters with the composition described in the row'
print '*For example, there are 81 clusters containing at least one transcript from all 4 sources'
comboTable
Cluster compositions:
*Number of clusters total with members from at least 2 or more sources: 2,577
*Number of clusters with the composition described in the row
*For example, there are 81 clusters containing at least one transcript from all 4 sources
Out[98]:
PearsonGraveleyRajewskyAboobakerNumber of clusters
X-X-698
-XX-661
--X-321
X-XX212
XXX-170
--XX131
-X--105
XXXX81
XX--63
-XXX57
-X-X34
X--X30
XX-X10
X---4
In [141]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

sourceCluster = defaultdict(int)
for combo, count in clusterCount:
    if len(combo) > 1:
        for item in combo:
            sourceCluster[item] += count

df = pd.concat([pd.Series(sources),pd.Series(sourceCluster)],axis=1)
df.columns = ['Number of neoblast transcripts','Number of neoblast transcripts clustered']
df.plot(kind='bar',figsize=[12,6])
Out[141]:
<matplotlib.axes.AxesSubplot at 0x8abc210>
In [64]:
usedTable = ListTable()
usedTable.append(['Source','Number of neoblast transcripts','Number of transcripts clustered','% clustered'])
for c in comboHeader:
    usedTable.append([c,commas(sources[c]),commas(used[c]),str(float(used[c]) / sources[c] * 100)[:4]])
    
usedTable
Out[64]:
SourceNumber of neoblast transcriptsNumber of transcripts clustered% clustered
Pearson2,1471,38364.4
Graveley4,0561,46035.9
Rajewsky7,6963,17241.2
Aboobaker82357569.8