# --- Setup -----------------------------------------------------------------
import gzip
import os
import sys

# Import custom functions for displaying tables and running bash commands.
# The path must be appended before dk_ipython can be imported.
sys.path.append(os.path.abspath("/home/damian/"))
from dk_ipython import *
from IPython.display import HTML
HTML(addToggle())

# --- Count the transcripts contributed by each data source ------------------
from collections import defaultdict

prefix = '/home/share/projects/smed_neoblast/cap3/'

# The first character of a sequence name selects the data source it came from.
sourceKey = {'a':'Aboobaker','P':'Pearson','R':'Rajewsky','G':'Graveley'}

# sources[name] -> number of FASTA header lines seen for that source.
sources = defaultdict(int)
# `with` closes the file even if a header has an unknown prefix (KeyError).
with open(prefix + 'aboobaker.graveley.pearson.rajewsky.renamed.fa') as faFile:
    for line in faFile:
        if line.startswith('>'):
            source = sourceKey[line[1]]
            sources[source] += 1

# Single-argument parenthesised print behaves the same on Python 2 and 3.
print('Data sources used:')
for source, count in sources.items():
    print(' ' + source + ' - ' + commas(count) + ' transcripts')

# --- Parse the cluster listing in cap3.out ----------------------------------
contigMarker = '*******************'

# clusters: one tuple of source names per recorded cluster.
# used[name]: number of member sequences from that source in recorded clusters.
clusters = []
used = defaultdict(int)


def _recordCluster(members):
    """Record one finished cluster given the source name of each member.

    Clusters with fewer than two members are ignored.  The composition is
    stored as a sorted tuple of distinct source names so that identical
    compositions always produce the same key.
    """
    # NOTE(review): the original indentation was lost in this file; this
    # assumes clusters.append(...) sat inside the `len(block) > 1` branch.
    if len(members) > 1:
        for member in members:
            used[member] += 1
        clusters.append(tuple(sorted(set(members))))


with open(prefix + 'cap3.out') as inFile:
    # Skip everything up to and including the first marker line.
    for line in inFile:
        if contigMarker in line:
            break

    block = []
    for line in inFile:
        # A blank line ends the listing that is parsed here.
        if line.strip() == '':
            break
        if contigMarker in line:
            # A new marker line closes the cluster collected so far.
            _recordCluster(block)
            block = []
        else:
            # First character of the first token identifies the source.
            name = sourceKey[line.strip().split()[0][0]]
            block.append(name)

    # The final cluster has no marker line after it; record it explicitly,
    # otherwise it would be silently dropped.
    _recordCluster(block)

# --- Tabulate cluster compositions ------------------------------------------
from collections import Counter

# List of (composition, number of clusters), most frequent composition first.
clusterCount = Counter(clusters).most_common()

comboTable = ListTable()
comboHeader = ['Pearson','Graveley','Rajewsky','Aboobaker']
comboTable.append(comboHeader + ['Number of clusters'])
for combo, count in clusterCount:
    # One column per source: 'X' when the source is present, '-' otherwise.
    row = ['X' if c in combo else '-' for c in comboHeader]
    row.append(count)
    comboTable.append(row)

print('Cluster compositions:')
# NOTE(review): len(clusters) also counts clusters whose members all come
# from a single source, which does not match the wording below - confirm.
print('*Number of clusters total with members from at least 2 or more sources:' + ' ' + commas(len(clusters)))
print('*Number of clusters with the composition described in the row')
# NOTE(review): the figure 81 is hard-coded and will not follow the data.
print('*For example, there are 81 clusters containing at least one transcript from all 4 sources')
comboTable

# --- Plotting libraries used by the following cell --------------------------
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline sourceCluster = defaultdict(int) for combo, count in clusterCount: if len(combo) > 1: for item in combo: sourceCluster[item] += count df = pd.concat([pd.Series(sources),pd.Series(sourceCluster)],axis=1) df.columns = ['Number of neoblast transcripts','Number of neoblast transcripts clustered'] df.plot(kind='bar',figsize=[12,6]) usedTable = ListTable() usedTable.append(['Source','Number of neoblast transcripts','Number of transcripts clustered','% clustered']) for c in comboHeader: usedTable.append([c,commas(sources[c]),commas(used[c]),str(float(used[c]) / sources[c] * 100)[:4]]) usedTable