In [2]:

import pyupset as pyu
import matplotlib as mpl
import matplotlib.pyplot as plt
from pickle import load
import pandas as pd
import glob 

%matplotlib inline

In [85]:

# Load the comma-separated gene presence/absence table into a DataFrame.
gene_table = pd.read_csv('gene_presence_absence.csv')

In [86]:

# Column layout for the trimmed table: 'Gene' first, followed by every
# column whose name starts with 't_', in alphabetical order.
col_names = ['Gene'] + sorted(
    name for name in gene_table.columns if name.startswith('t_')
)
print(col_names)

# Rebuild the table restricted to (and ordered by) those columns.
gene_table = pd.DataFrame(data=gene_table, columns=col_names)

['Gene', 't_centrarchi', 't_lynn', 't_maritimum', 't_mesophilum', 't_ovolyticum', 't_rebecca', 't_soleae', 't_sp47']

In [87]:

# For each gene name, count the non-null entries in every remaining column,
# then move the 'Gene' group key out of the index and back into a column.
df = gene_table.groupby("Gene").count().reset_index()

# Preview the first ten rows.
df.head(10)

Out[87]:

	Gene	t_centrarchi	t_lynn	t_mesophilum	t_rebecca	t_soleae	t_sp47
0	aadR	0	1	0	1	0	0
1	aarA	0	0	0	0	1	0
2	aarA_1	0	1	0	1	0	0
3	aarA_2	0	1	0	1	0	0
4	aat	0	1	0	1	0	1
5	accA	1	1	0	1	0	0
6	accA1	0	1	1	1	0	0
7	accA_2	0	1	0	0	0	1
8	accB	0	1	0	0	0	0
9	accC	1	1	0	1	1	0

In [89]:

# Build one DataFrame per species column, holding only the genes present in
# that species (columns: 'Gene' plus the species' own count column).
#
# A gene is treated as present when its count is positive. Filtering with
# `== 1` would silently drop any gene whose name occurs on more than one row
# of the input table, because groupby(...).count() then yields a value > 1.
grouped_d = {}
for spname in df.columns:
    if spname == 'Gene':
        # 'Gene' is the identifier column, not a species.
        continue
    sp_df = df.loc[df[spname] > 0, ["Gene", spname]]
    print(spname, len(sp_df))
    grouped_d[spname] = sp_df

t_centrarchi 2475
t_lynn 3013
t_maritimum 141
t_mesophilum 578
t_ovolyticum 930
t_rebecca 3096
t_soleae 1428
t_sp47 692

In [91]:

# Species columns to retain; every other entry is removed from grouped_d
# in place.
keep = ['t_lynn', 't_rebecca', 't_mesophilum', 't_centrarchi', 't_soleae']

# Snapshot the keys first: a dict must not change size while it is iterated.
keylist = list(grouped_d)
for k in keylist:
    if k in keep:
        continue
    grouped_d.pop(k)

In [92]:

# Pass the dict of per-species frames to pyupset's plot(); whatever it
# returns is kept in `pplot`.
pplot = pyu.plot(grouped_d)

In [ ]: