Notebook

In [1]:

# Run this only if grabbing a new set of data -- the default on this notebook is CC and PF raw
# run this at the start of each notebook
init_ipy()

In [2]:

#load all data from hofmockel project
from datetime import datetime
wgs = [u'mgm4509403.3', u'mgm4511170.3', u'mgm4509405.3', u'mgm4509406.3', u'mgm4509401.3', u'mgm4509404.3', u'mgm4510009.3', u'mgm4512946.3', u'mgm4509399.3', u'mgm4510007.3', u'mgm4511171.3', u'mgm4511176.3', u'mgm4511169.3', u'mgm4509397.3', u'mgm4509402.3', u'mgm4511167.3', u'mgm4511174.3', u'mgm4510008.3', u'mgm4512947.3', u'mgm4511172.3', u'mgm4509407.3', u'mgm4510006.3', u'mgm4511177.3', u'mgm4509400.3', u'mgm4511168.3', u'mgm4509396.3', u'mgm4511166.3', u'mgm4511175.3', u'mgm4512949.3', u'mgm4512948.3', u'mgm4509398.3', u'mgm4511173.3']
data = load_object('hofmockel_wgs_48')
print str(datetime.now())

2013-02-18 20:17:11.200202

In [3]:

# selecting only assemblies across aggregates, micro fraction

sub_selected_datasets=['mgm4512893.3','mgm4512894.3','mgm4512895.3','mgm4512896.3','mgm4512897.3','mgm4512898.3','mgm4512899.3','mgm4512900.3','mgm4512901.3','mgm4512902.3','mgm4512903.3']
data.display_mgs = sub_selected_datasets #this sets these subsets as your data variable now

In [5]:

#saving that data to file
raw_file = 'demo-test.tab'
data.function['abundance'].dump(fname=raw_file, fformat='tab', normalize=0, cols=sub_selected_datasets, col_name=True, row_full=True, scale=None)

In [6]:

#Loading the data into an array and normalizing by gyrB abundance values

import numpy as np

f = open(raw_file, 'r')
list_values = []
content = f.readlines()
headers = content[0].rstrip().split('\t')[1:]
headers_cleaned = []


for x in headers:
    x = x.split(' / ')[0]
    headers_cleaned.append(x) 

annotations = []
cleaned = []

for n, line in enumerate(content[1:]):
    dat = line.rstrip().split('\t')
    annotation = dat[0]
    abundances = dat[1:]
    annotations.append(annotation)
    cleaned.append(abundances)

clean_array = np.array(cleaned, dtype=int)
total_counts = clean_array.sum(axis=0, dtype=float)
std_total = np.divide(clean_array, total_counts)
std_total_sorted = std_total[std_total[:,0].argsort()]
std_total_sorted_rev = std_total_sorted[::-1]
total_hkg = array([896,	3193	,1419	,5088	,2342,	1290,	589,	3828,	4149,	1943,	903], dtype=float) #note used  (SS04640) rather than (SS04489)
std_total = np.divide(clean_array, total_hkg)
std_total_sorted = std_total[std_total[:,0].argsort()]
std_total_sorted_rev = std_total_sorted[::-1]

In [7]:

#Searching for function of interest containing nitrogen

text_of_interest = 'nitrogen' 
threshold = 100 #the sum of total observations you want to be minimum
list_of_rows = []

count = 0
for n, x in enumerate(annotations):
    if text_of_interest.lower() in x.lower():
        if clean_array[n, :].sum() > threshold:
            standardized = np.divide(clean_array[n], total_hkg)
            list_of_rows.append(n)

In [8]:

#Putting interesting results into lists to start plotting 

list_of_functions = []
list_of_data = []

for x in list_of_rows:
    list_of_functions.append(annotations[x])
    list_of_data.append(list(np.divide(clean_array[x,:], total_hkg)))

In [34]:

#Plots box whisker plot of interesting results, in 20 function increments

import matplotlib
import numpy as np

bottom = 0
top=20                 


d_big = {}
for n, x in enumerate(list_of_functions[bottom:top]):
    d = {}
    
    micro_d = [list_of_data[n][1], list_of_data[n][4], list_of_data[n][7], list_of_data[n][10]]
    small_d = [list_of_data[n][2], list_of_data[n][8]]
    large_d = [list_of_data[n][0], list_of_data[n][6], list_of_data[n][9]]
    whole_d = [list_of_data[n][3], list_of_data[n][5]]
    
    d['large']=large_d
    d['micro']=micro_d
    d['small']=small_d
    d['whole']=whole_d
    d_big[x] = d

#print d_big



data2 = []

for key in list_of_functions[bottom:top]:
    for x in ['micro', 'small', 'large', 'whole']:
        data2.append(d_big[key][x])

fig = plt.figure(figsize=[12,12])
ax1 = fig.add_subplot(111)

bp = plt.boxplot(data2, notch=0, sym='+', vert=1, whis=1.5)
plt.setp(bp['boxes'], color='black')
plt.setp(bp['whiskers'], color='black')
plt.setp(bp['fliers'], color='red', marker='+')


ax1.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
              alpha=0.5)
ax1.set_axisbelow(True)
ax1.set_title('Nitrogen mania')
ax1.set_xlabel('Different nitrogen functions')
ax1.set_ylabel('Abundance normalized by gyrB')

boxColors = ['r','b', 'c', 'm']
numBoxes = len(list_of_functions[bottom:top])*4
# Now fill the boxes with desired colors
medians = range(numBoxes)
for i in range(numBoxes):
  box = bp['boxes'][i]
  boxX = []
  boxY = []
  for j in range(5):
      boxX.append(box.get_xdata()[j])
      boxY.append(box.get_ydata()[j])
  boxCoords = zip(boxX,boxY)
  # Alternate between Dark Khaki and Royal Blue
  k = i % 4
  boxPolygon = Polygon(boxCoords, facecolor=boxColors[k])
  ax1.add_patch(boxPolygon)
  # Now draw the median lines back over what we just filled in
  med = bp['medians'][i]
  medianX = []
  medianY = []
  for j in range(2):
      medianX.append(med.get_xdata()[j])
      medianY.append(med.get_ydata()[j])
      plt.plot(medianX, medianY, 'k')
      medians[i] = medianY[0]
  # Finally, overplot the sample averages, with horizontal alignment
  # in the center of each box
  plt.plot([np.average(med.get_xdata())], [np.average(data2[i])],
           color='w', marker='*', markeredgecolor='k')

xticks_list = []
for x in  list_of_functions[bottom:top]:
    xticks_list.append('')
    xticks_list.append(x)
    xticks_list.append('')
    xticks_list.append('')
  
    
xtickNames = plt.setp(ax1, xticklabels=xticks_list)
plt.setp(xtickNames, rotation=90, fontsize=12)

# Finally, add a basic legend
plt.figtext(0.25, .8, 'Micro' ,
           backgroundcolor=boxColors[0], weight='roman',
           size='x-large', color='white')
plt.figtext(0.25, .78, 'Small', backgroundcolor=boxColors[1],
           weight='roman', size='x-large', color='white')
plt.figtext(0.25, .76, 'Large', backgroundcolor=boxColors[2],
          weight='roman', size='x-large', color='white')
plt.figtext(0.25, .735, 'Whole', backgroundcolor=boxColors[3],
           weight='roman', size='x-large', color='white')
show()

In [35]:

#Grab phylogeny of specific subsystem

f1 = 'SS00496'
matrix_str = data.function['abundance'].dump(fformat='tab', rows=f1, cols=sub_selected_datasets, normalize=0, col_name=False, row_full=False, scale=None)
sub_rows, cols, sub_matrix = data.function['abundance'].sub_matrix(normalize=0, rows=f1, cols=sub_selected_datasets)
sub_cols = []
for i, col in enumerate(cols):
    if sum(slice_column(sub_matrix, i)) > 0:
        sub_cols.append(col)

In [36]:

#print out the phylogeny into file

new_data = Analysis(ids=sub_cols, annotation='organism', level='phylum', filters=sub_rows)
filename = 'test-demo2.tab'
new_data.dump(fname=filename, fformat='tab', cols=None, normalize=0, col_name=True, scale=None, row_min=0)

In [37]:

#Put this data into an array

f = open(filename, 'r')

list_values = []
content = f.readlines()
headers = content[0].rstrip().split('\t')[1:]
headers_cleaned = []

for x in headers:
    x = x.split(' / ')[0]
    headers_cleaned.append(x) 

    
annotations = []
cleaned = []

for n, line in enumerate(content[1:]):
    dat = line.rstrip().split('\t')
    annotation = dat[0]
    abundances = dat[1:]
    annotations.append(annotation)
    cleaned.append(abundances)


clean_array = np.array(cleaned, dtype=int)
total_counts = clean_array.sum(axis=0, dtype=float)
std_total = np.divide(clean_array, total_counts)
std_total_sorted = std_total[std_total[:,0].argsort()]
std_total_sorted_rev = std_total_sorted[::-1]


sort_index = std_total[:,0].argsort()
annotations_sorted = []
for x in sort_index:
    annotations_sorted.append(annotations[x])
annotations_sorted.reverse()

In [42]:

# Put into a plot

import matplotlib
import numpy as np

large1 = list(std_total_sorted_rev[:,0])
micro1 = list(std_total_sorted_rev[:,1])
small1 = list(std_total_sorted_rev[:,2])
whole1 = list(std_total_sorted_rev[:,3])
micro2 = list(std_total_sorted_rev[:,4])
whole2 = list(std_total_sorted_rev[:,5])
large2 = list(std_total_sorted_rev[:,6])
micro3 = list(std_total_sorted_rev[:,7])
small2 = list(std_total_sorted_rev[:,8])
large3 = list(std_total_sorted_rev[:,9])
micro4 = list(std_total_sorted_rev[:,10])

bottom = 0
top=10

d_big = {}
for n, x in enumerate(annotations_sorted):
    if bottom <= n < top:
        d = {}

        large_total = [large1[n], large2[n], large3[n]]
        micro_total = [micro1[n], micro2[n], micro3[n], micro4[n]]
        whole_total = [whole1[n], whole2[n]]
        small_total = [small1[n], small2[n]]
        d['large']=large_total
        d['micro']=micro_total
        d['small']=small_total
        d['whole']=whole_total
        d_big[x] = d

data = []

for key in annotations_sorted[bottom:top]:
    for x in ['micro', 'small', 'large', 'whole']:
        data.append(d_big[key][x])

fig = plt.figure(figsize=[12,12])
ax1 = fig.add_subplot(111)

bp = plt.boxplot(data, notch=0, sym='+', vert=1, whis=1.5)
plt.setp(bp['boxes'], color='black')
plt.setp(bp['whiskers'], color='black')
plt.setp(bp['fliers'], color='red', marker='+')


ax1.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
              alpha=0.5)
ax1.set_axisbelow(True)
ax1.set_title(' Associated phyla ' + str(bottom) + ' to ' + str(top))
ax1.set_xlabel('Phyla')
ax1.set_ylabel('Relative Fraction')

boxColors = ['r','b', 'c', 'm']
numBoxes = len(annotations_sorted[bottom:top])*4#numDists*2
# Now fill the boxes with desired colors
medians = range(numBoxes)
for i in range(numBoxes):
  box = bp['boxes'][i]
  boxX = []
  boxY = []
  for j in range(5):
      boxX.append(box.get_xdata()[j])
      boxY.append(box.get_ydata()[j])
  boxCoords = zip(boxX,boxY)
  # Alternate between Dark Khaki and Royal Blue
  k = i % 4
  boxPolygon = Polygon(boxCoords, facecolor=boxColors[k])
  ax1.add_patch(boxPolygon)
  # Now draw the median lines back over what we just filled in
  med = bp['medians'][i]
  medianX = []
  medianY = []
  for j in range(2):
      medianX.append(med.get_xdata()[j])
      medianY.append(med.get_ydata()[j])
      plt.plot(medianX, medianY, 'k')
      medians[i] = medianY[0]
  # Finally, overplot the sample averages, with horizontal alignment
  # in the center of each box
  plt.plot([np.average(med.get_xdata())], [np.average(data[i])],
           color='w', marker='*', markersize=3, markeredgecolor='k')


xticks_list = []
for x in annotations_sorted:
    xticks_list.append('')
    xticks_list.append('')
    xticks_list.append(x)
    xticks_list.append('')
    
xtickNames = plt.setp(ax1, xticklabels=xticks_list)
plt.setp(xtickNames, rotation=90, fontsize=12)

# Finally, add a basic legend
plt.figtext(0.25, .8, 'Micro' ,
           backgroundcolor=boxColors[0], weight='roman',
           size='x-large', color='white')
plt.figtext(0.25, .78, 'Small', backgroundcolor=boxColors[1],
           weight='roman', size='x-large', color='white')
plt.figtext(0.25, .76, 'Large', backgroundcolor=boxColors[2],
          weight='roman', size='x-large', color='white')
plt.figtext(0.25, .735, 'Whole', backgroundcolor=boxColors[3],
           weight='roman', size='x-large', color='white')
show()
           

In [ ]: