# Run this only if grabbing a new set of data -- the default on this notebook is CC and PF raw
# run this at the start of each notebook
init_ipy()
#load all data from hofmockel project
from datetime import datetime
wgs = [u'mgm4509403.3', u'mgm4511170.3', u'mgm4509405.3', u'mgm4509406.3', u'mgm4509401.3', u'mgm4509404.3', u'mgm4510009.3', u'mgm4512946.3', u'mgm4509399.3', u'mgm4510007.3', u'mgm4511171.3', u'mgm4511176.3', u'mgm4511169.3', u'mgm4509397.3', u'mgm4509402.3', u'mgm4511167.3', u'mgm4511174.3', u'mgm4510008.3', u'mgm4512947.3', u'mgm4511172.3', u'mgm4509407.3', u'mgm4510006.3', u'mgm4511177.3', u'mgm4509400.3', u'mgm4511168.3', u'mgm4509396.3', u'mgm4511166.3', u'mgm4511175.3', u'mgm4512949.3', u'mgm4512948.3', u'mgm4509398.3', u'mgm4511173.3']
data = load_object('hofmockel_wgs_48')
print str(datetime.now())
2013-02-18 20:17:11.200202
# selecting only assemblies across aggregates, micro fraction
sub_selected_datasets=['mgm4512893.3','mgm4512894.3','mgm4512895.3','mgm4512896.3','mgm4512897.3','mgm4512898.3','mgm4512899.3','mgm4512900.3','mgm4512901.3','mgm4512902.3','mgm4512903.3']
data.display_mgs = sub_selected_datasets #this sets these subsets as your data variable now
#saving that data to file
raw_file = 'demo-test.tab'
data.function['abundance'].dump(fname=raw_file, fformat='tab', normalize=0, cols=sub_selected_datasets, col_name=True, row_full=True, scale=None)
#Loading the data into an array and normalizing by gyrB abundance values
import numpy as np
f = open(raw_file, 'r')
list_values = []
content = f.readlines()
headers = content[0].rstrip().split('\t')[1:]
headers_cleaned = []
for x in headers:
x = x.split(' / ')[0]
headers_cleaned.append(x)
annotations = []
cleaned = []
for n, line in enumerate(content[1:]):
dat = line.rstrip().split('\t')
annotation = dat[0]
abundances = dat[1:]
annotations.append(annotation)
cleaned.append(abundances)
clean_array = np.array(cleaned, dtype=int)
total_counts = clean_array.sum(axis=0, dtype=float)
std_total = np.divide(clean_array, total_counts)
std_total_sorted = std_total[std_total[:,0].argsort()]
std_total_sorted_rev = std_total_sorted[::-1]
total_hkg = array([896, 3193 ,1419 ,5088 ,2342, 1290, 589, 3828, 4149, 1943, 903], dtype=float) #note used (SS04640) rather than (SS04489)
std_total = np.divide(clean_array, total_hkg)
std_total_sorted = std_total[std_total[:,0].argsort()]
std_total_sorted_rev = std_total_sorted[::-1]
#Searching for function of interest containing nitrogen
text_of_interest = 'nitrogen'
threshold = 100 #the sum of total observations you want to be minimum
list_of_rows = []
count = 0
for n, x in enumerate(annotations):
if text_of_interest.lower() in x.lower():
if clean_array[n, :].sum() > threshold:
standardized = np.divide(clean_array[n], total_hkg)
list_of_rows.append(n)
#Putting interesting results into lists to start plotting
list_of_functions = []
list_of_data = []
for x in list_of_rows:
list_of_functions.append(annotations[x])
list_of_data.append(list(np.divide(clean_array[x,:], total_hkg)))
#Plots box whisker plot of interesting results, in 20 function increments
import matplotlib
import numpy as np
bottom = 0
top=20
d_big = {}
for n, x in enumerate(list_of_functions[bottom:top]):
d = {}
micro_d = [list_of_data[n][1], list_of_data[n][4], list_of_data[n][7], list_of_data[n][10]]
small_d = [list_of_data[n][2], list_of_data[n][8]]
large_d = [list_of_data[n][0], list_of_data[n][6], list_of_data[n][9]]
whole_d = [list_of_data[n][3], list_of_data[n][5]]
d['large']=large_d
d['micro']=micro_d
d['small']=small_d
d['whole']=whole_d
d_big[x] = d
#print d_big
data2 = []
for key in list_of_functions[bottom:top]:
for x in ['micro', 'small', 'large', 'whole']:
data2.append(d_big[key][x])
fig = plt.figure(figsize=[12,12])
ax1 = fig.add_subplot(111)
bp = plt.boxplot(data2, notch=0, sym='+', vert=1, whis=1.5)
plt.setp(bp['boxes'], color='black')
plt.setp(bp['whiskers'], color='black')
plt.setp(bp['fliers'], color='red', marker='+')
ax1.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
alpha=0.5)
ax1.set_axisbelow(True)
ax1.set_title('Nitrogen mania')
ax1.set_xlabel('Different nitrogen functions')
ax1.set_ylabel('Abundance normalized by gyrB')
boxColors = ['r','b', 'c', 'm']
numBoxes = len(list_of_functions[bottom:top])*4
# Now fill the boxes with desired colors
medians = range(numBoxes)
for i in range(numBoxes):
box = bp['boxes'][i]
boxX = []
boxY = []
for j in range(5):
boxX.append(box.get_xdata()[j])
boxY.append(box.get_ydata()[j])
boxCoords = zip(boxX,boxY)
# Alternate between Dark Khaki and Royal Blue
k = i % 4
boxPolygon = Polygon(boxCoords, facecolor=boxColors[k])
ax1.add_patch(boxPolygon)
# Now draw the median lines back over what we just filled in
med = bp['medians'][i]
medianX = []
medianY = []
for j in range(2):
medianX.append(med.get_xdata()[j])
medianY.append(med.get_ydata()[j])
plt.plot(medianX, medianY, 'k')
medians[i] = medianY[0]
# Finally, overplot the sample averages, with horizontal alignment
# in the center of each box
plt.plot([np.average(med.get_xdata())], [np.average(data2[i])],
color='w', marker='*', markeredgecolor='k')
xticks_list = []
for x in list_of_functions[bottom:top]:
xticks_list.append('')
xticks_list.append(x)
xticks_list.append('')
xticks_list.append('')
xtickNames = plt.setp(ax1, xticklabels=xticks_list)
plt.setp(xtickNames, rotation=90, fontsize=12)
# Finally, add a basic legend
plt.figtext(0.25, .8, 'Micro' ,
backgroundcolor=boxColors[0], weight='roman',
size='x-large', color='white')
plt.figtext(0.25, .78, 'Small', backgroundcolor=boxColors[1],
weight='roman', size='x-large', color='white')
plt.figtext(0.25, .76, 'Large', backgroundcolor=boxColors[2],
weight='roman', size='x-large', color='white')
plt.figtext(0.25, .735, 'Whole', backgroundcolor=boxColors[3],
weight='roman', size='x-large', color='white')
show()
#Grab phylogeny of specific subsystem
f1 = 'SS00496'
matrix_str = data.function['abundance'].dump(fformat='tab', rows=f1, cols=sub_selected_datasets, normalize=0, col_name=False, row_full=False, scale=None)
sub_rows, cols, sub_matrix = data.function['abundance'].sub_matrix(normalize=0, rows=f1, cols=sub_selected_datasets)
sub_cols = []
for i, col in enumerate(cols):
if sum(slice_column(sub_matrix, i)) > 0:
sub_cols.append(col)
#print out the phylogeny into file
new_data = Analysis(ids=sub_cols, annotation='organism', level='phylum', filters=sub_rows)
filename = 'test-demo2.tab'
new_data.dump(fname=filename, fformat='tab', cols=None, normalize=0, col_name=True, scale=None, row_min=0)
#Put this data into an array
f = open(filename, 'r')
list_values = []
content = f.readlines()
headers = content[0].rstrip().split('\t')[1:]
headers_cleaned = []
for x in headers:
x = x.split(' / ')[0]
headers_cleaned.append(x)
annotations = []
cleaned = []
for n, line in enumerate(content[1:]):
dat = line.rstrip().split('\t')
annotation = dat[0]
abundances = dat[1:]
annotations.append(annotation)
cleaned.append(abundances)
clean_array = np.array(cleaned, dtype=int)
total_counts = clean_array.sum(axis=0, dtype=float)
std_total = np.divide(clean_array, total_counts)
std_total_sorted = std_total[std_total[:,0].argsort()]
std_total_sorted_rev = std_total_sorted[::-1]
sort_index = std_total[:,0].argsort()
annotations_sorted = []
for x in sort_index:
annotations_sorted.append(annotations[x])
annotations_sorted.reverse()
# Put into a plot
import matplotlib
import numpy as np
large1 = list(std_total_sorted_rev[:,0])
micro1 = list(std_total_sorted_rev[:,1])
small1 = list(std_total_sorted_rev[:,2])
whole1 = list(std_total_sorted_rev[:,3])
micro2 = list(std_total_sorted_rev[:,4])
whole2 = list(std_total_sorted_rev[:,5])
large2 = list(std_total_sorted_rev[:,6])
micro3 = list(std_total_sorted_rev[:,7])
small2 = list(std_total_sorted_rev[:,8])
large3 = list(std_total_sorted_rev[:,9])
micro4 = list(std_total_sorted_rev[:,10])
bottom = 0
top=10
d_big = {}
for n, x in enumerate(annotations_sorted):
if bottom <= n < top:
d = {}
large_total = [large1[n], large2[n], large3[n]]
micro_total = [micro1[n], micro2[n], micro3[n], micro4[n]]
whole_total = [whole1[n], whole2[n]]
small_total = [small1[n], small2[n]]
d['large']=large_total
d['micro']=micro_total
d['small']=small_total
d['whole']=whole_total
d_big[x] = d
data = []
for key in annotations_sorted[bottom:top]:
for x in ['micro', 'small', 'large', 'whole']:
data.append(d_big[key][x])
fig = plt.figure(figsize=[12,12])
ax1 = fig.add_subplot(111)
bp = plt.boxplot(data, notch=0, sym='+', vert=1, whis=1.5)
plt.setp(bp['boxes'], color='black')
plt.setp(bp['whiskers'], color='black')
plt.setp(bp['fliers'], color='red', marker='+')
ax1.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
alpha=0.5)
ax1.set_axisbelow(True)
ax1.set_title(' Associated phyla ' + str(bottom) + ' to ' + str(top))
ax1.set_xlabel('Phyla')
ax1.set_ylabel('Relative Fraction')
boxColors = ['r','b', 'c', 'm']
numBoxes = len(annotations_sorted[bottom:top])*4#numDists*2
# Now fill the boxes with desired colors
medians = range(numBoxes)
for i in range(numBoxes):
box = bp['boxes'][i]
boxX = []
boxY = []
for j in range(5):
boxX.append(box.get_xdata()[j])
boxY.append(box.get_ydata()[j])
boxCoords = zip(boxX,boxY)
# Alternate between Dark Khaki and Royal Blue
k = i % 4
boxPolygon = Polygon(boxCoords, facecolor=boxColors[k])
ax1.add_patch(boxPolygon)
# Now draw the median lines back over what we just filled in
med = bp['medians'][i]
medianX = []
medianY = []
for j in range(2):
medianX.append(med.get_xdata()[j])
medianY.append(med.get_ydata()[j])
plt.plot(medianX, medianY, 'k')
medians[i] = medianY[0]
# Finally, overplot the sample averages, with horizontal alignment
# in the center of each box
plt.plot([np.average(med.get_xdata())], [np.average(data[i])],
color='w', marker='*', markersize=3, markeredgecolor='k')
xticks_list = []
for x in annotations_sorted:
xticks_list.append('')
xticks_list.append('')
xticks_list.append(x)
xticks_list.append('')
xtickNames = plt.setp(ax1, xticklabels=xticks_list)
plt.setp(xtickNames, rotation=90, fontsize=12)
# Finally, add a basic legend
plt.figtext(0.25, .8, 'Micro' ,
backgroundcolor=boxColors[0], weight='roman',
size='x-large', color='white')
plt.figtext(0.25, .78, 'Small', backgroundcolor=boxColors[1],
weight='roman', size='x-large', color='white')
plt.figtext(0.25, .76, 'Large', backgroundcolor=boxColors[2],
weight='roman', size='x-large', color='white')
plt.figtext(0.25, .735, 'Whole', backgroundcolor=boxColors[3],
weight='roman', size='x-large', color='white')
show()