# Run this only if grabbing a new set of data -- the default on this notebook is CC and PF raw # run this at the start of each notebook init_ipy() #load all data from hofmockel project from datetime import datetime wgs = [u'mgm4509403.3', u'mgm4511170.3', u'mgm4509405.3', u'mgm4509406.3', u'mgm4509401.3', u'mgm4509404.3', u'mgm4510009.3', u'mgm4512946.3', u'mgm4509399.3', u'mgm4510007.3', u'mgm4511171.3', u'mgm4511176.3', u'mgm4511169.3', u'mgm4509397.3', u'mgm4509402.3', u'mgm4511167.3', u'mgm4511174.3', u'mgm4510008.3', u'mgm4512947.3', u'mgm4511172.3', u'mgm4509407.3', u'mgm4510006.3', u'mgm4511177.3', u'mgm4509400.3', u'mgm4511168.3', u'mgm4509396.3', u'mgm4511166.3', u'mgm4511175.3', u'mgm4512949.3', u'mgm4512948.3', u'mgm4509398.3', u'mgm4511173.3'] data = load_object('hofmockel_wgs_48') print str(datetime.now()) # selecting only assemblies across aggregates, micro fraction sub_selected_datasets=['mgm4512893.3','mgm4512894.3','mgm4512895.3','mgm4512896.3','mgm4512897.3','mgm4512898.3','mgm4512899.3','mgm4512900.3','mgm4512901.3','mgm4512902.3','mgm4512903.3'] data.display_mgs = sub_selected_datasets #this sets these subsets as your data variable now #saving that data to file raw_file = 'demo-test.tab' data.function['abundance'].dump(fname=raw_file, fformat='tab', normalize=0, cols=sub_selected_datasets, col_name=True, row_full=True, scale=None) #Loading the data into an array and normalizing by gyrB abundance values import numpy as np f = open(raw_file, 'r') list_values = [] content = f.readlines() headers = content[0].rstrip().split('\t')[1:] headers_cleaned = [] for x in headers: x = x.split(' / ')[0] headers_cleaned.append(x) annotations = [] cleaned = [] for n, line in enumerate(content[1:]): dat = line.rstrip().split('\t') annotation = dat[0] abundances = dat[1:] annotations.append(annotation) cleaned.append(abundances) clean_array = np.array(cleaned, dtype=int) total_counts = clean_array.sum(axis=0, dtype=float) std_total = np.divide(clean_array, total_counts) std_total_sorted = std_total[std_total[:,0].argsort()] std_total_sorted_rev = std_total_sorted[::-1] total_hkg = array([896, 3193 ,1419 ,5088 ,2342, 1290, 589, 3828, 4149, 1943, 903], dtype=float) #note used (SS04640) rather than (SS04489) std_total = np.divide(clean_array, total_hkg) std_total_sorted = std_total[std_total[:,0].argsort()] std_total_sorted_rev = std_total_sorted[::-1] #Searching for function of interest containing nitrogen text_of_interest = 'nitrogen' threshold = 100 #the sum of total observations you want to be minimum list_of_rows = [] count = 0 for n, x in enumerate(annotations): if text_of_interest.lower() in x.lower(): if clean_array[n, :].sum() > threshold: standardized = np.divide(clean_array[n], total_hkg) list_of_rows.append(n) #Putting interesting results into lists to start plotting list_of_functions = [] list_of_data = [] for x in list_of_rows: list_of_functions.append(annotations[x]) list_of_data.append(list(np.divide(clean_array[x,:], total_hkg))) #Plots box whisker plot of interesting results, in 20 function increments import matplotlib import numpy as np bottom = 0 top=20 d_big = {} for n, x in enumerate(list_of_functions[bottom:top]): d = {} micro_d = [list_of_data[n][1], list_of_data[n][4], list_of_data[n][7], list_of_data[n][10]] small_d = [list_of_data[n][2], list_of_data[n][8]] large_d = [list_of_data[n][0], list_of_data[n][6], list_of_data[n][9]] whole_d = [list_of_data[n][3], list_of_data[n][5]] d['large']=large_d d['micro']=micro_d d['small']=small_d d['whole']=whole_d d_big[x] = d #print d_big data2 = [] for key in list_of_functions[bottom:top]: for x in ['micro', 'small', 'large', 'whole']: data2.append(d_big[key][x]) fig = plt.figure(figsize=[12,12]) ax1 = fig.add_subplot(111) bp = plt.boxplot(data2, notch=0, sym='+', vert=1, whis=1.5) plt.setp(bp['boxes'], color='black') plt.setp(bp['whiskers'], color='black') plt.setp(bp['fliers'], color='red', marker='+') ax1.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5) ax1.set_axisbelow(True) ax1.set_title('Nitrogen mania') ax1.set_xlabel('Different nitrogen functions') ax1.set_ylabel('Abundance normalized by gyrB') boxColors = ['r','b', 'c', 'm'] numBoxes = len(list_of_functions[bottom:top])*4 # Now fill the boxes with desired colors medians = range(numBoxes) for i in range(numBoxes): box = bp['boxes'][i] boxX = [] boxY = [] for j in range(5): boxX.append(box.get_xdata()[j]) boxY.append(box.get_ydata()[j]) boxCoords = zip(boxX,boxY) # Alternate between Dark Khaki and Royal Blue k = i % 4 boxPolygon = Polygon(boxCoords, facecolor=boxColors[k]) ax1.add_patch(boxPolygon) # Now draw the median lines back over what we just filled in med = bp['medians'][i] medianX = [] medianY = [] for j in range(2): medianX.append(med.get_xdata()[j]) medianY.append(med.get_ydata()[j]) plt.plot(medianX, medianY, 'k') medians[i] = medianY[0] # Finally, overplot the sample averages, with horizontal alignment # in the center of each box plt.plot([np.average(med.get_xdata())], [np.average(data2[i])], color='w', marker='*', markeredgecolor='k') xticks_list = [] for x in list_of_functions[bottom:top]: xticks_list.append('') xticks_list.append(x) xticks_list.append('') xticks_list.append('') xtickNames = plt.setp(ax1, xticklabels=xticks_list) plt.setp(xtickNames, rotation=90, fontsize=12) # Finally, add a basic legend plt.figtext(0.25, .8, 'Micro' , backgroundcolor=boxColors[0], weight='roman', size='x-large', color='white') plt.figtext(0.25, .78, 'Small', backgroundcolor=boxColors[1], weight='roman', size='x-large', color='white') plt.figtext(0.25, .76, 'Large', backgroundcolor=boxColors[2], weight='roman', size='x-large', color='white') plt.figtext(0.25, .735, 'Whole', backgroundcolor=boxColors[3], weight='roman', size='x-large', color='white') show() #Grab phylogeny of specific subsystem f1 = 'SS00496' matrix_str = data.function['abundance'].dump(fformat='tab', rows=f1, cols=sub_selected_datasets, normalize=0, col_name=False, row_full=False, scale=None) sub_rows, cols, sub_matrix = data.function['abundance'].sub_matrix(normalize=0, rows=f1, cols=sub_selected_datasets) sub_cols = [] for i, col in enumerate(cols): if sum(slice_column(sub_matrix, i)) > 0: sub_cols.append(col) #print out the phylogeny into file new_data = Analysis(ids=sub_cols, annotation='organism', level='phylum', filters=sub_rows) filename = 'test-demo2.tab' new_data.dump(fname=filename, fformat='tab', cols=None, normalize=0, col_name=True, scale=None, row_min=0) #Put this data into an array f = open(filename, 'r') list_values = [] content = f.readlines() headers = content[0].rstrip().split('\t')[1:] headers_cleaned = [] for x in headers: x = x.split(' / ')[0] headers_cleaned.append(x) annotations = [] cleaned = [] for n, line in enumerate(content[1:]): dat = line.rstrip().split('\t') annotation = dat[0] abundances = dat[1:] annotations.append(annotation) cleaned.append(abundances) clean_array = np.array(cleaned, dtype=int) total_counts = clean_array.sum(axis=0, dtype=float) std_total = np.divide(clean_array, total_counts) std_total_sorted = std_total[std_total[:,0].argsort()] std_total_sorted_rev = std_total_sorted[::-1] sort_index = std_total[:,0].argsort() annotations_sorted = [] for x in sort_index: annotations_sorted.append(annotations[x]) annotations_sorted.reverse() # Put into a plot import matplotlib import numpy as np large1 = list(std_total_sorted_rev[:,0]) micro1 = list(std_total_sorted_rev[:,1]) small1 = list(std_total_sorted_rev[:,2]) whole1 = list(std_total_sorted_rev[:,3]) micro2 = list(std_total_sorted_rev[:,4]) whole2 = list(std_total_sorted_rev[:,5]) large2 = list(std_total_sorted_rev[:,6]) micro3 = list(std_total_sorted_rev[:,7]) small2 = list(std_total_sorted_rev[:,8]) large3 = list(std_total_sorted_rev[:,9]) micro4 = list(std_total_sorted_rev[:,10]) bottom = 0 top=10 d_big = {} for n, x in enumerate(annotations_sorted): if bottom <= n < top: d = {} large_total = [large1[n], large2[n], large3[n]] micro_total = [micro1[n], micro2[n], micro3[n], micro4[n]] whole_total = [whole1[n], whole2[n]] small_total = [small1[n], small2[n]] d['large']=large_total d['micro']=micro_total d['small']=small_total d['whole']=whole_total d_big[x] = d data = [] for key in annotations_sorted[bottom:top]: for x in ['micro', 'small', 'large', 'whole']: data.append(d_big[key][x]) fig = plt.figure(figsize=[12,12]) ax1 = fig.add_subplot(111) bp = plt.boxplot(data, notch=0, sym='+', vert=1, whis=1.5) plt.setp(bp['boxes'], color='black') plt.setp(bp['whiskers'], color='black') plt.setp(bp['fliers'], color='red', marker='+') ax1.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5) ax1.set_axisbelow(True) ax1.set_title(' Associated phyla ' + str(bottom) + ' to ' + str(top)) ax1.set_xlabel('Phyla') ax1.set_ylabel('Relative Fraction') boxColors = ['r','b', 'c', 'm'] numBoxes = len(annotations_sorted[bottom:top])*4#numDists*2 # Now fill the boxes with desired colors medians = range(numBoxes) for i in range(numBoxes): box = bp['boxes'][i] boxX = [] boxY = [] for j in range(5): boxX.append(box.get_xdata()[j]) boxY.append(box.get_ydata()[j]) boxCoords = zip(boxX,boxY) # Alternate between Dark Khaki and Royal Blue k = i % 4 boxPolygon = Polygon(boxCoords, facecolor=boxColors[k]) ax1.add_patch(boxPolygon) # Now draw the median lines back over what we just filled in med = bp['medians'][i] medianX = [] medianY = [] for j in range(2): medianX.append(med.get_xdata()[j]) medianY.append(med.get_ydata()[j]) plt.plot(medianX, medianY, 'k') medians[i] = medianY[0] # Finally, overplot the sample averages, with horizontal alignment # in the center of each box plt.plot([np.average(med.get_xdata())], [np.average(data[i])], color='w', marker='*', markersize=3, markeredgecolor='k') xticks_list = [] for x in annotations_sorted: xticks_list.append('') xticks_list.append('') xticks_list.append(x) xticks_list.append('') xtickNames = plt.setp(ax1, xticklabels=xticks_list) plt.setp(xtickNames, rotation=90, fontsize=12) # Finally, add a basic legend plt.figtext(0.25, .8, 'Micro' , backgroundcolor=boxColors[0], weight='roman', size='x-large', color='white') plt.figtext(0.25, .78, 'Small', backgroundcolor=boxColors[1], weight='roman', size='x-large', color='white') plt.figtext(0.25, .76, 'Large', backgroundcolor=boxColors[2], weight='roman', size='x-large', color='white') plt.figtext(0.25, .735, 'Whole', backgroundcolor=boxColors[3], weight='roman', size='x-large', color='white') show()