from IPython.display import HTML import pandas as pd import numpy as np import os import matplotlib.pyplot as plt from matplotlib_venn import venn3, venn3_circles, venn3_unweighted import seaborn %pylab inline #These are defined by the way annovar defines precedence. I found empirically that stop_gain > frame_shift in annovar, hence the reverse precedence_dict = { "splicing_variant": 1, "frameshift_variant": 4, "stop_gained": 2, "stop_lost": 3, "inframe_variant": 5, "nonsynonymous_variant": 6, "synonymous_variant": 7, "5_prime_UTR_variant": 8, "3_prime_UTR_variant": 9, "intron_variant": 10, "upstream_gene_variant": 11, "downstream_gene_variant": 12, "intergenic_variant": 13, "intron_variant": 14, "upstream_gene_variant": 15, "regulatory_region_variant": 16, "ignored": 17 } def ranked(col): return max(col, key=lambda val: -1*precedence_dict[val]) with pd.get_store('classified_variant_store.h5') as store: snpeff_subset = store.get("cftr_snpeff_refseq_subset") grouped_snpeff_subset = snpeff_subset.groupby(["Gene_Name", "POS", "REF", "ALT"]) grouped_snpeff_subset = grouped_snpeff_subset.agg({"normalized_so_snpeff": ranked}) grouped_snpeff_subset = grouped_snpeff_subset.rename(columns={"normalized_so_snpeff": "normalized_so_snpeff_max"}).reset_index() grouped_snpeff_subset = pd.merge(grouped_snpeff_subset, snpeff_subset, how="left", on=["POS", "REF", "ALT", "Gene_Name"]) grouped_snpeff_subset = grouped_snpeff_subset[grouped_snpeff_subset["normalized_so_snpeff_max"] == grouped_snpeff_subset["normalized_so_snpeff"]] #kludge ties are broken by taking the first element in the group (ie randomly; this should only really effect the transcript level comparisons, ie hgvs etc) grouped_snpeff_subset = grouped_snpeff_subset.groupby(["Gene_Name", "POS", "REF", "ALT"]).first() agg_snpeff = grouped_snpeff_subset.reset_index() del agg_snpeff["normalized_so_snpeff_max"] del grouped_snpeff_subset del snpeff_subset agg_snpeff.rename(columns={"Gene_Name":"Gene"}, inplace=True) agg_snpeff[100000:100050] with pd.get_store('classified_variant_store.h5') as store: vep_subset = store.get("cftr_vep_refseq_subset") del vep_subset["Feature"] vep_subset.drop_duplicates(inplace=True) grouped_vep_subset = vep_subset.groupby(["Gene", "POS", "REF", "ALT"]) grouped_vep_subset = grouped_vep_subset.agg({"normalized_so_vep": ranked}) grouped_vep_subset = grouped_vep_subset.rename(columns={"normalized_so_vep": "normalized_so_vep_max"}).reset_index() grouped_vep_subset = pd.merge(grouped_vep_subset, vep_subset, how="left", on=["POS", "REF", "ALT", "Gene"]) grouped_vep_subset = grouped_vep_subset[grouped_vep_subset["normalized_so_vep_max"] == grouped_vep_subset["normalized_so_vep"]] grouped_vep_subset = grouped_vep_subset.groupby(["Gene", "POS", "REF", "ALT"]).first() agg_vep = grouped_vep_subset.reset_index() del grouped_vep_subset del vep_subset del agg_vep["normalized_so_vep_max"] agg_vep[80000:80023] with pd.get_store('classified_variant_store.h5') as store: annovar_subset = store.get("cftr_annovar_ensembl_subset") grouped_annovar_subset = annovar_subset.groupby(["Gene", "POS", "REF", "ALT"]) agg_annovar = grouped_annovar_subset.agg({"normalized_so_annovar": ranked}).reset_index() del annovar_subset['normalized_so_annovar'] agg_annovar = pd.merge(agg_annovar, annovar_subset, on=["Gene", "POS", "REF", "ALT"]) del grouped_annovar_subset del annovar_subset agg_annovar[2000:2050] vc_snpeff = agg_snpeff.groupby(["normalized_so_snpeff"]).size() vc_snpeff.name = "SNPeff" vc_vep = agg_vep.groupby(["normalized_so_vep"]).size() vc_vep.name = "VEP" vc_annovar = agg_annovar.groupby(["normalized_so_annovar"]).size() vc_annovar.name = "Annovar" vc_df = pd.DataFrame([vc_snpeff, vc_vep, vc_annovar]) vc_df.transpose().plot(kind="barh", fontsize=13, figsize=(16,8)) master_df = pd.merge(agg_snpeff, agg_vep, how="outer", on=["Gene", "POS", "REF", "ALT"]) master_df = pd.merge(master_df, agg_annovar, how="outer", on=["Gene", "POS", "REF", "ALT"]) master_df[100000:100001] for effect in master_df["normalized_so_snpeff"].unique(): vep_effect = master_df[master_df["normalized_so_vep"] == effect] annovar_effect = master_df[master_df["normalized_so_annovar"] == effect] snpeff_effect = master_df[master_df["normalized_so_snpeff"] == effect] fig = plt.figure(figsize=(10,10), dpi=300) fig.suptitle(effect, fontsize=14, fontweight='bold') v = venn3_unweighted([set(vep_effect.index.values), set(snpeff_effect.index.values), set(annovar_effect.index.values)], set_labels=("VEP", "SNPeff", "Annovar")) plt.plot(fontsize=24) sampletables = '
" + str(num_rows) + " rows
" HTML(sampletables) sampletables = '" + str(num_rows) + " rows
" HTML(sampletables) sampletables ='' for effect in master_df["normalized_so_snpeff"].unique(): sampletables += "" + str(num_rows) + " rows
" HTML(sampletables) sampletables = '" + str(num_rows) + " rows
" HTML(sampletables) sampletables = '" + str(num_rows) + " rows
" HTML(sampletables) sampletables = '" + str(num_rows) + " rows
" HTML(sampletables) sampletables = '" + str(num_rows) + " rows
" HTML(sampletables)