Analysis of Refseq annotations

Unfortunately, due to the fact that on VEP's web interface, the refseq transcript set is grouped with "other transcripts" makes this analysis invalid. I've included this for completeness however.

In [1]:
from IPython.display import HTML
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from matplotlib_venn import venn3, venn3_circles, venn3_unweighted
import seaborn
%pylab inline
Populating the interactive namespace from numpy and matplotlib
In [2]:
#These are defined by the way annovar defines precedence. I found empirically that stop_gain > frame_shift in annovar, hence the reverse
precedence_dict = {
"splicing_variant": 1,
"frameshift_variant": 4,
"stop_gained": 2,
"stop_lost": 3,
"inframe_variant": 5,
"nonsynonymous_variant": 6,
"synonymous_variant": 7,
"5_prime_UTR_variant": 8,
"3_prime_UTR_variant": 9,
"intron_variant": 10,
"upstream_gene_variant": 11,
"downstream_gene_variant": 12,
"intergenic_variant": 13,
"intron_variant": 14,
"upstream_gene_variant": 15,
"regulatory_region_variant": 16,
"ignored": 17
}

def ranked(col):
    return max(col, key=lambda val: -1*precedence_dict[val])
In [3]:
with pd.get_store('classified_variant_store.h5') as store:
    snpeff_subset = store.get("cftr_snpeff_refseq_subset")
grouped_snpeff_subset = snpeff_subset.groupby(["Gene_Name", "POS", "REF", "ALT"])
grouped_snpeff_subset = grouped_snpeff_subset.agg({"normalized_so_snpeff": ranked})
grouped_snpeff_subset = grouped_snpeff_subset.rename(columns={"normalized_so_snpeff": "normalized_so_snpeff_max"}).reset_index()
grouped_snpeff_subset = pd.merge(grouped_snpeff_subset, snpeff_subset, how="left", on=["POS", "REF", "ALT", "Gene_Name"])
grouped_snpeff_subset = grouped_snpeff_subset[grouped_snpeff_subset["normalized_so_snpeff_max"] == grouped_snpeff_subset["normalized_so_snpeff"]]
#kludge ties are broken by taking the first element in the group (ie randomly; this should only really effect the transcript level comparisons, ie hgvs etc)
grouped_snpeff_subset = grouped_snpeff_subset.groupby(["Gene_Name", "POS", "REF", "ALT"]).first()
agg_snpeff = grouped_snpeff_subset.reset_index()
del agg_snpeff["normalized_so_snpeff_max"]
del grouped_snpeff_subset
del snpeff_subset
agg_snpeff.rename(columns={"Gene_Name":"Gene"}, inplace=True)
agg_snpeff[100000:100050]
Out[3]:
Gene POS REF ALT ID Effect Transcript_ID hgvs_snpeff normalized_so_snpeff
100000 CFTR 117199805 A AT . INTRON NM_000492.3 c.1584+97*>+T intron_variant
100001 CFTR 117199805 A ATCA . INTRON NM_000492.3 c.1584+97*>+TCA intron_variant
100002 CFTR 117199805 A C . INTRON NM_000492.3 c.1584+96A>C intron_variant
100003 CFTR 117199805 A G . INTRON NM_000492.3 c.1584+96A>G intron_variant
100004 CFTR 117199805 A T . INTRON NM_000492.3 c.1584+96A>T intron_variant
100005 CFTR 117199805 AT A . INTRON NM_000492.3 c.1584+97*>-T intron_variant
100006 CFTR 117199805 ATA A . INTRON NM_000492.3 c.1584+97*>-TA intron_variant
100007 CFTR 117199805 ATAT A . INTRON NM_000492.3 c.1584+97*>-TAT intron_variant
100008 CFTR 117199806 T A . INTRON NM_000492.3 c.1584+97T>A intron_variant
100009 CFTR 117199806 T C . INTRON NM_000492.3 c.1584+97T>C intron_variant
100010 CFTR 117199806 T G . INTRON NM_000492.3 c.1584+97T>G intron_variant
100011 CFTR 117199806 T TA . INTRON NM_000492.3 c.1584+98*>+A intron_variant
100012 CFTR 117199806 T TAGT . INTRON NM_000492.3 c.1584+98*>+AGT intron_variant
100013 CFTR 117199806 T TC . INTRON NM_000492.3 c.1584+98*>+C intron_variant
100014 CFTR 117199806 T TG . INTRON NM_000492.3 c.1584+98*>+G intron_variant
100015 CFTR 117199806 T TGA . INTRON NM_000492.3 c.1584+98*>+GA intron_variant
100016 CFTR 117199806 T TGAC . INTRON NM_000492.3 c.1584+98*>+GAC intron_variant
100017 CFTR 117199806 T TT . INTRON NM_000492.3 c.1584+98*>+T intron_variant
100018 CFTR 117199806 T TTA . INTRON NM_000492.3 c.1584+98*>+TA intron_variant
100019 CFTR 117199806 TA T . INTRON NM_000492.3 c.1584+98*>-A intron_variant
100020 CFTR 117199806 TAT T . INTRON NM_000492.3 c.1584+98*>-AT intron_variant
100021 CFTR 117199806 TATA T . INTRON NM_000492.3 c.1584+98*>-ATA intron_variant
100022 CFTR 117199807 A AA . INTRON NM_000492.3 c.1584+99*>+A intron_variant
100023 CFTR 117199807 A AC . INTRON NM_000492.3 c.1584+99*>+C intron_variant
100024 CFTR 117199807 A ACG . INTRON NM_000492.3 c.1584+99*>+CG intron_variant
100025 CFTR 117199807 A ACTG . INTRON NM_000492.3 c.1584+99*>+CTG intron_variant
100026 CFTR 117199807 A AG . INTRON NM_000492.3 c.1584+99*>+G intron_variant
100027 CFTR 117199807 A AGTC . INTRON NM_000492.3 c.1584+99*>+GTC intron_variant
100028 CFTR 117199807 A AT . INTRON NM_000492.3 c.1584+99*>+T intron_variant
100029 CFTR 117199807 A ATA . INTRON NM_000492.3 c.1584+99*>+TA intron_variant
100030 CFTR 117199807 A C . INTRON NM_000492.3 c.1584+98A>C intron_variant
100031 CFTR 117199807 A G . INTRON NM_000492.3 c.1584+98A>G intron_variant
100032 CFTR 117199807 A T . INTRON NM_000492.3 c.1584+98A>T intron_variant
100033 CFTR 117199807 AT A . INTRON NM_000492.3 c.1584+99*>-T intron_variant
100034 CFTR 117199807 ATA A . INTRON NM_000492.3 c.1584+99*>-TA intron_variant
100035 CFTR 117199807 ATAT A . INTRON NM_000492.3 c.1584+99*>-TAT intron_variant
100036 CFTR 117199808 T A . INTRON NM_000492.3 c.1584+99T>A intron_variant
100037 CFTR 117199808 T C . INTRON NM_000492.3 c.1584+99T>C intron_variant
100038 CFTR 117199808 T G . INTRON NM_000492.3 c.1584+99T>G intron_variant
100039 CFTR 117199808 T TA . INTRON NM_000492.3 c.1584+100*>+A intron_variant
100040 CFTR 117199808 T TC . INTRON NM_000492.3 c.1584+100*>+C intron_variant
100041 CFTR 117199808 T TG . INTRON NM_000492.3 c.1584+100*>+G intron_variant
100042 CFTR 117199808 T TGA . INTRON NM_000492.3 c.1584+100*>+GA intron_variant
100043 CFTR 117199808 T TGC . INTRON NM_000492.3 c.1584+100*>+GC intron_variant
100044 CFTR 117199808 T TGCT . INTRON NM_000492.3 c.1584+100*>+GCT intron_variant
100045 CFTR 117199808 T TT . INTRON NM_000492.3 c.1584+100*>+T intron_variant
100046 CFTR 117199808 T TTAC . INTRON NM_000492.3 c.1584+100*>+TAC intron_variant
100047 CFTR 117199808 TA T . INTRON NM_000492.3 c.1584+100*>-A intron_variant
100048 CFTR 117199808 TAT T . INTRON NM_000492.3 c.1584+100*>-AT intron_variant
100049 CFTR 117199808 TATT T . INTRON NM_000492.3 c.1584+100*>-ATT intron_variant

50 rows × 9 columns

In [4]:
with pd.get_store('classified_variant_store.h5') as store:
    vep_subset = store.get("cftr_vep_refseq_subset")
del vep_subset["Feature"]
vep_subset.drop_duplicates(inplace=True)
grouped_vep_subset = vep_subset.groupby(["Gene", "POS", "REF", "ALT"])
grouped_vep_subset = grouped_vep_subset.agg({"normalized_so_vep": ranked})
grouped_vep_subset = grouped_vep_subset.rename(columns={"normalized_so_vep": "normalized_so_vep_max"}).reset_index()
grouped_vep_subset = pd.merge(grouped_vep_subset, vep_subset, how="left", on=["POS", "REF", "ALT", "Gene"])
grouped_vep_subset = grouped_vep_subset[grouped_vep_subset["normalized_so_vep_max"] == grouped_vep_subset["normalized_so_vep"]]
grouped_vep_subset = grouped_vep_subset.groupby(["Gene", "POS", "REF", "ALT"]).first()
agg_vep = grouped_vep_subset.reset_index()
del grouped_vep_subset
del vep_subset
del agg_vep["normalized_so_vep_max"]
agg_vep[80000:80023]
Out[4]:
Gene POS REF ALT ID Consequence hgvs_vep normalized_so_vep
80000 1080 117182109 A C . missense_variant NP_000483.3:p.Asn386HisNM_000492.3:c.1156A>C nonsynonymous_variant
80001 1080 117182109 A G . missense_variant NP_000483.3:p.Asn386AspNM_000492.3:c.1156A>G nonsynonymous_variant
80002 1080 117182109 A T . missense_variant NP_000483.3:p.Asn386TyrNM_000492.3:c.1156A>T nonsynonymous_variant
80003 1080 117182109 AA A . frameshift_variant NP_000483.3:p.Asn386ThrfsTer2NM_000492.3:c.115... frameshift_variant
80004 1080 117182109 AAC A . frameshift_variant NP_000483.3:p.Asn386IlefsTer24NM_000492.3:c.11... frameshift_variant
80005 1080 117182109 AACT A . inframe_deletion NP_000483.3:p.Asn386_Leu387delinsIleNM_000492.... inframe_variant
80006 1080 117182110 A AA . frameshift_variant NP_000483.3:p.Asn386LysfsTer25NM_000492.3:c.11... frameshift_variant
80007 1080 117182110 A AAC . frameshift_variant NP_000483.3:p.Asn386LysfsTer3NM_000492.3:c.115... frameshift_variant
80008 1080 117182110 A AAG . frameshift_variant NP_000483.3:p.Asn386LysfsTer3NM_000492.3:c.115... frameshift_variant
80009 1080 117182110 A AATC . inframe_insertion NP_000483.3:p.Asn386delinsLysSerNM_000492.3:c.... inframe_variant
80010 1080 117182110 A AC . frameshift_variant NP_000483.3:p.Thr388AsnfsTer23NM_000492.3:c.11... frameshift_variant
80011 1080 117182110 A ACAT . inframe_insertion NP_000483.3:p.Asn386_Leu387insIleNM_000492.3:c... inframe_variant
80012 1080 117182110 A AG . frameshift_variant NP_000483.3:p.Asn386LysfsTer25NM_000492.3:c.11... frameshift_variant
80013 1080 117182110 A AT . frameshift_variant NP_000483.3:p.Thr388AsnfsTer23NM_000492.3:c.11... frameshift_variant
80014 1080 117182110 A C . missense_variant NP_000483.3:p.Asn386ThrNM_000492.3:c.1157A>C nonsynonymous_variant
80015 1080 117182110 A G . missense_variant NP_000483.3:p.Asn386SerNM_000492.3:c.1157A>G nonsynonymous_variant
80016 1080 117182110 A T . missense_variant NP_000483.3:p.Asn386IleNM_000492.3:c.1157A>T nonsynonymous_variant
80017 1080 117182110 AC A . frameshift_variant NP_000483.3:p.Leu387TerNM_000492.3:c.1158delC frameshift_variant
80018 1080 117182110 ACT A . frameshift_variant NP_000483.3:p.Leu387AsnfsTer23NM_000492.3:c.11... frameshift_variant
80019 1080 117182110 ACTT A . inframe_deletion NP_000483.3:p.Asn386_Leu387delinsLysNM_000492.... inframe_variant
80020 1080 117182111 C A . missense_variant NP_000483.3:p.Asn386LysNM_000492.3:c.1158C>A nonsynonymous_variant
80021 1080 117182111 C CA . frameshift_variant NP_000483.3:p.Leu387IlefsTer24NM_000492.3:c.11... frameshift_variant
80022 1080 117182111 C CAC . frameshift_variant NP_000483.3:p.Leu387ThrfsTer2NM_000492.3:c.115... frameshift_variant

23 rows × 8 columns

In [5]:
with pd.get_store('classified_variant_store.h5') as store:
    annovar_subset = store.get("cftr_annovar_ensembl_subset")
grouped_annovar_subset = annovar_subset.groupby(["Gene", "POS", "REF", "ALT"])
agg_annovar = grouped_annovar_subset.agg({"normalized_so_annovar": ranked}).reset_index()
del annovar_subset['normalized_so_annovar']
agg_annovar = pd.merge(agg_annovar, annovar_subset, on=["Gene", "POS", "REF", "ALT"])
del grouped_annovar_subset
del annovar_subset
agg_annovar[2000:2050]
Out[5]:
Gene POS REF ALT normalized_so_annovar combined_effect hgvs
2000 ENSG00000001626 117119399 GGT G splicing_variant splicing NaN
2001 ENSG00000001626 117119399 GGTA G splicing_variant splicing NaN
2002 ENSG00000001626 117119400 G A splicing_variant splicing NaN
2003 ENSG00000001626 117119400 G C splicing_variant splicing NaN
2004 ENSG00000001626 117119400 G GA splicing_variant splicing NaN
2005 ENSG00000001626 117119400 G GAGC splicing_variant splicing NaN
2006 ENSG00000001626 117119400 G GC splicing_variant splicing NaN
2007 ENSG00000001626 117119400 G GG splicing_variant splicing NaN
2008 ENSG00000001626 117119400 G GGCA splicing_variant splicing NaN
2009 ENSG00000001626 117119400 G GT splicing_variant splicing NaN
2010 ENSG00000001626 117119400 G GTA splicing_variant splicing NaN
2011 ENSG00000001626 117119400 G GTG splicing_variant splicing NaN
2012 ENSG00000001626 117119400 G T splicing_variant splicing NaN
2013 ENSG00000001626 117119400 GT G splicing_variant splicing NaN
2014 ENSG00000001626 117119400 GTA G splicing_variant splicing NaN
2015 ENSG00000001626 117119400 GTAA G splicing_variant splicing NaN
2016 ENSG00000001626 117119401 T A splicing_variant splicing NaN
2017 ENSG00000001626 117119401 T C splicing_variant splicing NaN
2018 ENSG00000001626 117119401 T G splicing_variant splicing NaN
2019 ENSG00000001626 117119401 T TA splicing_variant splicing NaN
2020 ENSG00000001626 117119401 T TAT splicing_variant splicing NaN
2021 ENSG00000001626 117119401 T TC splicing_variant splicing NaN
2022 ENSG00000001626 117119401 T TCGT splicing_variant splicing NaN
2023 ENSG00000001626 117119401 T TG splicing_variant splicing NaN
2024 ENSG00000001626 117119401 T TGCT splicing_variant splicing NaN
2025 ENSG00000001626 117119401 T TGT splicing_variant splicing NaN
2026 ENSG00000001626 117119401 T TT splicing_variant splicing NaN
2027 ENSG00000001626 117119401 TA T intron_variant intronic NaN
2028 ENSG00000001626 117119401 TAA T intron_variant intronic NaN
2029 ENSG00000001626 117119401 TAAA T intron_variant intronic NaN
2030 ENSG00000001626 117119402 A AA intron_variant intronic NaN
2031 ENSG00000001626 117119402 A AAG intron_variant intronic NaN
2032 ENSG00000001626 117119402 A AAGT intron_variant intronic NaN
2033 ENSG00000001626 117119402 A AC intron_variant intronic NaN
2034 ENSG00000001626 117119402 A ACAG intron_variant intronic NaN
2035 ENSG00000001626 117119402 A AG intron_variant intronic NaN
2036 ENSG00000001626 117119402 A AGT intron_variant intronic NaN
2037 ENSG00000001626 117119402 A AT intron_variant intronic NaN
2038 ENSG00000001626 117119402 A C intron_variant intronic NaN
2039 ENSG00000001626 117119402 A G intron_variant intronic NaN
2040 ENSG00000001626 117119402 A T intron_variant intronic NaN
2041 ENSG00000001626 117119402 AA A intron_variant intronic NaN
2042 ENSG00000001626 117119402 AAA A intron_variant intronic NaN
2043 ENSG00000001626 117119402 AAAT A intron_variant intronic NaN
2044 ENSG00000001626 117119403 A AA intron_variant intronic NaN
2045 ENSG00000001626 117119403 A AC intron_variant intronic NaN
2046 ENSG00000001626 117119403 A ACAG intron_variant intronic NaN
2047 ENSG00000001626 117119403 A ACT intron_variant intronic NaN
2048 ENSG00000001626 117119403 A ACTG intron_variant intronic NaN
2049 ENSG00000001626 117119403 A AG intron_variant intronic NaN

50 rows × 7 columns

In [6]:
vc_snpeff = agg_snpeff.groupby(["normalized_so_snpeff"]).size()
vc_snpeff.name = "SNPeff"
vc_vep = agg_vep.groupby(["normalized_so_vep"]).size()
vc_vep.name = "VEP"
vc_annovar = agg_annovar.groupby(["normalized_so_annovar"]).size()
vc_annovar.name = "Annovar"
vc_df = pd.DataFrame([vc_snpeff, vc_vep, vc_annovar])
vc_df.transpose().plot(kind="barh", fontsize=13, figsize=(16,8))
Out[6]:
<matplotlib.axes.AxesSubplot at 0x13f517990>

It's worth noting that the refseq + other transcripts set was used in VEP's web app. This explain the huge number of extra annotations in

In [7]:
master_df = pd.merge(agg_snpeff, agg_vep, how="outer", on=["Gene", "POS", "REF", "ALT"])
master_df = pd.merge(master_df, agg_annovar, how="outer", on=["Gene", "POS", "REF", "ALT"])
In [8]:
master_df[100000:100001]
Out[8]:
Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
100000 CFTR 117199805 A AT . INTRON NM_000492.3 c.1584+97*>+T intron_variant NaN NaN NaN NaN NaN NaN NaN

1 rows × 16 columns

In [9]:
for effect in master_df["normalized_so_snpeff"].unique():
    vep_effect = master_df[master_df["normalized_so_vep"] == effect]
    annovar_effect = master_df[master_df["normalized_so_annovar"] == effect]
    snpeff_effect = master_df[master_df["normalized_so_snpeff"] == effect]
    fig = plt.figure(figsize=(10,10), dpi=300)
    fig.suptitle(effect, fontsize=14, fontweight='bold')
    v = venn3_unweighted([set(vep_effect.index.values), set(snpeff_effect.index.values), set(annovar_effect.index.values)], set_labels=("VEP", "SNPeff", "Annovar"))
    plt.plot(fontsize=24)
In [10]:
sampletables = '<h1>Other algo\'s agree, but...</h1>'
for effect in master_df["normalized_so_snpeff"].unique():
    sampletables += "<h2> Annovar doesn't match for <em>" + str(effect) + "</em></h2>"
    query = master_df.loc[(master_df["normalized_so_annovar"]!=effect) & 
                                  (master_df["normalized_so_snpeff"]==effect) & 
                                  (master_df["normalized_so_vep"]==effect)]
    num_rows = query.count()[0]
    if num_rows > 0:
        sampletables += query.head(5).to_html()
    sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)
Out[10]:

Other algo's agree, but...

Annovar doesn't match for intergenic_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
0 117105737 C A . INTERGENIC intergenic_variant . intergenic_variant intergenic_variant NaN NaN NaN
1 117105737 C CA . INTERGENIC intergenic_variant . intergenic_variant intergenic_variant NaN NaN NaN
2 117105737 C CAG . INTERGENIC intergenic_variant . intergenic_variant intergenic_variant NaN NaN NaN
3 117105737 C CC . INTERGENIC intergenic_variant . intergenic_variant intergenic_variant NaN NaN NaN
4 117105737 C CCTG . INTERGENIC intergenic_variant . intergenic_variant intergenic_variant NaN NaN NaN

9198 rows

Annovar doesn't match for upstream_gene_variant

0 rows

Annovar doesn't match for 5_prime_UTR_variant

0 rows

Annovar doesn't match for inframe_variant

0 rows

Annovar doesn't match for frameshift_variant

0 rows

Annovar doesn't match for nonsynonymous_variant

0 rows

Annovar doesn't match for stop_gained

0 rows

Annovar doesn't match for synonymous_variant

0 rows

Annovar doesn't match for splicing_variant

0 rows

Annovar doesn't match for intron_variant

0 rows

Annovar doesn't match for stop_lost

0 rows

Annovar doesn't match for 3_prime_UTR_variant

0 rows

Annovar doesn't match for downstream_gene_variant

0 rows

Annovar doesn't match for nan

0 rows

In [11]:
sampletables = '<h1>At least 1 column doesn\'t match</h1>'
for effect in master_df["normalized_so_snpeff"].unique():
    sampletables += "<h2> Annovar doesn't match for <em>" + str(effect) + "</em></h2>"
    query = master_df.loc[(master_df["normalized_so_annovar"]!=effect) & 
                                  ((master_df["normalized_so_snpeff"]==effect) | (master_df["normalized_so_vep"]==effect))]
    num_rows = query.count()[0]
    if num_rows > 0:
        sampletables += query.head(5).to_html()
    sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)
Out[11]:

At least 1 column doesn't match

Annovar doesn't match for intergenic_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
0 117105737 C A . INTERGENIC intergenic_variant . intergenic_variant intergenic_variant NaN NaN NaN
1 117105737 C CA . INTERGENIC intergenic_variant . intergenic_variant intergenic_variant NaN NaN NaN
2 117105737 C CAG . INTERGENIC intergenic_variant . intergenic_variant intergenic_variant NaN NaN NaN
3 117105737 C CC . INTERGENIC intergenic_variant . intergenic_variant intergenic_variant NaN NaN NaN
4 117105737 C CCTG . INTERGENIC intergenic_variant . intergenic_variant intergenic_variant NaN NaN NaN

32500 rows

Annovar doesn't match for upstream_gene_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
32500 CFTR 117115644 G A . UPSTREAM NM_000492.3 upstream_gene_variant NaN NaN NaN NaN NaN NaN NaN
32501 CFTR 117115644 G C . UPSTREAM NM_000492.3 upstream_gene_variant NaN NaN NaN NaN NaN NaN NaN
32502 CFTR 117115644 G GA . UPSTREAM NM_000492.3 upstream_gene_variant NaN NaN NaN NaN NaN NaN NaN
32503 CFTR 117115644 G GAC . UPSTREAM NM_000492.3 upstream_gene_variant NaN NaN NaN NaN NaN NaN NaN
32504 CFTR 117115644 G GACG . UPSTREAM NM_000492.3 upstream_gene_variant NaN NaN NaN NaN NaN NaN NaN

83012 rows

Annovar doesn't match for 5_prime_UTR_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
46611 CFTR 117120014 GGGA G . UTR_5_PRIME NM_000492.3 5_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN
46624 CFTR 117120015 GGA G . UTR_5_PRIME NM_000492.3 5_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN
46625 CFTR 117120015 GGAA G . UTR_5_PRIME NM_000492.3 5_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN
46628 CFTR 117120016 G GA . UTR_5_PRIME NM_000492.3 5_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN
46629 CFTR 117120016 G GAGC . UTR_5_PRIME NM_000492.3 5_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN

9940 rows

Annovar doesn't match for inframe_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
48459 CFTR 117120146 ACCA A . CODON_DELETION NM_000492.3 p.Met1X/c.1*>-CCA inframe_variant NaN NaN NaN NaN NaN NaN NaN
48473 CFTR 117120147 CCAT C . CODON_DELETION NM_000492.3 p.Met1X/c.1*>-CAT inframe_variant NaN NaN NaN NaN NaN NaN NaN
48476 CFTR 117120148 C CACT . CODON_INSERTION NM_000492.3 p.Met1X/c.1*>+ACT inframe_variant NaN NaN NaN NaN NaN NaN NaN
48482 CFTR 117120148 C CTGC . CODON_INSERTION NM_000492.3 p.Met1X/c.1*>+TGC inframe_variant NaN NaN NaN NaN NaN NaN NaN
48487 CFTR 117120148 CATG C . CODON_DELETION NM_000492.3 p.Met1X/c.1*>-ATG inframe_variant NaN NaN NaN NaN NaN NaN NaN

46483 rows

Annovar doesn't match for frameshift_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
48472 CFTR 117120147 CCA C . FRAME_SHIFT NM_000492.3 p.X1X/c.1*>-CA frameshift_variant NaN NaN NaN NaN NaN NaN NaN
48475 CFTR 117120148 C CA . FRAME_SHIFT NM_000492.3 p.Met1X/c.1*>+A frameshift_variant NaN NaN NaN NaN NaN NaN NaN
48477 CFTR 117120148 C CC . FRAME_SHIFT NM_000492.3 p.Met1X/c.1*>+C frameshift_variant NaN NaN NaN NaN NaN NaN NaN
48478 CFTR 117120148 C CG . FRAME_SHIFT NM_000492.3 p.Met1X/c.1*>+G frameshift_variant NaN NaN NaN NaN NaN NaN NaN
48479 CFTR 117120148 C CGC . FRAME_SHIFT NM_000492.3 p.Met1X/c.1*>+GC frameshift_variant NaN NaN NaN NaN NaN NaN NaN

132305 rows

Annovar doesn't match for nonsynonymous_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
48496 CFTR 117120149 A C . NON_SYNONYMOUS_START NM_000492.3 p.Met1?/c.1A>C nonsynonymous_variant NaN NaN NaN NaN NaN NaN NaN
48497 CFTR 117120149 A G . START_LOST NM_000492.3 p.Met1?/c.1A>G nonsynonymous_variant NaN NaN NaN NaN NaN NaN NaN
48498 CFTR 117120149 A T . NON_SYNONYMOUS_START NM_000492.3 p.Met1?/c.1A>T nonsynonymous_variant NaN NaN NaN NaN NaN NaN NaN
48502 CFTR 117120150 T A . START_LOST NM_000492.3 p.Met1?/c.2T>A nonsynonymous_variant NaN NaN NaN NaN NaN NaN NaN
48503 CFTR 117120150 T C . START_LOST NM_000492.3 p.Met1?/c.2T>C nonsynonymous_variant NaN NaN NaN NaN NaN NaN NaN

36437 rows

Annovar doesn't match for stop_gained

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
48525 CFTR 117120151 G GTGA . STOP_GAINED NM_000492.3 p.Gln2*/c.4*>+TGA stop_gained NaN NaN NaN NaN NaN NaN NaN
48540 CFTR 117120152 C T . STOP_GAINED NM_000492.3 p.Gln2*/c.4C>T stop_gained NaN NaN NaN NaN NaN NaN NaN
48564 CFTR 117120154 G GT . STOP_GAINED NM_000492.3 p.Arg3*/c.7*>+T stop_gained NaN NaN NaN NaN NaN NaN NaN
48565 CFTR 117120154 G GTA . STOP_GAINED NM_000492.3 p.Arg3*/c.7*>+TA stop_gained NaN NaN NaN NaN NaN NaN NaN
48609 CFTR 117120157 G GTAG . STOP_GAINED NM_000492.3 p.Ser4*/c.10*>+TAG stop_gained NaN NaN NaN NaN NaN NaN NaN

6778 rows

Annovar doesn't match for synonymous_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
48558 CFTR 117120154 G A . SYNONYMOUS_CODING NM_000492.3 p.Gln2Gln/c.6G>A synonymous_variant NaN NaN NaN NaN NaN NaN NaN
48580 CFTR 117120155 A C . SYNONYMOUS_CODING NM_000492.3 p.Arg3Arg/c.7A>C synonymous_variant NaN NaN NaN NaN NaN NaN NaN
48600 CFTR 117120157 G A . SYNONYMOUS_CODING NM_000492.3 p.Arg3Arg/c.9G>A synonymous_variant NaN NaN NaN NaN NaN NaN NaN
48642 CFTR 117120160 G A . SYNONYMOUS_CODING NM_000492.3 p.Ser4Ser/c.12G>A synonymous_variant NaN NaN NaN NaN NaN NaN NaN
48643 CFTR 117120160 G C . SYNONYMOUS_CODING NM_000492.3 p.Ser4Ser/c.12G>C synonymous_variant NaN NaN NaN NaN NaN NaN NaN

11010 rows

Annovar doesn't match for splicing_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
49201 CFTR 117120199 CAGG C . SPLICE_SITE_DONOR NM_000492.3 splicing_variant NaN NaN NaN NaN NaN NaN NaN
49214 CFTR 117120200 AGG A . SPLICE_SITE_DONOR NM_000492.3 splicing_variant NaN NaN NaN NaN NaN NaN NaN
49215 CFTR 117120200 AGGT A . SPLICE_SITE_DONOR NM_000492.3 splicing_variant NaN NaN NaN NaN NaN NaN NaN
49218 CFTR 117120201 G GA . SPLICE_SITE_DONOR NM_000492.3 splicing_variant NaN NaN NaN NaN NaN NaN NaN
49219 CFTR 117120201 G GACT . SPLICE_SITE_DONOR NM_000492.3 splicing_variant NaN NaN NaN NaN NaN NaN NaN

5016 rows

Annovar doesn't match for intron_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
49247 CFTR 117120203 T TA . INTRON NM_000492.3 c.53+3*>+A intron_variant NaN NaN NaN NaN NaN NaN NaN
49248 CFTR 117120203 T TAC . INTRON NM_000492.3 c.53+3*>+AC intron_variant NaN NaN NaN NaN NaN NaN NaN
49249 CFTR 117120203 T TAGC . INTRON NM_000492.3 c.53+3*>+AGC intron_variant NaN NaN NaN NaN NaN NaN NaN
49250 CFTR 117120203 T TC . INTRON NM_000492.3 c.53+3*>+C intron_variant NaN NaN NaN NaN NaN NaN NaN
49251 CFTR 117120203 T TG . INTRON NM_000492.3 c.53+3*>+G intron_variant NaN NaN NaN NaN NaN NaN NaN

321983 rows

Annovar doesn't match for stop_lost

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
187885 CFTR 117307157 CTTT C . STOP_LOST NM_000492.3 p.X1480Glnext*?/c.4439*>-TTT stop_lost NaN NaN NaN NaN NaN NaN NaN
187899 CFTR 117307158 TTTA T . STOP_LOST NM_000492.3 p.X1480Leuext*?/c.4440*>-TTA stop_lost NaN NaN NaN NaN NaN NaN NaN
187903 CFTR 117307159 T TA . STOP_LOST NM_000492.3 p.*1481Xext*?/c.4441*>+A stop_lost NaN NaN NaN NaN NaN NaN NaN
187905 CFTR 117307159 T TC . STOP_LOST NM_000492.3 p.*1481Xext*?/c.4441*>+C stop_lost NaN NaN NaN NaN NaN NaN NaN
187907 CFTR 117307159 T TG . STOP_LOST NM_000492.3 p.*1481Xext*?/c.4441*>+G stop_lost NaN NaN NaN NaN NaN NaN NaN

160 rows

Annovar doesn't match for 3_prime_UTR_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
187944 CFTR 117307162 G GA . UTR_3_PRIME NM_000492.3 3_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN
187945 CFTR 117307162 G GC . UTR_3_PRIME NM_000492.3 3_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN
187946 CFTR 117307162 G GCA . UTR_3_PRIME NM_000492.3 3_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN
187947 CFTR 117307162 G GCAG . UTR_3_PRIME NM_000492.3 3_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN
187948 CFTR 117307162 G GG . UTR_3_PRIME NM_000492.3 3_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN

7970 rows

Annovar doesn't match for downstream_gene_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
188824 CTTNBP2 117349970 C A . DOWNSTREAM NM_033427.2 downstream_gene_variant NaN NaN NaN NaN NaN NaN NaN
188825 CTTNBP2 117349970 C CA . DOWNSTREAM NM_033427.2 downstream_gene_variant NaN NaN NaN NaN NaN NaN NaN
188826 CTTNBP2 117349970 C CAG . DOWNSTREAM NM_033427.2 downstream_gene_variant NaN NaN NaN NaN NaN NaN NaN
188827 CTTNBP2 117349970 C CAT . DOWNSTREAM NM_033427.2 downstream_gene_variant NaN NaN NaN NaN NaN NaN NaN
188828 CTTNBP2 117349970 C CC . DOWNSTREAM NM_033427.2 downstream_gene_variant NaN NaN NaN NaN NaN NaN NaN

75432 rows

Annovar doesn't match for nan

0 rows

In [12]:
sampletables =''
for effect in master_df["normalized_so_snpeff"].unique():
    sampletables += "<h2> Snpeff doesn't match for <em>" + str(effect) + "</em></h2>"
    query = master_df.loc[(master_df["normalized_so_annovar"]==effect) & 
                                  (master_df["normalized_so_snpeff"]!=effect) & 
                                  (master_df["normalized_so_vep"]==effect)]
    num_rows = query.count()[0]
    if num_rows > 0:
        sampletables += query.head(5).to_html()
    sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)
Out[12]:

Snpeff doesn't match for intergenic_variant

0 rows

Snpeff doesn't match for upstream_gene_variant

0 rows

Snpeff doesn't match for 5_prime_UTR_variant

0 rows

Snpeff doesn't match for inframe_variant

0 rows

Snpeff doesn't match for frameshift_variant

0 rows

Snpeff doesn't match for nonsynonymous_variant

0 rows

Snpeff doesn't match for stop_gained

0 rows

Snpeff doesn't match for synonymous_variant

0 rows

Snpeff doesn't match for splicing_variant

0 rows

Snpeff doesn't match for intron_variant

0 rows

Snpeff doesn't match for stop_lost

0 rows

Snpeff doesn't match for 3_prime_UTR_variant

0 rows

Snpeff doesn't match for downstream_gene_variant

0 rows

Snpeff doesn't match for nan

0 rows

In [13]:
sampletables = '<h1>Other algo\'s agree, but...</h1>'
for effect in master_df["normalized_so_snpeff"].unique():
    sampletables += "<h2> Snpeff doesn't match for <em>" + str(effect) + "</em></h2>"
    query = master_df.loc[(master_df["normalized_so_annovar"]==effect) & 
                                  (master_df["normalized_so_snpeff"]!=effect) & 
                                  (master_df["normalized_so_vep"]==effect)]
    num_rows = query.count()[0]
    if num_rows > 0:
        sampletables += query.head(5).to_html()
    sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)
Out[13]:

Other algo's agree, but...

Snpeff doesn't match for intergenic_variant

0 rows

Snpeff doesn't match for upstream_gene_variant

0 rows

Snpeff doesn't match for 5_prime_UTR_variant

0 rows

Snpeff doesn't match for inframe_variant

0 rows

Snpeff doesn't match for frameshift_variant

0 rows

Snpeff doesn't match for nonsynonymous_variant

0 rows

Snpeff doesn't match for stop_gained

0 rows

Snpeff doesn't match for synonymous_variant

0 rows

Snpeff doesn't match for splicing_variant

0 rows

Snpeff doesn't match for intron_variant

0 rows

Snpeff doesn't match for stop_lost

0 rows

Snpeff doesn't match for 3_prime_UTR_variant

0 rows

Snpeff doesn't match for downstream_gene_variant

0 rows

Snpeff doesn't match for nan

0 rows

In [14]:
sampletables = '<h1>At least 1 column doesn\'t match</h1>'
for effect in master_df["normalized_so_snpeff"].unique():
    sampletables += "<h2> Snpeff doesn't match for <em>" + str(effect) + "</em></h2>"
    query = num_rows = master_df.loc[(master_df["normalized_so_snpeff"]!=effect) & 
                                     ((master_df["normalized_so_annovar"]==effect) | (master_df["normalized_so_vep"]==effect))]
    num_rows = query.count()[0]
    if num_rows > 0:
        sampletables += query.head(5).to_html()
    sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)
Out[14]:

At least 1 column doesn't match

Snpeff doesn't match for intergenic_variant

0 rows

Snpeff doesn't match for upstream_gene_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
224357 1080 117115644 G A NaN NaN NaN NaN NaN . upstream_gene_variant upstream_gene_variant NaN NaN NaN
224358 1080 117115644 G C NaN NaN NaN NaN NaN . upstream_gene_variant upstream_gene_variant NaN NaN NaN
224359 1080 117115644 G GA NaN NaN NaN NaN NaN . upstream_gene_variant upstream_gene_variant NaN NaN NaN
224360 1080 117115644 G GAC NaN NaN NaN NaN NaN . upstream_gene_variant upstream_gene_variant NaN NaN NaN
224361 1080 117115644 G GACG NaN NaN NaN NaN NaN . upstream_gene_variant upstream_gene_variant NaN NaN NaN

70294 rows

Snpeff doesn't match for 5_prime_UTR_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
238468 1080 117120014 GGGA G NaN NaN NaN NaN NaN . 5_prime_UTR_variant 5_prime_UTR_variant NaN NaN NaN
238481 1080 117120015 GGA G NaN NaN NaN NaN NaN . 5_prime_UTR_variant 5_prime_UTR_variant NaN NaN NaN
238482 1080 117120015 GGAA G NaN NaN NaN NaN NaN . 5_prime_UTR_variant 5_prime_UTR_variant NaN NaN NaN
238494 1080 117120016 GA G NaN NaN NaN NaN NaN . 5_prime_UTR_variant NM_000492.3:c.-132delA 5_prime_UTR_variant NaN NaN NaN
238495 1080 117120016 GAA G NaN NaN NaN NaN NaN . 5_prime_UTR_variant NM_000492.3:c.-132_-131delAA 5_prime_UTR_variant NaN NaN NaN

13796 rows

Snpeff doesn't match for inframe_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
240344 1080 117120148 CATG C NaN NaN NaN NaN NaN . inframe_deletion NP_000483.3:p.Met1?NM_000492.3:c.1_3delATG inframe_variant NaN NaN NaN
240349 1080 117120149 A AGAC NaN NaN NaN NaN NaN . inframe_insertion NP_000483.3:p.Met1?NM_000492.3:c.1_2insGAC inframe_variant NaN NaN NaN
240350 1080 117120149 A AGAT NaN NaN NaN NaN NaN . inframe_insertion NP_000483.3:p.Met1?NM_000492.3:c.1_2insGAT inframe_variant NaN NaN NaN
240358 1080 117120149 ATGC A NaN NaN NaN NaN NaN . inframe_deletion NP_000483.3:p.MetGln1_?2NM_000492.3:c.2_4delTGC inframe_variant NaN NaN NaN
240366 1080 117120150 T TGCA NaN NaN NaN NaN NaN . inframe_insertion NP_000483.3:p.Met1_Gln2insGlnNM_000492.3:c.2_3... inframe_variant NaN NaN NaN

46597 rows

Snpeff doesn't match for frameshift_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
240342 1080 117120148 CA C NaN NaN NaN NaN NaN . frameshift_variant NP_000483.3:p.Met1CysfsTer?NM_000492.3:c.1delA frameshift_variant NaN NaN NaN
240343 1080 117120148 CAT C NaN NaN NaN NaN NaN . frameshift_variant NP_000483.3:p.Met1AlafsTer?NM_000492.3:c.1_2delAT frameshift_variant NaN NaN NaN
240345 1080 117120149 A AA NaN NaN NaN NaN NaN . frameshift_variant NP_000483.3:p.Met1AsnfsTer?NM_000492.3:c.1dupA frameshift_variant NaN NaN NaN
240346 1080 117120149 A AC NaN NaN NaN NaN NaN . frameshift_variant NP_000483.3:p.Met1ThrfsTer?NM_000492.3:c.1_2insC frameshift_variant NaN NaN NaN
240347 1080 117120149 A AG NaN NaN NaN NaN NaN . frameshift_variant NP_000483.3:p.Met1SerfsTer?NM_000492.3:c.1_2insG frameshift_variant NaN NaN NaN

132883 rows

Snpeff doesn't match for nonsynonymous_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
240353 1080 117120149 A C NaN NaN NaN NaN NaN . initiator_codon_variant NP_000483.3:p.Met1?NM_000492.3:c.1A>C nonsynonymous_variant NaN NaN NaN
240354 1080 117120149 A G NaN NaN NaN NaN NaN . initiator_codon_variant NP_000483.3:p.Met1?NM_000492.3:c.1A>G nonsynonymous_variant NaN NaN NaN
240355 1080 117120149 A T NaN NaN NaN NaN NaN . initiator_codon_variant NP_000483.3:p.Met1?NM_000492.3:c.1A>T nonsynonymous_variant NaN NaN NaN
240359 1080 117120150 T A NaN NaN NaN NaN NaN . initiator_codon_variant NP_000483.3:p.Met1?NM_000492.3:c.2T>A nonsynonymous_variant NaN NaN NaN
240360 1080 117120150 T C NaN NaN NaN NaN NaN . initiator_codon_variant NP_000483.3:p.Met1?NM_000492.3:c.2T>C nonsynonymous_variant NaN NaN NaN

36437 rows

Snpeff doesn't match for stop_gained

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
240382 1080 117120151 G GTGA NaN NaN NaN NaN NaN . stop_gained NP_000483.3:p.Met1_Gln2insTerNM_000492.3:c.3_4... stop_gained NaN NaN NaN
240397 1080 117120152 C T NaN NaN NaN NaN NaN . stop_gained NP_000483.3:p.Gln2TerNM_000492.3:c.4C>T stop_gained NaN NaN NaN
240466 1080 117120157 G GTAG NaN NaN NaN NaN NaN . stop_gained NP_000483.3:p.Arg3_Ser4insTerNM_000492.3:c.9_1... stop_gained NaN NaN NaN
240475 1080 117120158 T TAGT NaN NaN NaN NaN NaN . stop_gained NP_000483.3:p.Arg3_Ser4insTerNM_000492.3:c.10_... stop_gained NaN NaN NaN
240485 1080 117120159 C A NaN NaN NaN NaN NaN . stop_gained NP_000483.3:p.Ser4TerNM_000492.3:c.11C>A stop_gained NaN NaN NaN

6252 rows

Snpeff doesn't match for synonymous_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
240415 1080 117120154 G A NaN NaN NaN NaN NaN . synonymous_variant NM_000492.3:c.6G>A(p.%3D)NM_000492.3:c.6G>A synonymous_variant NaN NaN NaN
240437 1080 117120155 A C NaN NaN NaN NaN NaN . synonymous_variant NM_000492.3:c.7A>C(p.%3D)NM_000492.3:c.7A>C synonymous_variant NaN NaN NaN
240457 1080 117120157 G A NaN NaN NaN NaN NaN . synonymous_variant NM_000492.3:c.9G>A(p.%3D)NM_000492.3:c.9G>A synonymous_variant NaN NaN NaN
240499 1080 117120160 G A NaN NaN NaN NaN NaN . synonymous_variant NM_000492.3:c.12G>A(p.%3D)NM_000492.3:c.12G>A synonymous_variant NaN NaN NaN
240500 1080 117120160 G C NaN NaN NaN NaN NaN . synonymous_variant NM_000492.3:c.12G>C(p.%3D)NM_000492.3:c.12G>C synonymous_variant NaN NaN NaN

11010 rows

Snpeff doesn't match for splicing_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
241058 1080 117120199 CAGG C NaN NaN NaN NaN NaN . splice_donor_variant NM_000492.3:c.52_53+1delAGG splicing_variant NaN NaN NaN
241071 1080 117120200 AGG A NaN NaN NaN NaN NaN . splice_donor_variant NM_000492.3:c.53_53+1delGG splicing_variant NaN NaN NaN
241072 1080 117120200 AGGT A NaN NaN NaN NaN NaN . splice_donor_variant NM_000492.3:c.53_53+2delGGT splicing_variant NaN NaN NaN
241084 1080 117120201 GG G NaN NaN NaN NaN NaN . splice_donor_variant NM_000492.3:c.53+1delG splicing_variant NaN NaN NaN
241085 1080 117120201 GGT G NaN NaN NaN NaN NaN . splice_donor_variant NM_000492.3:c.53+1_53+2delGT splicing_variant NaN NaN NaN

5044 rows

Snpeff doesn't match for intron_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
241104 1080 117120203 T TA NaN NaN NaN NaN NaN . intron_variant NM_000492.3:c.53+2_53+3insA intron_variant NaN NaN NaN
241105 1080 117120203 T TAC NaN NaN NaN NaN NaN . intron_variant NM_000492.3:c.53+2_53+3insAC intron_variant NaN NaN NaN
241106 1080 117120203 T TAGC NaN NaN NaN NaN NaN . intron_variant NM_000492.3:c.53+2_53+3insAGC intron_variant NaN NaN NaN
241107 1080 117120203 T TC NaN NaN NaN NaN NaN . intron_variant NM_000492.3:c.53+2_53+3insC intron_variant NaN NaN NaN
241108 1080 117120203 T TG NaN NaN NaN NaN NaN . intron_variant NM_000492.3:c.53+2_53+3insG intron_variant NaN NaN NaN

320838 rows

Snpeff doesn't match for stop_lost

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
379742 1080 117307157 CTTT C NaN NaN NaN NaN NaN . stop_lost NP_000483.3:p.LeuTer1480GlnNM_000492.3:c.4439_... stop_lost NaN NaN NaN
379755 1080 117307158 TTT T NaN NaN NaN NaN NaN . stop_lost NP_000483.3:p.Ter1481GluNM_000492.3:c.4440_444... stop_lost NaN NaN NaN
379756 1080 117307158 TTTA T NaN NaN NaN NaN NaN . stop_lost NP_000483.3:p.Ter1481delextTer3NM_000492.3:c.4... stop_lost NaN NaN NaN
379768 1080 117307159 TT T NaN NaN NaN NaN NaN . stop_lost NP_000483.3:p.Ter1481ArgNM_000492.3:c.4441delT stop_lost NaN NaN NaN
379769 1080 117307159 TTA T NaN NaN NaN NaN NaN . stop_lost NP_000483.3:p.Ter1481GluNM_000492.3:c.4441_444... stop_lost NaN NaN NaN

150 rows

Snpeff doesn't match for 3_prime_UTR_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
379784 1080 117307160 TAGA T NaN NaN NaN NaN NaN . 3_prime_UTR_variant NM_000492.3:c.*1_4442delAGA 3_prime_UTR_variant NaN NaN NaN
379797 1080 117307161 AGA A NaN NaN NaN NaN NaN . 3_prime_UTR_variant NM_000492.3:c.*1_4443delGA 3_prime_UTR_variant NaN NaN NaN
379798 1080 117307161 AGAG A NaN NaN NaN NaN NaN . 3_prime_UTR_variant NM_000492.3:c.*2_4443delGAG 3_prime_UTR_variant NaN NaN NaN
379801 1080 117307162 G GA NaN NaN NaN NaN NaN . 3_prime_UTR_variant NM_000492.3:c.*1_4443insA 3_prime_UTR_variant NaN NaN NaN
379802 1080 117307162 G GC NaN NaN NaN NaN NaN . 3_prime_UTR_variant NM_000492.3:c.*1_4443insC 3_prime_UTR_variant NaN NaN NaN

8909 rows

Snpeff doesn't match for downstream_gene_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
380681 83992 117349970 C A NaN NaN NaN NaN NaN . downstream_gene_variant downstream_gene_variant NaN NaN NaN
380682 83992 117349970 C CA NaN NaN NaN NaN NaN . downstream_gene_variant downstream_gene_variant NaN NaN NaN
380683 83992 117349970 C CAG NaN NaN NaN NaN NaN . downstream_gene_variant downstream_gene_variant NaN NaN NaN
380684 83992 117349970 C CAT NaN NaN NaN NaN NaN . downstream_gene_variant downstream_gene_variant NaN NaN NaN
380685 83992 117349970 C CC NaN NaN NaN NaN NaN . downstream_gene_variant downstream_gene_variant NaN NaN NaN

66262 rows

Snpeff doesn't match for nan

0 rows

In [15]:
sampletables = '<h1>Other algo\'s agree, but...</h1>'
for effect in master_df["normalized_so_snpeff"].unique():
    sampletables += "<h2> VEP doesn't match for <em>" + str(effect) + "</em></h2>"
    query = master_df.loc[(master_df["normalized_so_annovar"]==effect) & 
                                  (master_df["normalized_so_snpeff"]==effect) & 
                                  (master_df["normalized_so_vep"]!=effect)]
    num_rows = query.count()[0]
    if num_rows > 0:
        sampletables += query.head(5).to_html()
    sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)
Out[15]:

Other algo's agree, but...

VEP doesn't match for intergenic_variant

0 rows

VEP doesn't match for upstream_gene_variant

0 rows

VEP doesn't match for 5_prime_UTR_variant

0 rows

VEP doesn't match for inframe_variant

0 rows

VEP doesn't match for frameshift_variant

0 rows

VEP doesn't match for nonsynonymous_variant

0 rows

VEP doesn't match for stop_gained

0 rows

VEP doesn't match for synonymous_variant

0 rows

VEP doesn't match for splicing_variant

0 rows

VEP doesn't match for intron_variant

0 rows

VEP doesn't match for stop_lost

0 rows

VEP doesn't match for 3_prime_UTR_variant

0 rows

VEP doesn't match for downstream_gene_variant

0 rows

VEP doesn't match for nan

0 rows

In [16]:
sampletables = '<h1>At least 1 column doesn\'t match</h1>'
for effect in master_df["normalized_so_snpeff"].unique():
    sampletables += "<h2> VEP doesn't match for <em>" + str(effect) + "</em></h2>"
    query = master_df.loc[(master_df["normalized_so_vep"] != effect) &
                          ((master_df["normalized_so_annovar"]==effect) | (master_df["normalized_so_snpeff"]==effect))]
    num_rows = query.count()[0]
    if num_rows > 0:
        sampletables += query.head(5).to_html()
    sampletables += "<p>" + str(num_rows) + " rows</p>"
HTML(sampletables)
Out[16]:

At least 1 column doesn't match

VEP doesn't match for intergenic_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
3472 117115644 G A . INTERGENIC intergenic_variant NaN NaN NaN NaN NaN NaN NaN
3473 117115644 G C . INTERGENIC intergenic_variant NaN NaN NaN NaN NaN NaN NaN
3474 117115644 G GA . INTERGENIC intergenic_variant NaN NaN NaN NaN NaN NaN NaN
3475 117115644 G GAC . INTERGENIC intergenic_variant NaN NaN NaN NaN NaN NaN NaN
3476 117115644 G GACG . INTERGENIC intergenic_variant NaN NaN NaN NaN NaN NaN NaN

23302 rows

VEP doesn't match for upstream_gene_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
32500 CFTR 117115644 G A . UPSTREAM NM_000492.3 upstream_gene_variant NaN NaN NaN NaN NaN NaN NaN
32501 CFTR 117115644 G C . UPSTREAM NM_000492.3 upstream_gene_variant NaN NaN NaN NaN NaN NaN NaN
32502 CFTR 117115644 G GA . UPSTREAM NM_000492.3 upstream_gene_variant NaN NaN NaN NaN NaN NaN NaN
32503 CFTR 117115644 G GAC . UPSTREAM NM_000492.3 upstream_gene_variant NaN NaN NaN NaN NaN NaN NaN
32504 CFTR 117115644 G GACG . UPSTREAM NM_000492.3 upstream_gene_variant NaN NaN NaN NaN NaN NaN NaN

15534 rows

VEP doesn't match for 5_prime_UTR_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
46611 CFTR 117120014 GGGA G . UTR_5_PRIME NM_000492.3 5_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN
46624 CFTR 117120015 GGA G . UTR_5_PRIME NM_000492.3 5_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN
46625 CFTR 117120015 GGAA G . UTR_5_PRIME NM_000492.3 5_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN
46628 CFTR 117120016 G GA . UTR_5_PRIME NM_000492.3 5_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN
46629 CFTR 117120016 G GAGC . UTR_5_PRIME NM_000492.3 5_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN

7552 rows

VEP doesn't match for inframe_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
48459 CFTR 117120146 ACCA A . CODON_DELETION NM_000492.3 p.Met1X/c.1*>-CCA inframe_variant NaN NaN NaN NaN NaN NaN NaN
48473 CFTR 117120147 CCAT C . CODON_DELETION NM_000492.3 p.Met1X/c.1*>-CAT inframe_variant NaN NaN NaN NaN NaN NaN NaN
48476 CFTR 117120148 C CACT . CODON_INSERTION NM_000492.3 p.Met1X/c.1*>+ACT inframe_variant NaN NaN NaN NaN NaN NaN NaN
48482 CFTR 117120148 C CTGC . CODON_INSERTION NM_000492.3 p.Met1X/c.1*>+TGC inframe_variant NaN NaN NaN NaN NaN NaN NaN
48487 CFTR 117120148 CATG C . CODON_DELETION NM_000492.3 p.Met1X/c.1*>-ATG inframe_variant NaN NaN NaN NaN NaN NaN NaN

24882 rows

VEP doesn't match for frameshift_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
48472 CFTR 117120147 CCA C . FRAME_SHIFT NM_000492.3 p.X1X/c.1*>-CA frameshift_variant NaN NaN NaN NaN NaN NaN NaN
48475 CFTR 117120148 C CA . FRAME_SHIFT NM_000492.3 p.Met1X/c.1*>+A frameshift_variant NaN NaN NaN NaN NaN NaN NaN
48477 CFTR 117120148 C CC . FRAME_SHIFT NM_000492.3 p.Met1X/c.1*>+C frameshift_variant NaN NaN NaN NaN NaN NaN NaN
48478 CFTR 117120148 C CG . FRAME_SHIFT NM_000492.3 p.Met1X/c.1*>+G frameshift_variant NaN NaN NaN NaN NaN NaN NaN
48479 CFTR 117120148 C CGC . FRAME_SHIFT NM_000492.3 p.Met1X/c.1*>+GC frameshift_variant NaN NaN NaN NaN NaN NaN NaN

69548 rows

VEP doesn't match for nonsynonymous_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
48496 CFTR 117120149 A C . NON_SYNONYMOUS_START NM_000492.3 p.Met1?/c.1A>C nonsynonymous_variant NaN NaN NaN NaN NaN NaN NaN
48497 CFTR 117120149 A G . START_LOST NM_000492.3 p.Met1?/c.1A>G nonsynonymous_variant NaN NaN NaN NaN NaN NaN NaN
48498 CFTR 117120149 A T . NON_SYNONYMOUS_START NM_000492.3 p.Met1?/c.1A>T nonsynonymous_variant NaN NaN NaN NaN NaN NaN NaN
48502 CFTR 117120150 T A . START_LOST NM_000492.3 p.Met1?/c.2T>A nonsynonymous_variant NaN NaN NaN NaN NaN NaN NaN
48503 CFTR 117120150 T C . START_LOST NM_000492.3 p.Met1?/c.2T>C nonsynonymous_variant NaN NaN NaN NaN NaN NaN NaN

19460 rows

VEP doesn't match for stop_gained

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
48525 CFTR 117120151 G GTGA . STOP_GAINED NM_000492.3 p.Gln2*/c.4*>+TGA stop_gained NaN NaN NaN NaN NaN NaN NaN
48540 CFTR 117120152 C T . STOP_GAINED NM_000492.3 p.Gln2*/c.4C>T stop_gained NaN NaN NaN NaN NaN NaN NaN
48564 CFTR 117120154 G GT . STOP_GAINED NM_000492.3 p.Arg3*/c.7*>+T stop_gained NaN NaN NaN NaN NaN NaN NaN
48565 CFTR 117120154 G GTA . STOP_GAINED NM_000492.3 p.Arg3*/c.7*>+TA stop_gained NaN NaN NaN NaN NaN NaN NaN
48609 CFTR 117120157 G GTAG . STOP_GAINED NM_000492.3 p.Ser4*/c.10*>+TAG stop_gained NaN NaN NaN NaN NaN NaN NaN

4588 rows

VEP doesn't match for synonymous_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
48558 CFTR 117120154 G A . SYNONYMOUS_CODING NM_000492.3 p.Gln2Gln/c.6G>A synonymous_variant NaN NaN NaN NaN NaN NaN NaN
48580 CFTR 117120155 A C . SYNONYMOUS_CODING NM_000492.3 p.Arg3Arg/c.7A>C synonymous_variant NaN NaN NaN NaN NaN NaN NaN
48600 CFTR 117120157 G A . SYNONYMOUS_CODING NM_000492.3 p.Arg3Arg/c.9G>A synonymous_variant NaN NaN NaN NaN NaN NaN NaN
48642 CFTR 117120160 G A . SYNONYMOUS_CODING NM_000492.3 p.Ser4Ser/c.12G>A synonymous_variant NaN NaN NaN NaN NaN NaN NaN
48643 CFTR 117120160 G C . SYNONYMOUS_CODING NM_000492.3 p.Ser4Ser/c.12G>C synonymous_variant NaN NaN NaN NaN NaN NaN NaN

5882 rows

VEP doesn't match for splicing_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
49201 CFTR 117120199 CAGG C . SPLICE_SITE_DONOR NM_000492.3 splicing_variant NaN NaN NaN NaN NaN NaN NaN
49214 CFTR 117120200 AGG A . SPLICE_SITE_DONOR NM_000492.3 splicing_variant NaN NaN NaN NaN NaN NaN NaN
49215 CFTR 117120200 AGGT A . SPLICE_SITE_DONOR NM_000492.3 splicing_variant NaN NaN NaN NaN NaN NaN NaN
49218 CFTR 117120201 G GA . SPLICE_SITE_DONOR NM_000492.3 splicing_variant NaN NaN NaN NaN NaN NaN NaN
49219 CFTR 117120201 G GACT . SPLICE_SITE_DONOR NM_000492.3 splicing_variant NaN NaN NaN NaN NaN NaN NaN

3252 rows

VEP doesn't match for intron_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
49247 CFTR 117120203 T TA . INTRON NM_000492.3 c.53+3*>+A intron_variant NaN NaN NaN NaN NaN NaN NaN
49248 CFTR 117120203 T TAC . INTRON NM_000492.3 c.53+3*>+AC intron_variant NaN NaN NaN NaN NaN NaN NaN
49249 CFTR 117120203 T TAGC . INTRON NM_000492.3 c.53+3*>+AGC intron_variant NaN NaN NaN NaN NaN NaN NaN
49250 CFTR 117120203 T TC . INTRON NM_000492.3 c.53+3*>+C intron_variant NaN NaN NaN NaN NaN NaN NaN
49251 CFTR 117120203 T TG . INTRON NM_000492.3 c.53+3*>+G intron_variant NaN NaN NaN NaN NaN NaN NaN

173531 rows

VEP doesn't match for stop_lost

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
187885 CFTR 117307157 CTTT C . STOP_LOST NM_000492.3 p.X1480Glnext*?/c.4439*>-TTT stop_lost NaN NaN NaN NaN NaN NaN NaN
187899 CFTR 117307158 TTTA T . STOP_LOST NM_000492.3 p.X1480Leuext*?/c.4440*>-TTA stop_lost NaN NaN NaN NaN NaN NaN NaN
187903 CFTR 117307159 T TA . STOP_LOST NM_000492.3 p.*1481Xext*?/c.4441*>+A stop_lost NaN NaN NaN NaN NaN NaN NaN
187905 CFTR 117307159 T TC . STOP_LOST NM_000492.3 p.*1481Xext*?/c.4441*>+C stop_lost NaN NaN NaN NaN NaN NaN NaN
187907 CFTR 117307159 T TG . STOP_LOST NM_000492.3 p.*1481Xext*?/c.4441*>+G stop_lost NaN NaN NaN NaN NaN NaN NaN

50 rows

VEP doesn't match for 3_prime_UTR_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
187944 CFTR 117307162 G GA . UTR_3_PRIME NM_000492.3 3_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN
187945 CFTR 117307162 G GC . UTR_3_PRIME NM_000492.3 3_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN
187946 CFTR 117307162 G GCA . UTR_3_PRIME NM_000492.3 3_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN
187947 CFTR 117307162 G GCAG . UTR_3_PRIME NM_000492.3 3_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN
187948 CFTR 117307162 G GG . UTR_3_PRIME NM_000492.3 3_prime_UTR_variant NaN NaN NaN NaN NaN NaN NaN

6897 rows

VEP doesn't match for downstream_gene_variant

Gene POS REF ALT ID_x Effect Transcript_ID hgvs_snpeff normalized_so_snpeff ID_y Consequence hgvs_vep normalized_so_vep normalized_so_annovar combined_effect hgvs
188824 CTTNBP2 117349970 C A . DOWNSTREAM NM_033427.2 downstream_gene_variant NaN NaN NaN NaN NaN NaN NaN
188825 CTTNBP2 117349970 C CA . DOWNSTREAM NM_033427.2 downstream_gene_variant NaN NaN NaN NaN NaN NaN NaN
188826 CTTNBP2 117349970 C CAG . DOWNSTREAM NM_033427.2 downstream_gene_variant NaN NaN NaN NaN NaN NaN NaN
188827 CTTNBP2 117349970 C CAT . DOWNSTREAM NM_033427.2 downstream_gene_variant NaN NaN NaN NaN NaN NaN NaN
188828 CTTNBP2 117349970 C CC . DOWNSTREAM NM_033427.2 downstream_gene_variant NaN NaN NaN NaN NaN NaN NaN

9170 rows

VEP doesn't match for nan

0 rows