Notebook

This is a re-implementation of some Pythona and R code based off the article, https://shiring.github.io/r_vs_python/2017/01/22/R_vs_Py_post which was based off of https://www.toptal.com/python/comprehensive-introduction-your-genome-scipy

In [1]:

# !wget ftp://ftp.ensembl.org/pub/release-85/gff3/homo_sapiens/Homo_sapiens.GRCh38.85.gff3.gz

In [2]:

import pandas as pd
import matplotlib as plt
%matplotlib inline

In [3]:

import warnings
warnings.filterwarnings('ignore')

In [4]:

%load_ext rpy2.ipython

In [5]:

%%R
library(dplyr)
library(ggplot2)

In [6]:

%%timeit -n 1 -r 1
global df
col_names = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']
df = pd.read_csv('Homo_sapiens.GRCh38.85.gff3.gz', compression='gzip',
                         sep='\t', comment='#', low_memory=False,
                         header=None, names=col_names)

10 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [7]:

%%timeit -n 1 -r 1
%%R
df <- read.csv("Homo_sapiens.GRCh38.85.gff3.gz", 
               header = FALSE, 
               sep = "\t", 
               col.names = c('seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'), 
               comment.char = "#")
head(df)

  seqid source              type start       end   score strand phase
1     1 GRCh38        chromosome     1 248956422       .      .     .
2     1      . biological_region 10469     11240 1.3e+03      .     .
3     1      . biological_region 10650     10657   0.999      +     .
4     1      . biological_region 10655     10657   0.999      -     .
5     1      . biological_region 10678     10687   0.999      +     .
6     1      . biological_region 10681     10688   0.999      -     .
                                          attributes
1 ID=chromosome:1;Alias=CM000663.2,chr1,NC_000001.11
2           external_name=oe %3D 0.79;logic_name=cpg
3                                 logic_name=eponine
4                                 logic_name=eponine
5                                 logic_name=eponine
6                                 logic_name=eponine

1min 11s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [8]:

%%timeit -n 1 -r 1
print(df.seqid.unique())

['1' '10' '11' '12' '13' '14' '15' '16' '17' '18' '19' '2' '20' '21' '22'
 '3' '4' '5' '6' '7' '8' '9' 'GL000008.2' 'GL000009.2' 'GL000194.1'
 'GL000195.1' 'GL000205.2' 'GL000208.1' 'GL000213.1' 'GL000214.1'
 'GL000216.2' 'GL000218.1' 'GL000219.1' 'GL000220.1' 'GL000221.1'
 'GL000224.1' 'GL000225.1' 'GL000226.1' 'KI270302.1' 'KI270303.1'
 'KI270304.1' 'KI270305.1' 'KI270310.1' 'KI270311.1' 'KI270312.1'
 'KI270315.1' 'KI270316.1' 'KI270317.1' 'KI270320.1' 'KI270322.1'
 'KI270329.1' 'KI270330.1' 'KI270333.1' 'KI270334.1' 'KI270335.1'
 'KI270336.1' 'KI270337.1' 'KI270338.1' 'KI270340.1' 'KI270362.1'
 'KI270363.1' 'KI270364.1' 'KI270366.1' 'KI270371.1' 'KI270372.1'
 'KI270373.1' 'KI270374.1' 'KI270375.1' 'KI270376.1' 'KI270378.1'
 'KI270379.1' 'KI270381.1' 'KI270382.1' 'KI270383.1' 'KI270384.1'
 'KI270385.1' 'KI270386.1' 'KI270387.1' 'KI270388.1' 'KI270389.1'
 'KI270390.1' 'KI270391.1' 'KI270392.1' 'KI270393.1' 'KI270394.1'
 'KI270395.1' 'KI270396.1' 'KI270411.1' 'KI270412.1' 'KI270414.1'
 'KI270417.1' 'KI270418.1' 'KI270419.1' 'KI270420.1' 'KI270422.1'
 'KI270423.1' 'KI270424.1' 'KI270425.1' 'KI270429.1' 'KI270435.1'
 'KI270438.1' 'KI270442.1' 'KI270448.1' 'KI270465.1' 'KI270466.1'
 'KI270467.1' 'KI270468.1' 'KI270507.1' 'KI270508.1' 'KI270509.1'
 'KI270510.1' 'KI270511.1' 'KI270512.1' 'KI270515.1' 'KI270516.1'
 'KI270517.1' 'KI270518.1' 'KI270519.1' 'KI270521.1' 'KI270522.1'
 'KI270528.1' 'KI270529.1' 'KI270530.1' 'KI270538.1' 'KI270539.1'
 'KI270544.1' 'KI270548.1' 'KI270579.1' 'KI270580.1' 'KI270581.1'
 'KI270582.1' 'KI270583.1' 'KI270584.1' 'KI270587.1' 'KI270588.1'
 'KI270589.1' 'KI270590.1' 'KI270591.1' 'KI270593.1' 'KI270706.1'
 'KI270707.1' 'KI270708.1' 'KI270709.1' 'KI270710.1' 'KI270711.1'
 'KI270712.1' 'KI270713.1' 'KI270714.1' 'KI270715.1' 'KI270716.1'
 'KI270717.1' 'KI270718.1' 'KI270719.1' 'KI270720.1' 'KI270721.1'
 'KI270722.1' 'KI270723.1' 'KI270724.1' 'KI270725.1' 'KI270726.1'
 'KI270727.1' 'KI270728.1' 'KI270729.1' 'KI270730.1' 'KI270731.1'
 'KI270732.1' 'KI270733.1' 'KI270734.1' 'KI270735.1' 'KI270736.1'
 'KI270737.1' 'KI270738.1' 'KI270739.1' 'KI270740.1' 'KI270741.1'
 'KI270742.1' 'KI270743.1' 'KI270744.1' 'KI270745.1' 'KI270746.1'
 'KI270747.1' 'KI270748.1' 'KI270749.1' 'KI270750.1' 'KI270751.1'
 'KI270752.1' 'KI270753.1' 'KI270754.1' 'KI270755.1' 'KI270756.1'
 'KI270757.1' 'MT' 'X' 'Y']
71.6 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [9]:

%%timeit -n 1 -r 1
%%R
unique(df$seqid)

  [1] 1          10         11         12         13         14        
  [7] 15         16         17         18         19         2         
 [13] 20         21         22         3          4          5         
 [19] 6          7          8          9          GL000008.2 GL000009.2
 [25] GL000194.1 GL000195.1 GL000205.2 GL000208.1 GL000213.1 GL000214.1
 [31] GL000216.2 GL000218.1 GL000219.1 GL000220.1 GL000221.1 GL000224.1
 [37] GL000225.1 GL000226.1 KI270302.1 KI270303.1 KI270304.1 KI270305.1
 [43] KI270310.1 KI270311.1 KI270312.1 KI270315.1 KI270316.1 KI270317.1
 [49] KI270320.1 KI270322.1 KI270329.1 KI270330.1 KI270333.1 KI270334.1
 [55] KI270335.1 KI270336.1 KI270337.1 KI270338.1 KI270340.1 KI270362.1
 [61] KI270363.1 KI270364.1 KI270366.1 KI270371.1 KI270372.1 KI270373.1
 [67] KI270374.1 KI270375.1 KI270376.1 KI270378.1 KI270379.1 KI270381.1
 [73] KI270382.1 KI270383.1 KI270384.1 KI270385.1 KI270386.1 KI270387.1
 [79] KI270388.1 KI270389.1 KI270390.1 KI270391.1 KI270392.1 KI270393.1
 [85] KI270394.1 KI270395.1 KI270396.1 KI270411.1 KI270412.1 KI270414.1
 [91] KI270417.1 KI270418.1 KI270419.1 KI270420.1 KI270422.1 KI270423.1
 [97] KI270424.1 KI270425.1 KI270429.1 KI270435.1 KI270438.1 KI270442.1
[103] KI270448.1 KI270465.1 KI270466.1 KI270467.1 KI270468.1 KI270507.1
[109] KI270508.1 KI270509.1 KI270510.1 KI270511.1 KI270512.1 KI270515.1
[115] KI270516.1 KI270517.1 KI270518.1 KI270519.1 KI270521.1 KI270522.1
[121] KI270528.1 KI270529.1 KI270530.1 KI270538.1 KI270539.1 KI270544.1
[127] KI270548.1 KI270579.1 KI270580.1 KI270581.1 KI270582.1 KI270583.1
[133] KI270584.1 KI270587.1 KI270588.1 KI270589.1 KI270590.1 KI270591.1
[139] KI270593.1 KI270706.1 KI270707.1 KI270708.1 KI270709.1 KI270710.1
[145] KI270711.1 KI270712.1 KI270713.1 KI270714.1 KI270715.1 KI270716.1
[151] KI270717.1 KI270718.1 KI270719.1 KI270720.1 KI270721.1 KI270722.1
[157] KI270723.1 KI270724.1 KI270725.1 KI270726.1 KI270727.1 KI270728.1
[163] KI270729.1 KI270730.1 KI270731.1 KI270732.1 KI270733.1 KI270734.1
[169] KI270735.1 KI270736.1 KI270737.1 KI270738.1 KI270739.1 KI270740.1
[175] KI270741.1 KI270742.1 KI270743.1 KI270744.1 KI270745.1 KI270746.1
[181] KI270747.1 KI270748.1 KI270749.1 KI270750.1 KI270751.1 KI270752.1
[187] KI270753.1 KI270754.1 KI270755.1 KI270756.1 KI270757.1 MT        
[193] X          Y         
194 Levels: 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 3 4 5 6 7 8 ... Y

35.4 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [10]:

%%timeit -n 1 -r 1
df.seqid.unique().shape

79.4 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [11]:

%%timeit -n 1 -r 1
%%R
length(unique(df$seqid))

[1] 194

36.3 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [12]:

%%timeit -n 1 -r 1
df.source.value_counts()

300 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [13]:

%%timeit -n 1 -r 1
%%R
sort(table(df$source), decreasing = TRUE)

        havana ensembl_havana        ensembl              .        mirbase 
       1441093         745065         228212         182510           4701 
        GRCh38          insdc 
           194             74

274 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [14]:

%%timeit -n 1 -r 1
global gdf
gdf = df[df.source == 'GRCh38']
gdf.shape
print(gdf.sample(10))

              seqid  source         type    start       end score strand  \
2511495  KI270389.1  GRCh38  supercontig        1      1298     .      .   
2594560           Y  GRCh38   chromosome  2781480  56887902     .      .   
2511468  KI270333.1  GRCh38  supercontig        1      2699     .      .   
2513704  KI270739.1  GRCh38  supercontig        1     73985     .      .   
2511559  KI270522.1  GRCh38  supercontig        1      5674     .      .   
2511388  GL000224.1  GRCh38  supercontig        1    179693     .      .   
2511481  KI270373.1  GRCh38  supercontig        1      1451     .      .   
2511460  KI270312.1  GRCh38  supercontig        1       998     .      .   
2511573  KI270584.1  GRCh38  supercontig        1      4513     .      .   
2511588  KI270709.1  GRCh38  supercontig        1     66860     .      .   

        phase                                         attributes  
2511495     .  ID=supercontig:KI270389.1;Alias=chrUn_KI270389...  
2594560     .  ID=chromosome:Y;Alias=CM000686.2,chrY,NC_00002...  
2511468     .  ID=supercontig:KI270333.1;Alias=chrUn_KI270333...  
2513704     .  ID=supercontig:KI270739.1;Alias=chr22_KI270739...  
2511559     .  ID=supercontig:KI270522.1;Alias=chrUn_KI270522...  
2511388     .  ID=supercontig:GL000224.1;Alias=chrUn_GL000224...  
2511481     .  ID=supercontig:KI270373.1;Alias=chrUn_KI270373...  
2511460     .  ID=supercontig:KI270312.1;Alias=chrUn_KI270312...  
2511573     .  ID=supercontig:KI270584.1;Alias=chrUn_KI270584...  
2511588     .  ID=supercontig:KI270709.1;Alias=chr1_KI270709v...  
192 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [15]:

%%timeit -n 1 -r 1
%%R
gdf <- df[df$source == "GRCh38", ]
dim(gdf)
sample_n(gdf, 10)

             seqid source        type start       end score strand phase
865054          16 GRCh38  chromosome     1  90338345     .      .     .
2511482 KI270373.1 GRCh38 supercontig     1      1451     .      .     .
2511636 KI270711.1 GRCh38 supercontig     1     42210     .      .     .
2511491 KI270384.1 GRCh38 supercontig     1      1658     .      .     .
2511458 KI270305.1 GRCh38 supercontig     1      1472     .      .     .
2511544 KI270465.1 GRCh38 supercontig     1      1774     .      .     .
235069          10 GRCh38  chromosome     1 133797422     .      .     .
2511515 KI270429.1 GRCh38 supercontig     1      1361     .      .     .
990811          17 GRCh38  chromosome     1  83257441     .      .     .
2511485 KI270376.1 GRCh38 supercontig     1      1136     .      .     .
                                                                attributes
865054                ID=chromosome:16;Alias=CM000678.2,chr16,NC_000016.10
2511482       ID=supercontig:KI270373.1;Alias=chrUn_KI270373v1,NT_187492.1
2511636 ID=supercontig:KI270711.1;Alias=chr1_KI270711v1_random,NT_187366.1
2511491       ID=supercontig:KI270384.1;Alias=chrUn_KI270384v1,NT_187484.1
2511458       ID=supercontig:KI270305.1;Alias=chrUn_KI270305v1,NT_187399.1
2511544       ID=supercontig:KI270465.1;Alias=chrUn_KI270465v1,NT_187422.1
235069                ID=chromosome:10;Alias=CM000672.2,chr10,NC_000010.11
2511515       ID=supercontig:KI270429.1;Alias=chrUn_KI270429v1,NT_187419.1
990811                ID=chromosome:17;Alias=CM000679.2,chr17,NC_000017.11
2511485       ID=supercontig:KI270376.1;Alias=chrUn_KI270376v1,NT_187489.1

878 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [16]:

%%timeit -n 1 -r 1
global gdf
gdf = gdf.copy()
gdf['length'] = gdf.end - gdf.start + 1

print(gdf.length.sum())

3096629726
52.8 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [17]:

%%R
#  %%timeit -n 1 -r 1  gives an error due to $
gdf$length <- gdf$end - gdf$start + 1
sum(gdf$length)

[1] 3096629726

In [18]:

%%timeit -n 1 -r 1
print(gdf[(gdf['type'] == 'supercontig')].length.sum() / gdf.length.sum())

0.00370219174212
2.5 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [19]:

%%R
chrs <- c(1:23, "X", "Y", "MT")
sum(subset(gdf, !seqid %in% chrs)$length) / sum(gdf$length)

[1] 0.003702192

In [20]:

%%timeit -n 1 -r 1
global edf
edf = df[df.source.isin(['ensembl', 'havana', 'ensembl_havana'])]
edf.shape

edf.sample(10)
print(edf.type.value_counts())

exon                             1180596
CDS                               704604
five_prime_UTR                    142387
three_prime_UTR                   133938
transcript                         96375
gene                               42470
processed_transcript               28228
aberrant_processed_transcript      26944
NMD_transcript_variant             13761
lincRNA                            13247
processed_pseudogene               10722
lincRNA_gene                        7533
pseudogene                          3049
RNA                                 2221
snRNA_gene                          1909
snRNA                               1909
snoRNA                               956
snoRNA_gene                          944
pseudogenic_transcript               737
rRNA                                 549
rRNA_gene                            549
miRNA                                302
V_gene_segment                       216
J_gene_segment                       158
VD_gene_segment                       37
C_gene_segment                        29
Name: type, dtype: int64
799 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [21]:

%%timeit -n 1 -r 1
%%R
edf <- subset(df, source %in% c("ensembl", "havana", "ensembl_havana"))
dim(edf)
sample_n(edf, 10)

        seqid         source            type     start       end score strand
128494      1 ensembl_havana             CDS 117098833 117098907     .      +
2548429     X         havana            exon  66000422  66001071     .      +
2178494     6 ensembl_havana             CDS 142309544 142309644     .      +
1953812     4         havana             CDS 167000349 167000463     .      -
1497584     2 ensembl_havana            exon 178731693 178731971     .      -
416225     11        ensembl             CDS  66838367  66838445     .      +
2439754     9 ensembl_havana            exon  37854780  37854982     .      +
975953     16         havana            exon  79713836  79714134     .      +
38652       1 ensembl_havana three_prime_UTR  24364372  24364482     .      +
1013586    17 ensembl_havana  five_prime_UTR   7630142   7630172     .      +
        phase
128494      1
2548429     .
2178494     1
1953812     2
1497584     .
416225      0
2439754     .
975953      .
38652       .
1013586     .
                                                                                                                                                  attributes
128494                                                                   ID=CDS:ENSP00000358478;Parent=transcript:ENST00000369466;protein_id=ENSP00000358478
2548429 Parent=transcript:ENST00000424241;Name=ENSE00001750385;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00001750385;rank=2;version=1
2178494                                                                  ID=CDS:ENSP00000296932;Parent=transcript:ENST00000296932;protein_id=ENSP00000296932
1953812                                                                  ID=CDS:ENSP00000420920;Parent=transcript:ENST00000506886;protein_id=ENSP00000420920
1497584  Parent=transcript:ENST00000589042;Name=ENSE00003797539;constitutive=0;ensembl_end_phase=1;ensembl_phase=1;exon_id=ENSE00003797539;rank=58;version=1
416225                                                                   ID=CDS:ENSP00000354227;Parent=transcript:ENST00000360962;protein_id=ENSP00000354227
2439754   Parent=transcript:ENST00000377724;Name=ENSE00003462136;constitutive=0;ensembl_end_phase=1;ensembl_phase=2;exon_id=ENSE00003462136;rank=4;version=1
975953  Parent=transcript:ENST00000563360;Name=ENSE00002611138;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00002611138;rank=2;version=1
38652                                                                                                                      Parent=transcript:ENST00000350501
1013586                                                                                                                    Parent=transcript:ENST00000380450

671 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [22]:

%%timeit -n 1 -r 1
global ndf
ndf = edf[edf.type == 'gene']
ndf = ndf.copy()
ndf.sample(10).attributes.values
print(ndf.shape)

(42470, 9)
191 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [23]:

%%timeit -n 1 -r 1
%%R
ndf <- subset(edf, type == "gene")
sample_n(ndf, 10)$attributes

 [1] ID=gene:ENSG00000163157;Name=TMOD4;biotype=protein_coding;description=tropomodulin 4 [Source:HGNC Symbol%3BAcc:HGNC:11874];gene_id=ENSG00000163157;havana_gene=OTTHUMG00000012350;havana_version=6;logic_name=ensembl_havana_gene;version=14                                 
 [2] ID=gene:ENSG00000072201;Name=LNX1;biotype=protein_coding;description=ligand of numb-protein X 1 [Source:HGNC Symbol%3BAcc:HGNC:6657];gene_id=ENSG00000072201;havana_gene=OTTHUMG00000102099;havana_version=4;logic_name=ensembl_havana_gene;version=13                       
 [3] ID=gene:ENSG00000158122;Name=AAED1;biotype=protein_coding;description=AhpC/TSA antioxidant enzyme domain containing 1 [Source:HGNC Symbol%3BAcc:HGNC:16881];gene_id=ENSG00000158122;havana_gene=OTTHUMG00000020299;havana_version=1;logic_name=ensembl_havana_gene;version=11
 [4] ID=gene:ENSG00000163083;Name=INHBB;biotype=protein_coding;description=inhibin beta B subunit [Source:HGNC Symbol%3BAcc:HGNC:6067];gene_id=ENSG00000163083;havana_gene=OTTHUMG00000131437;havana_version=1;logic_name=ensembl_havana_gene;version=5                           
 [5] ID=gene:ENSG00000230011;Name=CTSLP4;biotype=unprocessed_pseudogene;description=cathepsin L pseudogene 4 [Source:HGNC Symbol%3BAcc:HGNC:23645];gene_id=ENSG00000230011;havana_gene=OTTHUMG00000018237;havana_version=1;logic_name=havana;version=2                            
 [6] ID=gene:ENSG00000274601;Name=WI2-88277B6.1;biotype=unprocessed_pseudogene;gene_id=ENSG00000274601;havana_gene=OTTHUMG00000188051;havana_version=1;logic_name=havana;version=1                                                                                                
 [7] ID=gene:ENSG00000261221;Name=ZNF865;biotype=protein_coding;description=zinc finger protein 865 [Source:HGNC Symbol%3BAcc:HGNC:38705];gene_id=ENSG00000261221;havana_gene=OTTHUMG00000177108;havana_version=1;logic_name=ensembl_havana_gene;version=3                        
 [8] ID=gene:ENSG00000233816;Name=IFNA13;biotype=protein_coding;description=interferon%2C alpha 13 [Source:HGNC Symbol%3BAcc:HGNC:5419];gene_id=ENSG00000233816;havana_gene=OTTHUMG00000019675;havana_version=2;logic_name=ensembl_havana_gene;version=3                          
 [9] ID=gene:ENSG00000270863;Name=DDX55P1;biotype=processed_pseudogene;description=DEAD-box helicase 55 pseudogene 1 [Source:HGNC Symbol%3BAcc:HGNC:49852];gene_id=ENSG00000270863;havana_gene=OTTHUMG00000184770;havana_version=1;logic_name=havana;version=1                    
[10] ID=gene:ENSG00000217643;Name=PTGES3P2;biotype=processed_pseudogene;description=prostaglandin E synthase 3 (cytosolic) pseudogene 2 [Source:HGNC Symbol%3BAcc:HGNC:43822];gene_id=ENSG00000217643;havana_gene=OTTHUMG00000152177;havana_version=1;logic_name=havana;version=1 
1623077 Levels: external_name=Ala;logic_name=trnascan ...

2min 5s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [24]:

%%timeit -n 1 -r 1
import re

RE_GENE_NAME = re.compile(r'Name=(?P<gene_name>.+?);')
def extract_gene_name(attributes_str):
    res = RE_GENE_NAME.search(attributes_str)
    return res.group('gene_name')


ndf['gene_name'] = ndf.attributes.apply(extract_gene_name)

RE_GENE_ID = re.compile(r'gene_id=(?P<gene_id>ENSG.+?);')
def extract_gene_id(attributes_str):
    res = RE_GENE_ID.search(attributes_str)
    return res.group('gene_id')


ndf['gene_id'] = ndf.attributes.apply(extract_gene_id)


RE_DESC = re.compile('description=(?P<desc>.+?);')
def extract_description(attributes_str):
    res = RE_DESC.search(attributes_str)
    if res is None:
        return ''
    else:
        return res.group('desc')


ndf['desc'] = ndf.attributes.apply(extract_description)

ndf.drop('attributes', axis=1, inplace=True)
ndf.head()

198 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [25]:

%%R
ptm <- proc.time()
ndf$gene_name <- gsub("(.*Name=)(.*?)(;biotype.*)", "\\2", ndf$attributes)
ndf$gene_id <- gsub("(ID=gene:)(.*?)(;Name.*)", "\\2", ndf$attributes)
ndf$desc <- gsub("(.*description=)(.*?)(;.*)", "\\2", ndf$attributes)

# some genes don't have a description
ndf$desc <- ifelse(grepl("^ID=.*", ndf$desc), "", ndf$desc)

ndf <- subset(ndf, select = -attributes)
print(proc.time() - ptm)
head(ndf)

   user  system elapsed 
  1.307   0.017   1.322 
    seqid         source type  start    end score strand phase gene_name
17      1         havana gene  11869  14409     .      +     .   DDX11L1
29      1         havana gene  14404  29570     .      -     .    WASH7P
72      1         havana gene  52473  53312     .      +     .    OR4G4P
75      1         havana gene  62948  63887     .      +     .   OR4G11P
78      1 ensembl_havana gene  69091  70008     .      +     .     OR4F5
109     1         havana gene 131025 134836     .      +     .    CICP27
            gene_id
17  ENSG00000223972
29  ENSG00000227232
72  ENSG00000268020
75  ENSG00000240361
78  ENSG00000186092
109 ENSG00000233750
                                                                                                  desc
17                                 DEAD/H-box helicase 11 like 1 [Source:HGNC Symbol%3BAcc:HGNC:37102]
29                       WAS protein family homolog 7 pseudogene [Source:HGNC Symbol%3BAcc:HGNC:38034]
72   olfactory receptor family 4 subfamily G member 4 pseudogene [Source:HGNC Symbol%3BAcc:HGNC:14822]
75  olfactory receptor family 4 subfamily G member 11 pseudogene [Source:HGNC Symbol%3BAcc:HGNC:31276]
78              olfactory receptor family 4 subfamily F member 5 [Source:HGNC Symbol%3BAcc:HGNC:14825]
109              capicua transcriptional repressor pseudogene 27 [Source:HGNC Symbol%3BAcc:HGNC:48835]

Jump to plotting

In [26]:

ndf['length'] = ndf.end - ndf.start + 1
ndf.length.describe()

Out[26]:

count    4.247000e+04
mean     3.583348e+04
std      9.683485e+04
min      8.000000e+00
25%      8.840000e+02
50%      5.170500e+03
75%      3.055200e+04
max      2.304997e+06
Name: length, dtype: float64

In [27]:

%%R
ndf$length <- ndf$end - ndf$start + 1
summary(ndf$length)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      8     884    5170   35834   30552 2304997

In [28]:

%%timeit -n 1 -r 1
ndf.length.plot(kind='hist', bins=50, logy=True)

255 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [29]:

%%timeit -n 1 -r 1
%%R
ndf %>% ggplot(aes(x = length)) + 
  geom_histogram(bins = 50, fill = "blue") + 
  scale_y_log10()

1.06 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [ ]: