This is a re-implementation of some Python and R code based on the article https://shiring.github.io/r_vs_python/2017/01/22/R_vs_Py_post, which was itself based on https://www.toptal.com/python/comprehensive-introduction-your-genome-scipy
# !wget ftp://ftp.ensembl.org/pub/release-85/gff3/homo_sapiens/Homo_sapiens.GRCh38.85.gff3.gz
import pandas as pd
# `plt` is the conventional alias for the pyplot interface, not for the
# top-level matplotlib package (which exposes no plotting functions itself).
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
%load_ext rpy2.ipython
%%R
library(dplyr)
library(ggplot2)
%%timeit -n 1 -r 1
global df
# header=None: no header row is read from the file, so the nine column
# names are supplied explicitly; lines starting with '#' are skipped.
gff3_columns = [
    'seqid', 'source', 'type',
    'start', 'end', 'score',
    'strand', 'phase', 'attributes',
]
df = pd.read_csv(
    'Homo_sapiens.GRCh38.85.gff3.gz',
    sep='\t',
    header=None,
    names=gff3_columns,
    comment='#',
    compression='gzip',
    low_memory=False,
)
10 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%timeit -n 1 -r 1
%%R
# Read the tab-separated annotation file (the .gz path is handed straight to
# read.csv), skipping '#' comment lines and naming the nine columns by hand
# because no header row is read.
df <- read.csv(
  "Homo_sapiens.GRCh38.85.gff3.gz",
  sep = "\t",
  header = FALSE,
  comment.char = "#",
  col.names = c(
    "seqid", "source", "type", "start", "end",
    "score", "strand", "phase", "attributes"
  )
)
head(df)
seqid source type start end score strand phase 1 1 GRCh38 chromosome 1 248956422 . . . 2 1 . biological_region 10469 11240 1.3e+03 . . 3 1 . biological_region 10650 10657 0.999 + . 4 1 . biological_region 10655 10657 0.999 - . 5 1 . biological_region 10678 10687 0.999 + . 6 1 . biological_region 10681 10688 0.999 - . attributes 1 ID=chromosome:1;Alias=CM000663.2,chr1,NC_000001.11 2 external_name=oe %3D 0.79;logic_name=cpg 3 logic_name=eponine 4 logic_name=eponine 5 logic_name=eponine 6 logic_name=eponine
1min 11s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%timeit -n 1 -r 1
# Distinct sequence identifiers present in the seqid column.
print(df['seqid'].unique())
['1' '10' '11' '12' '13' '14' '15' '16' '17' '18' '19' '2' '20' '21' '22' '3' '4' '5' '6' '7' '8' '9' 'GL000008.2' 'GL000009.2' 'GL000194.1' 'GL000195.1' 'GL000205.2' 'GL000208.1' 'GL000213.1' 'GL000214.1' 'GL000216.2' 'GL000218.1' 'GL000219.1' 'GL000220.1' 'GL000221.1' 'GL000224.1' 'GL000225.1' 'GL000226.1' 'KI270302.1' 'KI270303.1' 'KI270304.1' 'KI270305.1' 'KI270310.1' 'KI270311.1' 'KI270312.1' 'KI270315.1' 'KI270316.1' 'KI270317.1' 'KI270320.1' 'KI270322.1' 'KI270329.1' 'KI270330.1' 'KI270333.1' 'KI270334.1' 'KI270335.1' 'KI270336.1' 'KI270337.1' 'KI270338.1' 'KI270340.1' 'KI270362.1' 'KI270363.1' 'KI270364.1' 'KI270366.1' 'KI270371.1' 'KI270372.1' 'KI270373.1' 'KI270374.1' 'KI270375.1' 'KI270376.1' 'KI270378.1' 'KI270379.1' 'KI270381.1' 'KI270382.1' 'KI270383.1' 'KI270384.1' 'KI270385.1' 'KI270386.1' 'KI270387.1' 'KI270388.1' 'KI270389.1' 'KI270390.1' 'KI270391.1' 'KI270392.1' 'KI270393.1' 'KI270394.1' 'KI270395.1' 'KI270396.1' 'KI270411.1' 'KI270412.1' 'KI270414.1' 'KI270417.1' 'KI270418.1' 'KI270419.1' 'KI270420.1' 'KI270422.1' 'KI270423.1' 'KI270424.1' 'KI270425.1' 'KI270429.1' 'KI270435.1' 'KI270438.1' 'KI270442.1' 'KI270448.1' 'KI270465.1' 'KI270466.1' 'KI270467.1' 'KI270468.1' 'KI270507.1' 'KI270508.1' 'KI270509.1' 'KI270510.1' 'KI270511.1' 'KI270512.1' 'KI270515.1' 'KI270516.1' 'KI270517.1' 'KI270518.1' 'KI270519.1' 'KI270521.1' 'KI270522.1' 'KI270528.1' 'KI270529.1' 'KI270530.1' 'KI270538.1' 'KI270539.1' 'KI270544.1' 'KI270548.1' 'KI270579.1' 'KI270580.1' 'KI270581.1' 'KI270582.1' 'KI270583.1' 'KI270584.1' 'KI270587.1' 'KI270588.1' 'KI270589.1' 'KI270590.1' 'KI270591.1' 'KI270593.1' 'KI270706.1' 'KI270707.1' 'KI270708.1' 'KI270709.1' 'KI270710.1' 'KI270711.1' 'KI270712.1' 'KI270713.1' 'KI270714.1' 'KI270715.1' 'KI270716.1' 'KI270717.1' 'KI270718.1' 'KI270719.1' 'KI270720.1' 'KI270721.1' 'KI270722.1' 'KI270723.1' 'KI270724.1' 'KI270725.1' 'KI270726.1' 'KI270727.1' 'KI270728.1' 'KI270729.1' 'KI270730.1' 'KI270731.1' 'KI270732.1' 'KI270733.1' 'KI270734.1' 
'KI270735.1' 'KI270736.1' 'KI270737.1' 'KI270738.1' 'KI270739.1' 'KI270740.1' 'KI270741.1' 'KI270742.1' 'KI270743.1' 'KI270744.1' 'KI270745.1' 'KI270746.1' 'KI270747.1' 'KI270748.1' 'KI270749.1' 'KI270750.1' 'KI270751.1' 'KI270752.1' 'KI270753.1' 'KI270754.1' 'KI270755.1' 'KI270756.1' 'KI270757.1' 'MT' 'X' 'Y'] 71.6 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%timeit -n 1 -r 1
%%R
# Distinct sequence identifiers present in the seqid column.
unique(df[["seqid"]])
[1] 1 10 11 12 13 14 [7] 15 16 17 18 19 2 [13] 20 21 22 3 4 5 [19] 6 7 8 9 GL000008.2 GL000009.2 [25] GL000194.1 GL000195.1 GL000205.2 GL000208.1 GL000213.1 GL000214.1 [31] GL000216.2 GL000218.1 GL000219.1 GL000220.1 GL000221.1 GL000224.1 [37] GL000225.1 GL000226.1 KI270302.1 KI270303.1 KI270304.1 KI270305.1 [43] KI270310.1 KI270311.1 KI270312.1 KI270315.1 KI270316.1 KI270317.1 [49] KI270320.1 KI270322.1 KI270329.1 KI270330.1 KI270333.1 KI270334.1 [55] KI270335.1 KI270336.1 KI270337.1 KI270338.1 KI270340.1 KI270362.1 [61] KI270363.1 KI270364.1 KI270366.1 KI270371.1 KI270372.1 KI270373.1 [67] KI270374.1 KI270375.1 KI270376.1 KI270378.1 KI270379.1 KI270381.1 [73] KI270382.1 KI270383.1 KI270384.1 KI270385.1 KI270386.1 KI270387.1 [79] KI270388.1 KI270389.1 KI270390.1 KI270391.1 KI270392.1 KI270393.1 [85] KI270394.1 KI270395.1 KI270396.1 KI270411.1 KI270412.1 KI270414.1 [91] KI270417.1 KI270418.1 KI270419.1 KI270420.1 KI270422.1 KI270423.1 [97] KI270424.1 KI270425.1 KI270429.1 KI270435.1 KI270438.1 KI270442.1 [103] KI270448.1 KI270465.1 KI270466.1 KI270467.1 KI270468.1 KI270507.1 [109] KI270508.1 KI270509.1 KI270510.1 KI270511.1 KI270512.1 KI270515.1 [115] KI270516.1 KI270517.1 KI270518.1 KI270519.1 KI270521.1 KI270522.1 [121] KI270528.1 KI270529.1 KI270530.1 KI270538.1 KI270539.1 KI270544.1 [127] KI270548.1 KI270579.1 KI270580.1 KI270581.1 KI270582.1 KI270583.1 [133] KI270584.1 KI270587.1 KI270588.1 KI270589.1 KI270590.1 KI270591.1 [139] KI270593.1 KI270706.1 KI270707.1 KI270708.1 KI270709.1 KI270710.1 [145] KI270711.1 KI270712.1 KI270713.1 KI270714.1 KI270715.1 KI270716.1 [151] KI270717.1 KI270718.1 KI270719.1 KI270720.1 KI270721.1 KI270722.1 [157] KI270723.1 KI270724.1 KI270725.1 KI270726.1 KI270727.1 KI270728.1 [163] KI270729.1 KI270730.1 KI270731.1 KI270732.1 KI270733.1 KI270734.1 [169] KI270735.1 KI270736.1 KI270737.1 KI270738.1 KI270739.1 KI270740.1 [175] KI270741.1 KI270742.1 KI270743.1 KI270744.1 KI270745.1 KI270746.1 [181] KI270747.1 KI270748.1 KI270749.1 
KI270750.1 KI270751.1 KI270752.1 [187] KI270753.1 KI270754.1 KI270755.1 KI270756.1 KI270757.1 MT [193] X Y 194 Levels: 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 3 4 5 6 7 8 ... Y
35.4 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%timeit -n 1 -r 1
# Shape of the array of distinct seqid values (i.e. how many there are).
df['seqid'].unique().shape
79.4 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%timeit -n 1 -r 1
%%R
# Number of distinct seqid values.
length(unique(df[["seqid"]]))
[1] 194
36.3 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%timeit -n 1 -r 1
# Row count per value of the source column.
df['source'].value_counts()
300 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%timeit -n 1 -r 1
%%R
# Row count per value of the source column, largest first.
sort(table(df[["source"]]), decreasing = TRUE)
havana ensembl_havana ensembl . mirbase 1441093 745065 228212 182510 4701 GRCh38 insdc 194 74
274 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%timeit -n 1 -r 1
global gdf
# Keep only the rows whose source column is exactly 'GRCh38'.
is_grch38 = df['source'] == 'GRCh38'
gdf = df[is_grch38]
gdf.shape
# Show ten randomly chosen rows of the subset.
print(gdf.sample(10))
seqid source type start end score strand \ 2511495 KI270389.1 GRCh38 supercontig 1 1298 . . 2594560 Y GRCh38 chromosome 2781480 56887902 . . 2511468 KI270333.1 GRCh38 supercontig 1 2699 . . 2513704 KI270739.1 GRCh38 supercontig 1 73985 . . 2511559 KI270522.1 GRCh38 supercontig 1 5674 . . 2511388 GL000224.1 GRCh38 supercontig 1 179693 . . 2511481 KI270373.1 GRCh38 supercontig 1 1451 . . 2511460 KI270312.1 GRCh38 supercontig 1 998 . . 2511573 KI270584.1 GRCh38 supercontig 1 4513 . . 2511588 KI270709.1 GRCh38 supercontig 1 66860 . . phase attributes 2511495 . ID=supercontig:KI270389.1;Alias=chrUn_KI270389... 2594560 . ID=chromosome:Y;Alias=CM000686.2,chrY,NC_00002... 2511468 . ID=supercontig:KI270333.1;Alias=chrUn_KI270333... 2513704 . ID=supercontig:KI270739.1;Alias=chr22_KI270739... 2511559 . ID=supercontig:KI270522.1;Alias=chrUn_KI270522... 2511388 . ID=supercontig:GL000224.1;Alias=chrUn_GL000224... 2511481 . ID=supercontig:KI270373.1;Alias=chrUn_KI270373... 2511460 . ID=supercontig:KI270312.1;Alias=chrUn_KI270312... 2511573 . ID=supercontig:KI270584.1;Alias=chrUn_KI270584... 2511588 . ID=supercontig:KI270709.1;Alias=chr1_KI270709v... 192 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%timeit -n 1 -r 1
%%R
# Keep only the rows whose source column is exactly "GRCh38".
gdf <- df[df[["source"]] == "GRCh38", ]
dim(gdf)
# Ten randomly chosen rows of the subset.
sample_n(gdf, 10)
seqid source type start end score strand phase 865054 16 GRCh38 chromosome 1 90338345 . . . 2511482 KI270373.1 GRCh38 supercontig 1 1451 . . . 2511636 KI270711.1 GRCh38 supercontig 1 42210 . . . 2511491 KI270384.1 GRCh38 supercontig 1 1658 . . . 2511458 KI270305.1 GRCh38 supercontig 1 1472 . . . 2511544 KI270465.1 GRCh38 supercontig 1 1774 . . . 235069 10 GRCh38 chromosome 1 133797422 . . . 2511515 KI270429.1 GRCh38 supercontig 1 1361 . . . 990811 17 GRCh38 chromosome 1 83257441 . . . 2511485 KI270376.1 GRCh38 supercontig 1 1136 . . . attributes 865054 ID=chromosome:16;Alias=CM000678.2,chr16,NC_000016.10 2511482 ID=supercontig:KI270373.1;Alias=chrUn_KI270373v1,NT_187492.1 2511636 ID=supercontig:KI270711.1;Alias=chr1_KI270711v1_random,NT_187366.1 2511491 ID=supercontig:KI270384.1;Alias=chrUn_KI270384v1,NT_187484.1 2511458 ID=supercontig:KI270305.1;Alias=chrUn_KI270305v1,NT_187399.1 2511544 ID=supercontig:KI270465.1;Alias=chrUn_KI270465v1,NT_187422.1 235069 ID=chromosome:10;Alias=CM000672.2,chr10,NC_000010.11 2511515 ID=supercontig:KI270429.1;Alias=chrUn_KI270429v1,NT_187419.1 990811 ID=chromosome:17;Alias=CM000679.2,chr17,NC_000017.11 2511485 ID=supercontig:KI270376.1;Alias=chrUn_KI270376v1,NT_187489.1
878 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%timeit -n 1 -r 1
global gdf
# assign() returns a fresh DataFrame with the extra column, so the slice
# taken from df is never written to. Coordinates are inclusive, hence the +1.
gdf = gdf.assign(length=gdf['end'] - gdf['start'] + 1)
print(gdf['length'].sum())
3096629726 52.8 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%R
# %%timeit -n 1 -r 1 gives an error due to $
# Inclusive length of every entry, then the grand total.
gdf[["length"]] <- gdf[["end"]] - gdf[["start"]] + 1
sum(gdf[["length"]])
[1] 3096629726
%%timeit -n 1 -r 1
# Share of the summed length that belongs to rows typed 'supercontig'.
print(gdf.loc[gdf['type'] == 'supercontig', 'length'].sum() / gdf['length'].sum())
0.00370219174212 2.5 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%R
# Named chromosomes in this annotation: autosomes 1-22, X, Y and MT.
# (There is no seqid "23", so the range stops at 22; c() coerces the
# numbers to character because of the string elements.)
chrs <- c(1:22, "X", "Y", "MT")
# Fraction of the summed length contributed by every seqid that is not one
# of the named chromosomes above.
sum(subset(gdf, !seqid %in% chrs)$length) / sum(gdf$length)
[1] 0.003702192
%%timeit -n 1 -r 1
global edf
# Restrict to rows produced by one of these three sources.
annotation_sources = ['ensembl', 'havana', 'ensembl_havana']
edf = df[df['source'].isin(annotation_sources)]
edf.shape
edf.sample(10)
# Row count per feature type within the subset.
print(edf['type'].value_counts())
exon 1180596 CDS 704604 five_prime_UTR 142387 three_prime_UTR 133938 transcript 96375 gene 42470 processed_transcript 28228 aberrant_processed_transcript 26944 NMD_transcript_variant 13761 lincRNA 13247 processed_pseudogene 10722 lincRNA_gene 7533 pseudogene 3049 RNA 2221 snRNA_gene 1909 snRNA 1909 snoRNA 956 snoRNA_gene 944 pseudogenic_transcript 737 rRNA 549 rRNA_gene 549 miRNA 302 V_gene_segment 216 J_gene_segment 158 VD_gene_segment 37 C_gene_segment 29 Name: type, dtype: int64 799 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%timeit -n 1 -r 1
%%R
# Restrict to rows produced by one of these three sources.
edf <- df[df$source %in% c("ensembl", "havana", "ensembl_havana"), ]
dim(edf)
# Ten randomly chosen rows of the subset.
sample_n(edf, 10)
seqid source type start end score strand 128494 1 ensembl_havana CDS 117098833 117098907 . + 2548429 X havana exon 66000422 66001071 . + 2178494 6 ensembl_havana CDS 142309544 142309644 . + 1953812 4 havana CDS 167000349 167000463 . - 1497584 2 ensembl_havana exon 178731693 178731971 . - 416225 11 ensembl CDS 66838367 66838445 . + 2439754 9 ensembl_havana exon 37854780 37854982 . + 975953 16 havana exon 79713836 79714134 . + 38652 1 ensembl_havana three_prime_UTR 24364372 24364482 . + 1013586 17 ensembl_havana five_prime_UTR 7630142 7630172 . + phase 128494 1 2548429 . 2178494 1 1953812 2 1497584 . 416225 0 2439754 . 975953 . 38652 . 1013586 . attributes 128494 ID=CDS:ENSP00000358478;Parent=transcript:ENST00000369466;protein_id=ENSP00000358478 2548429 Parent=transcript:ENST00000424241;Name=ENSE00001750385;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00001750385;rank=2;version=1 2178494 ID=CDS:ENSP00000296932;Parent=transcript:ENST00000296932;protein_id=ENSP00000296932 1953812 ID=CDS:ENSP00000420920;Parent=transcript:ENST00000506886;protein_id=ENSP00000420920 1497584 Parent=transcript:ENST00000589042;Name=ENSE00003797539;constitutive=0;ensembl_end_phase=1;ensembl_phase=1;exon_id=ENSE00003797539;rank=58;version=1 416225 ID=CDS:ENSP00000354227;Parent=transcript:ENST00000360962;protein_id=ENSP00000354227 2439754 Parent=transcript:ENST00000377724;Name=ENSE00003462136;constitutive=0;ensembl_end_phase=1;ensembl_phase=2;exon_id=ENSE00003462136;rank=4;version=1 975953 Parent=transcript:ENST00000563360;Name=ENSE00002611138;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00002611138;rank=2;version=1 38652 Parent=transcript:ENST00000350501 1013586 Parent=transcript:ENST00000380450
671 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%timeit -n 1 -r 1
global ndf
# Take the 'gene' rows as an independent copy so columns can be added later
# without touching edf.
ndf = edf[edf['type'] == 'gene'].copy()
ndf.sample(10)['attributes'].values
print(ndf.shape)
(42470, 9) 191 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%timeit -n 1 -r 1
%%R
# Keep the rows whose type is "gene"; which() drops NA comparisons the same
# way subset() does.
ndf <- edf[which(edf$type == "gene"), ]
# Attribute strings of ten randomly chosen gene rows.
sample_n(ndf, 10)[["attributes"]]
[1] ID=gene:ENSG00000163157;Name=TMOD4;biotype=protein_coding;description=tropomodulin 4 [Source:HGNC Symbol%3BAcc:HGNC:11874];gene_id=ENSG00000163157;havana_gene=OTTHUMG00000012350;havana_version=6;logic_name=ensembl_havana_gene;version=14 [2] ID=gene:ENSG00000072201;Name=LNX1;biotype=protein_coding;description=ligand of numb-protein X 1 [Source:HGNC Symbol%3BAcc:HGNC:6657];gene_id=ENSG00000072201;havana_gene=OTTHUMG00000102099;havana_version=4;logic_name=ensembl_havana_gene;version=13 [3] ID=gene:ENSG00000158122;Name=AAED1;biotype=protein_coding;description=AhpC/TSA antioxidant enzyme domain containing 1 [Source:HGNC Symbol%3BAcc:HGNC:16881];gene_id=ENSG00000158122;havana_gene=OTTHUMG00000020299;havana_version=1;logic_name=ensembl_havana_gene;version=11 [4] ID=gene:ENSG00000163083;Name=INHBB;biotype=protein_coding;description=inhibin beta B subunit [Source:HGNC Symbol%3BAcc:HGNC:6067];gene_id=ENSG00000163083;havana_gene=OTTHUMG00000131437;havana_version=1;logic_name=ensembl_havana_gene;version=5 [5] ID=gene:ENSG00000230011;Name=CTSLP4;biotype=unprocessed_pseudogene;description=cathepsin L pseudogene 4 [Source:HGNC Symbol%3BAcc:HGNC:23645];gene_id=ENSG00000230011;havana_gene=OTTHUMG00000018237;havana_version=1;logic_name=havana;version=2 [6] ID=gene:ENSG00000274601;Name=WI2-88277B6.1;biotype=unprocessed_pseudogene;gene_id=ENSG00000274601;havana_gene=OTTHUMG00000188051;havana_version=1;logic_name=havana;version=1 [7] ID=gene:ENSG00000261221;Name=ZNF865;biotype=protein_coding;description=zinc finger protein 865 [Source:HGNC Symbol%3BAcc:HGNC:38705];gene_id=ENSG00000261221;havana_gene=OTTHUMG00000177108;havana_version=1;logic_name=ensembl_havana_gene;version=3 [8] ID=gene:ENSG00000233816;Name=IFNA13;biotype=protein_coding;description=interferon%2C alpha 13 [Source:HGNC Symbol%3BAcc:HGNC:5419];gene_id=ENSG00000233816;havana_gene=OTTHUMG00000019675;havana_version=2;logic_name=ensembl_havana_gene;version=3 [9] 
ID=gene:ENSG00000270863;Name=DDX55P1;biotype=processed_pseudogene;description=DEAD-box helicase 55 pseudogene 1 [Source:HGNC Symbol%3BAcc:HGNC:49852];gene_id=ENSG00000270863;havana_gene=OTTHUMG00000184770;havana_version=1;logic_name=havana;version=1 [10] ID=gene:ENSG00000217643;Name=PTGES3P2;biotype=processed_pseudogene;description=prostaglandin E synthase 3 (cytosolic) pseudogene 2 [Source:HGNC Symbol%3BAcc:HGNC:43822];gene_id=ENSG00000217643;havana_gene=OTTHUMG00000152177;havana_version=1;logic_name=havana;version=1 1623077 Levels: external_name=Ala;logic_name=trnascan ...
2min 5s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%timeit -n 1 -r 1
import re

# Captures the value that follows 'Name=' up to the next ';' separator or
# the end of the string, so a Name in last position is still found.
RE_GENE_NAME = re.compile(r'Name=(?P<gene_name>[^;]+)')


def extract_gene_name(attributes_str):
    """Return the value of the ``Name=`` attribute in an attributes string.

    Args:
        attributes_str: ';'-separated ``key=value`` attribute text.

    Returns:
        The text after ``Name=`` up to the next ';' (or end of string), or
        '' when the string has no ``Name=`` attribute (the same fallback
        that extract_description uses).
    """
    res = RE_GENE_NAME.search(attributes_str)
    if res is None:
        return ''
    return res.group('gene_name')
# One gene name per row, parsed from that row's attributes string.
ndf['gene_name'] = ndf['attributes'].apply(extract_gene_name)
# Captures an identifier starting with 'ENSG' that follows 'gene_id=', up to
# the next ';' separator or the end of the string.
RE_GENE_ID = re.compile(r'gene_id=(?P<gene_id>ENSG[^;]+)')


def extract_gene_id(attributes_str):
    """Return the ``gene_id=`` value (an ``ENSG...`` identifier).

    Args:
        attributes_str: ';'-separated ``key=value`` attribute text.

    Returns:
        The identifier after ``gene_id=``, or '' when the string contains no
        ``gene_id=ENSG...`` attribute (the same fallback that
        extract_description uses).
    """
    res = RE_GENE_ID.search(attributes_str)
    if res is None:
        return ''
    return res.group('gene_id')
# One gene identifier per row, parsed from that row's attributes string.
ndf['gene_id'] = ndf['attributes'].apply(extract_gene_id)
# Captures the value that follows 'description=' up to the next ';'
# separator or the end of the string, so a description in last position is
# still found.
RE_DESC = re.compile(r'description=(?P<desc>[^;]+)')


def extract_description(attributes_str):
    """Return the ``description=`` value from an attributes string.

    Args:
        attributes_str: ';'-separated ``key=value`` attribute text.

    Returns:
        The description text, or '' when the string has no
        ``description=`` attribute.
    """
    res = RE_DESC.search(attributes_str)
    if res is None:
        return ''
    return res.group('desc')
# One description per row ('' where the attributes string has none).
ndf['desc'] = ndf['attributes'].apply(extract_description)
# The raw attributes column is no longer needed once it has been parsed.
del ndf['attributes']
ndf.head()
198 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%R
ptm <- proc.time()
# Each pattern spans the rest of the attributes string and keeps only the
# second capture group, i.e. the wanted value.
ndf[["gene_name"]] <- gsub(
  "(.*Name=)(.*?)(;biotype.*)", "\\2", ndf[["attributes"]]
)
ndf[["gene_id"]] <- gsub(
  "(ID=gene:)(.*?)(;Name.*)", "\\2", ndf[["attributes"]]
)
ndf[["desc"]] <- gsub(
  "(.*description=)(.*?)(;.*)", "\\2", ndf[["attributes"]]
)
# When a gene has no description the pattern does not match and gsub()
# returns the untouched string (which starts with "ID="); blank those out.
ndf[["desc"]][grepl("^ID=.*", ndf[["desc"]])] <- ""
# Drop the raw attributes column now that it has been parsed.
ndf[["attributes"]] <- NULL
print(proc.time() - ptm)
head(ndf)
user system elapsed 1.307 0.017 1.322 seqid source type start end score strand phase gene_name 17 1 havana gene 11869 14409 . + . DDX11L1 29 1 havana gene 14404 29570 . - . WASH7P 72 1 havana gene 52473 53312 . + . OR4G4P 75 1 havana gene 62948 63887 . + . OR4G11P 78 1 ensembl_havana gene 69091 70008 . + . OR4F5 109 1 havana gene 131025 134836 . + . CICP27 gene_id 17 ENSG00000223972 29 ENSG00000227232 72 ENSG00000268020 75 ENSG00000240361 78 ENSG00000186092 109 ENSG00000233750 desc 17 DEAD/H-box helicase 11 like 1 [Source:HGNC Symbol%3BAcc:HGNC:37102] 29 WAS protein family homolog 7 pseudogene [Source:HGNC Symbol%3BAcc:HGNC:38034] 72 olfactory receptor family 4 subfamily G member 4 pseudogene [Source:HGNC Symbol%3BAcc:HGNC:14822] 75 olfactory receptor family 4 subfamily G member 11 pseudogene [Source:HGNC Symbol%3BAcc:HGNC:31276] 78 olfactory receptor family 4 subfamily F member 5 [Source:HGNC Symbol%3BAcc:HGNC:14825] 109 capicua transcriptional repressor pseudogene 27 [Source:HGNC Symbol%3BAcc:HGNC:48835]
Jump to plotting
# Inclusive gene length, followed by its summary statistics.
ndf['length'] = ndf['end'] - ndf['start'] + 1
ndf['length'].describe()
count 4.247000e+04 mean 3.583348e+04 std 9.683485e+04 min 8.000000e+00 25% 8.840000e+02 50% 5.170500e+03 75% 3.055200e+04 max 2.304997e+06 Name: length, dtype: float64
%%R
# Inclusive gene length, followed by its summary statistics.
ndf[["length"]] <- ndf[["end"]] - ndf[["start"]] + 1
summary(ndf[["length"]])
Min. 1st Qu. Median Mean 3rd Qu. Max. 8 884 5170 35834 30552 2304997
%%timeit -n 1 -r 1
# 50-bin histogram of gene lengths with a logarithmic count axis.
ndf['length'].plot.hist(bins=50, logy=True)
255 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%timeit -n 1 -r 1
%%R
# 50-bin histogram of gene lengths with a log10 count axis.
ggplot(ndf, aes(x = length)) +
  geom_histogram(fill = "blue", bins = 50) +
  scale_y_log10()
1.06 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)