#!/usr/bin/env python # coding: utf-8 # # This is a re-implementation of some Pythona and R code based off the article, https://shiring.github.io/r_vs_python/2017/01/22/R_vs_Py_post which was based off of https://www.toptal.com/python/comprehensive-introduction-your-genome-scipy # # In[1]: # !wget ftp://ftp.ensembl.org/pub/release-85/gff3/homo_sapiens/Homo_sapiens.GRCh38.85.gff3.gz # In[2]: import pandas as pd import matplotlib as plt get_ipython().run_line_magic('matplotlib', 'inline') # In[3]: import warnings warnings.filterwarnings('ignore') # In[4]: get_ipython().run_line_magic('load_ext', 'rpy2.ipython') # In[5]: get_ipython().run_cell_magic('R', '', 'library(dplyr)\nlibrary(ggplot2)\n') # In[6]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', "global df\ncol_names = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']\ndf = pd.read_csv('Homo_sapiens.GRCh38.85.gff3.gz', compression='gzip',\n sep='\\t', comment='#', low_memory=False,\n header=None, names=col_names)\n") # In[7]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', '%%R\ndf <- read.csv("Homo_sapiens.GRCh38.85.gff3.gz", \n header = FALSE, \n sep = "\\t", \n col.names = c(\'seqid\', \'source\', \'type\', \'start\', \'end\', \'score\', \'strand\', \'phase\', \'attributes\'), \n comment.char = "#")\nhead(df)\n') # In[8]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', 'print(df.seqid.unique())\n') # In[9]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', '%%R\nunique(df$seqid)\n') # In[10]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', 'df.seqid.unique().shape\n') # In[11]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', '%%R\nlength(unique(df$seqid))\n') # In[12]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', 'df.source.value_counts()\n') # In[13]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', '%%R\nsort(table(df$source), decreasing = TRUE)\n') # In[14]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', "global gdf\ngdf = df[df.source == 'GRCh38']\ngdf.shape\nprint(gdf.sample(10))\n") # In[15]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', '%%R\ngdf <- df[df$source == "GRCh38", ]\ndim(gdf)\nsample_n(gdf, 10)\n') # In[16]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', "global gdf\ngdf = gdf.copy()\ngdf['length'] = gdf.end - gdf.start + 1\n\nprint(gdf.length.sum())\n") # In[17]: get_ipython().run_cell_magic('R', '', '# %%timeit -n 1 -r 1 gives an error due to $\ngdf$length <- gdf$end - gdf$start + 1\nsum(gdf$length)\n') # In[18]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', "print(gdf[(gdf['type'] == 'supercontig')].length.sum() / gdf.length.sum())\n") # In[19]: get_ipython().run_cell_magic('R', '', 'chrs <- c(1:23, "X", "Y", "MT")\nsum(subset(gdf, !seqid %in% chrs)$length) / sum(gdf$length)\n') # In[20]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', "global edf\nedf = df[df.source.isin(['ensembl', 'havana', 'ensembl_havana'])]\nedf.shape\n\nedf.sample(10)\nprint(edf.type.value_counts())\n") # In[21]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', '%%R\nedf <- subset(df, source %in% c("ensembl", "havana", "ensembl_havana"))\ndim(edf)\nsample_n(edf, 10)\n') # In[22]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', "global ndf\nndf = edf[edf.type == 'gene']\nndf = ndf.copy()\nndf.sample(10).attributes.values\nprint(ndf.shape)\n") # In[23]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', '%%R\nndf <- subset(edf, type == "gene")\nsample_n(ndf, 10)$attributes\n') # In[24]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', "import re\n\nRE_GENE_NAME = re.compile(r'Name=(?P.+?);')\ndef extract_gene_name(attributes_str):\n res = RE_GENE_NAME.search(attributes_str)\n return res.group('gene_name')\n\n\nndf['gene_name'] = ndf.attributes.apply(extract_gene_name)\n\nRE_GENE_ID = re.compile(r'gene_id=(?PENSG.+?);')\ndef extract_gene_id(attributes_str):\n res = RE_GENE_ID.search(attributes_str)\n return res.group('gene_id')\n\n\nndf['gene_id'] = ndf.attributes.apply(extract_gene_id)\n\n\nRE_DESC = re.compile('description=(?P.+?);')\ndef extract_description(attributes_str):\n res = RE_DESC.search(attributes_str)\n if res is None:\n return ''\n else:\n return res.group('desc')\n\n\nndf['desc'] = ndf.attributes.apply(extract_description)\n\nndf.drop('attributes', axis=1, inplace=True)\nndf.head()\n") # In[25]: get_ipython().run_cell_magic('R', '', 'ptm <- proc.time()\nndf$gene_name <- gsub("(.*Name=)(.*?)(;biotype.*)", "\\\\2", ndf$attributes)\nndf$gene_id <- gsub("(ID=gene:)(.*?)(;Name.*)", "\\\\2", ndf$attributes)\nndf$desc <- gsub("(.*description=)(.*?)(;.*)", "\\\\2", ndf$attributes)\n\n# some genes don\'t have a description\nndf$desc <- ifelse(grepl("^ID=.*", ndf$desc), "", ndf$desc)\n\nndf <- subset(ndf, select = -attributes)\nprint(proc.time() - ptm)\nhead(ndf)\n') # Jump to plotting # # In[26]: ndf['length'] = ndf.end - ndf.start + 1 ndf.length.describe() # In[27]: get_ipython().run_cell_magic('R', '', 'ndf$length <- ndf$end - ndf$start + 1\nsummary(ndf$length)\n') # In[28]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', "ndf.length.plot(kind='hist', bins=50, logy=True)\n") # In[29]: get_ipython().run_cell_magic('timeit', '-n 1 -r 1', '%%R\nndf %>% ggplot(aes(x = length)) + \n geom_histogram(bins = 50, fill = "blue") + \n scale_y_log10()\n') # In[ ]: # In[ ]: