#!/usr/bin/env python
# coding: utf-8

# 
# This is a re-implementation of some Pythona and R code based off the article, https://shiring.github.io/r_vs_python/2017/01/22/R_vs_Py_post which was based off of https://www.toptal.com/python/comprehensive-introduction-your-genome-scipy
# 

# In[1]:


# !wget ftp://ftp.ensembl.org/pub/release-85/gff3/homo_sapiens/Homo_sapiens.GRCh38.85.gff3.gz


# In[2]:


import pandas as pd
import matplotlib as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:


import warnings
warnings.filterwarnings('ignore')


# In[4]:


get_ipython().run_line_magic('load_ext', 'rpy2.ipython')


# In[5]:


get_ipython().run_cell_magic('R', '', 'library(dplyr)\nlibrary(ggplot2)\n')


# In[6]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', "global df\ncol_names = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']\ndf = pd.read_csv('Homo_sapiens.GRCh38.85.gff3.gz', compression='gzip',\n                         sep='\\t', comment='#', low_memory=False,\n                         header=None, names=col_names)\n")


# In[7]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', '%%R\ndf <- read.csv("Homo_sapiens.GRCh38.85.gff3.gz", \n               header = FALSE, \n               sep = "\\t", \n               col.names = c(\'seqid\', \'source\', \'type\', \'start\', \'end\', \'score\', \'strand\', \'phase\', \'attributes\'), \n               comment.char = "#")\nhead(df)\n')


# In[8]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', 'print(df.seqid.unique())\n')


# In[9]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', '%%R\nunique(df$seqid)\n')


# In[10]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', 'df.seqid.unique().shape\n')


# In[11]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', '%%R\nlength(unique(df$seqid))\n')


# In[12]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', 'df.source.value_counts()\n')


# In[13]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', '%%R\nsort(table(df$source), decreasing = TRUE)\n')


# In[14]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', "global gdf\ngdf = df[df.source == 'GRCh38']\ngdf.shape\nprint(gdf.sample(10))\n")


# In[15]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', '%%R\ngdf <- df[df$source == "GRCh38", ]\ndim(gdf)\nsample_n(gdf, 10)\n')


# In[16]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', "global gdf\ngdf = gdf.copy()\ngdf['length'] = gdf.end - gdf.start + 1\n\nprint(gdf.length.sum())\n")


# In[17]:


get_ipython().run_cell_magic('R', '', '#  %%timeit -n 1 -r 1  gives an error due to $\ngdf$length <- gdf$end - gdf$start + 1\nsum(gdf$length)\n')


# In[18]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', "print(gdf[(gdf['type'] == 'supercontig')].length.sum() / gdf.length.sum())\n")


# In[19]:


get_ipython().run_cell_magic('R', '', 'chrs <- c(1:23, "X", "Y", "MT")\nsum(subset(gdf, !seqid %in% chrs)$length) / sum(gdf$length)\n')


# In[20]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', "global edf\nedf = df[df.source.isin(['ensembl', 'havana', 'ensembl_havana'])]\nedf.shape\n\nedf.sample(10)\nprint(edf.type.value_counts())\n")


# In[21]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', '%%R\nedf <- subset(df, source %in% c("ensembl", "havana", "ensembl_havana"))\ndim(edf)\nsample_n(edf, 10)\n')


# In[22]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', "global ndf\nndf = edf[edf.type == 'gene']\nndf = ndf.copy()\nndf.sample(10).attributes.values\nprint(ndf.shape)\n")


# In[23]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', '%%R\nndf <- subset(edf, type == "gene")\nsample_n(ndf, 10)$attributes\n')


# In[24]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', "import re\n\nRE_GENE_NAME = re.compile(r'Name=(?P<gene_name>.+?);')\ndef extract_gene_name(attributes_str):\n    res = RE_GENE_NAME.search(attributes_str)\n    return res.group('gene_name')\n\n\nndf['gene_name'] = ndf.attributes.apply(extract_gene_name)\n\nRE_GENE_ID = re.compile(r'gene_id=(?P<gene_id>ENSG.+?);')\ndef extract_gene_id(attributes_str):\n    res = RE_GENE_ID.search(attributes_str)\n    return res.group('gene_id')\n\n\nndf['gene_id'] = ndf.attributes.apply(extract_gene_id)\n\n\nRE_DESC = re.compile('description=(?P<desc>.+?);')\ndef extract_description(attributes_str):\n    res = RE_DESC.search(attributes_str)\n    if res is None:\n        return ''\n    else:\n        return res.group('desc')\n\n\nndf['desc'] = ndf.attributes.apply(extract_description)\n\nndf.drop('attributes', axis=1, inplace=True)\nndf.head()\n")


# In[25]:


get_ipython().run_cell_magic('R', '', 'ptm <- proc.time()\nndf$gene_name <- gsub("(.*Name=)(.*?)(;biotype.*)", "\\\\2", ndf$attributes)\nndf$gene_id <- gsub("(ID=gene:)(.*?)(;Name.*)", "\\\\2", ndf$attributes)\nndf$desc <- gsub("(.*description=)(.*?)(;.*)", "\\\\2", ndf$attributes)\n\n# some genes don\'t have a description\nndf$desc <- ifelse(grepl("^ID=.*", ndf$desc), "", ndf$desc)\n\nndf <- subset(ndf, select = -attributes)\nprint(proc.time() - ptm)\nhead(ndf)\n')


# Jump to plotting
# 

# In[26]:


ndf['length'] = ndf.end - ndf.start + 1
ndf.length.describe()


# In[27]:


get_ipython().run_cell_magic('R', '', 'ndf$length <- ndf$end - ndf$start + 1\nsummary(ndf$length)\n')


# In[28]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', "ndf.length.plot(kind='hist', bins=50, logy=True)\n")


# In[29]:


get_ipython().run_cell_magic('timeit', '-n 1 -r 1', '%%R\nndf %>% ggplot(aes(x = length)) + \n  geom_histogram(bins = 50, fill = "blue") + \n  scale_y_log10()\n')


# In[ ]:


# In[ ]: