#!/usr/bin/env python
# coding: utf-8

# **Notebook to add human histones to DataBase, used once. To delete before final merge**

# In[44]:


import os
import pandas as pd
import requests

from Bio import AlignIO
from Bio import SeqIO
from Bio.Align.Applications import MuscleCommandline

from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

import subprocess


# In[2]:


path = os.getcwd()


# In[3]:


hist_prot = pd.read_csv(os.path.join(path, "static/human_hist/info/", "human_hist_proteins.csv"))
hist_genes = pd.read_csv(os.path.join(path, "static/human_hist/info/", "human_hist_genes.csv"), usecols=['Histone type', 'Histone variant', 'HGNC Symbol'])

hist_prot = pd.merge(hist_prot, hist_genes.drop_duplicates(), on='HGNC Symbol', how='left')


# # Add human histones to histone variant fasta

# In[4]:


var_dict = {
'H1.0':'H1.0',
'H1.1':'H1.1',#new
'H1.2':'H1.2',#new
'H1.3':'H1.3',#new
'H1.4':'H1.4',#new
'H1.5':'H1.5',#new
'TS H1.6':'TS_H1.6',
'TS H1.7':'TS_H1.7',
'OO H1.8':'OO_H1.8',
'TS H1.9(?)':'TS_H1.9',
'H1.10':'H1.10',
'TS H2A.1':'H2A.1',
'canonical H2A':'canonical_H2A',
'H2A.J(?)':'H2A.J',#new
'H2A.X':'H2A.X',
'H2A.Z.1':'H2A.Z',
'H2A.Z.2':'H2A.Z',
'macroH2A.1':'macroH2A',
'macroH2A.2':'macroH2A',
'H2A.B.1':'H2A.B',
'H2A.B.2':'H2A.B',
'H2A.P':'H2A.P',
'TS H2B.1':'H2B.1', 
'canonical H2B':'canonical_H2B',
'canonical H2B(?)':'canonical_H2B',
'H2B.S(?)':'H2B.S',#new
'H2B.W':'H2B.W',
# '?':'', #new,
'canonical H3.1':'canonical_H3',
'canonical H3.2':'canonical_H3',
'H3.Y.1':'H3.Y',
'H3.Y.2':'H3.Y',
'canonical H3(?)':'canonical_H3',
'H3.3':'H3.3',
'TS H3.4':'TS_H3.4',
'H3.5':'H3.5',
'cenH3':'cenH3',
'canonical H4':'canonical_H4',
}

var_dict_reverse = {v: k for k, v in var_dict.items()}


# In[324]:


os.system('>static/browse/seeds/H1/H1.1.fasta')
os.system('>static/browse/seeds/H1/H1.2.fasta')
os.system('>static/browse/seeds/H1/H1.3.fasta')
os.system('>static/browse/seeds/H1/H1.4.fasta')
os.system('>static/browse/seeds/H1/H1.5.fasta')
os.system('>static/browse/seeds/H2A/H2A.J.fasta')
os.system('>static/browse/seeds/H2B/H2B.S.fasta')


# In[5]:


var_type_dict = {}

for hist_type in hist_prot['Histone type'].unique():
    var_list = list(hist_prot['Histone variant'].loc[hist_prot['Histone type']==hist_type].unique())
    var_type_dict[hist_type]=var_list


# In[6]:


var_type_dict


# In[7]:


set(var_dict.keys()) - set(hist_prot['Histone variant'])


# In[8]:


set(hist_prot['Histone variant']) - set(var_dict.keys())


# In[9]:


get_ipython().run_cell_magic('time', '', '\nseqs=[]\n\nfor i in hist_prot[\'Protein stable ID\']:\n    seq=requests.get(\'http://rest.ensembl.org/sequence/id/%s?content-type=text/plain\'%i).content\n    seqs.append(seq.decode("utf-8"))\n    \nhist_seq = dict(zip(hist_prot[\'RefSeq peptide ID\'],seqs))\nhist_prot[\'seq\']=seqs\n')


# In[10]:


len(hist_prot)


# In[11]:


len(hist_seq)


# In[12]:


hist_prot['RefSeq peptide ID'].value_counts()


# In[13]:


for i in ['NP_004884', 'NP_542160','NP_001299582','NP_001035248']:
    print( hist_prot['seq'].loc[hist_prot['RefSeq peptide ID'] == i].nunique() )


# не буду добавлять GI в описание
# - https://ncbiinsights.ncbi.nlm.nih.gov/2016/12/06/converting-gi-numbers-to-accession-version/

# In[326]:


#for hist_type in ['H1', 'H2A', 'H2B', 'H3', 'H4']:

added_id = []

for hist_type in ['H1', 'H2A', 'H2B', 'H3', 'H4']:

    print('====================')
    for hist_var in var_type_dict[hist_type]: 
        
        if hist_var!='?':
            
            print('---------------------')
            print(hist_var)

            hist_var_histdb = var_dict[hist_var] 

            try:
                alignment = AlignIO.read(os.path.join(path,"static/browse/seeds/{}/".format(hist_type),"{}.fasta".format(hist_var_histdb)), "fasta")

                fasta_variant = []

                for record in alignment:
                    fasta_variant.append(record.id.split('|')[1].split('.')[0])

                var_list = hist_prot['RefSeq peptide ID'].loc[hist_prot['Histone variant']=='{}'.format(hist_var)].values


                for refseq in var_list:
                    if refseq in fasta_variant:
                        print(f'{refseq} in file')
                    else:
                        if not pd.isna(refseq):
                            print(f'{refseq} not in file, added') 
                            alignment.add_sequence(f'Homo|{refseq}|{hist_var_histdb}', hist_seq[refseq])
                            added_id.append(refseq)

                SeqIO.write(alignment, os.path.join(path,"static/browse/seeds/{}/".format(hist_type),"{}.fasta".format(hist_var_histdb)), "fasta")


            except:
                var_list = hist_prot['RefSeq peptide ID'].loc[hist_prot['Histone variant']=='{}'.format(hist_var)].values
                records = []
                for refseq in var_list:
                    if not pd.isna(refseq):
                        print(f'{refseq} added to new file') 
                        record = SeqRecord(Seq(hist_seq[refseq]),
                           id=f'Homo|{refseq}|{hist_var_histdb}', name=f'Homo|{refseq}|{hist_var_histdb}',
                           description=f'Homo|{refseq}|{hist_var_histdb}')
                        records.append(record)
                        added_id.append(refseq)
                SeqIO.write(records, os.path.join(path,"static/browse/seeds/{}/".format(hist_type),"{}.fasta".format(hist_var_histdb)), "fasta")


            os.chdir(f'{path}/static/browse/seeds/{hist_type}/')
            os.system(f'muscle -in {hist_var_histdb}.fasta -out {hist_var_histdb}.fasta')


# In[329]:


len(added_id)


# In[144]:


# # Make Histone type fasta

# In[330]:


os.chdir(f'{path}/static/browse/seeds')


# In[331]:


get_ipython().run_cell_magic('bash', '', '\n\nfor var in H1 H2A H2B H3 H4\ndo\n    echo $var\n    for file in `ls ${var}/*.fasta`\n    do\n        cat $file >> ${var}_temp.fasta\n    done\n    \ndone;\n')


# In[332]:


for hist_type in ['H1', 'H2A', 'H2B', 'H3', 'H4']:
    with open(f"{hist_type}_ungap.fasta", "w") as o:
        for record in SeqIO.parse(f'{hist_type}_temp.fasta', 'fasta'):
            record.seq = record.seq.ungap("-")
            SeqIO.write(record, o, "fasta")
    output = subprocess.run(["muscle", "-in", f"{hist_type}_ungap.fasta", "-out", f"{hist_type}_with_human.fasta"], capture_output=True)
    #subprocess.run(["rm", f"{hist_type}_temp.fasta"])
    

# In[333]:


get_ipython().run_cell_magic('bash', '', '\nfor hist_type in H1 H2A H2B H3 H4\ndo\n    rm ${hist_type}_temp.fasta\n    rm ${hist_type}_ungap.fasta\ndone\n')


# In[ ]:


# In[303]:


temp = pd.read_csv(os.path.join(path, "static/human_hist/info/", "human_hist_genes.csv"))


# In[310]:


temp.loc[temp['Histone variant'].isin(['H1.1', 'H1.2', 'H1.3','H1.4', 'H1.5', 'H2A.J(?)', 'H2B.S(?)'])]


# In[ ]: