#!/usr/bin/env python # coding: utf-8 # **Notebook to add human histones to DataBase, used once. To delete before final merge** # In[44]: import os import pandas as pd import requests from Bio import AlignIO from Bio import SeqIO from Bio.Align.Applications import MuscleCommandline from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq import subprocess # In[2]: path = os.getcwd() # In[3]: hist_prot = pd.read_csv(os.path.join(path, "static/human_hist/info/", "human_hist_proteins.csv")) hist_genes = pd.read_csv(os.path.join(path, "static/human_hist/info/", "human_hist_genes.csv"), usecols=['Histone type', 'Histone variant', 'HGNC Symbol']) hist_prot = pd.merge(hist_prot, hist_genes.drop_duplicates(), on='HGNC Symbol', how='left') # # Add human histones to histone variant fasta # In[4]: var_dict = { 'H1.0':'H1.0', 'H1.1':'H1.1',#new 'H1.2':'H1.2',#new 'H1.3':'H1.3',#new 'H1.4':'H1.4',#new 'H1.5':'H1.5',#new 'TS H1.6':'TS_H1.6', 'TS H1.7':'TS_H1.7', 'OO H1.8':'OO_H1.8', 'TS H1.9(?)':'TS_H1.9', 'H1.10':'H1.10', 'TS H2A.1':'H2A.1', 'canonical H2A':'canonical_H2A', 'H2A.J(?)':'H2A.J',#new 'H2A.X':'H2A.X', 'H2A.Z.1':'H2A.Z', 'H2A.Z.2':'H2A.Z', 'macroH2A.1':'macroH2A', 'macroH2A.2':'macroH2A', 'H2A.B.1':'H2A.B', 'H2A.B.2':'H2A.B', 'H2A.P':'H2A.P', 'TS H2B.1':'H2B.1', 'canonical H2B':'canonical_H2B', 'canonical H2B(?)':'canonical_H2B', 'H2B.S(?)':'H2B.S',#new 'H2B.W':'H2B.W', # '?':'', #new, 'canonical H3.1':'canonical_H3', 'canonical H3.2':'canonical_H3', 'H3.Y.1':'H3.Y', 'H3.Y.2':'H3.Y', 'canonical H3(?)':'canonical_H3', 'H3.3':'H3.3', 'TS H3.4':'TS_H3.4', 'H3.5':'H3.5', 'cenH3':'cenH3', 'canonical H4':'canonical_H4', } var_dict_reverse = {v: k for k, v in var_dict.items()} # In[324]: os.system('>static/browse/seeds/H1/H1.1.fasta') os.system('>static/browse/seeds/H1/H1.2.fasta') os.system('>static/browse/seeds/H1/H1.3.fasta') os.system('>static/browse/seeds/H1/H1.4.fasta') os.system('>static/browse/seeds/H1/H1.5.fasta') os.system('>static/browse/seeds/H2A/H2A.J.fasta') os.system('>static/browse/seeds/H2B/H2B.S.fasta') # In[5]: var_type_dict = {} for hist_type in hist_prot['Histone type'].unique(): var_list = list(hist_prot['Histone variant'].loc[hist_prot['Histone type']==hist_type].unique()) var_type_dict[hist_type]=var_list # In[6]: var_type_dict # In[7]: set(var_dict.keys()) - set(hist_prot['Histone variant']) # In[8]: set(hist_prot['Histone variant']) - set(var_dict.keys()) # In[9]: get_ipython().run_cell_magic('time', '', '\nseqs=[]\n\nfor i in hist_prot[\'Protein stable ID\']:\n seq=requests.get(\'http://rest.ensembl.org/sequence/id/%s?content-type=text/plain\'%i).content\n seqs.append(seq.decode("utf-8"))\n \nhist_seq = dict(zip(hist_prot[\'RefSeq peptide ID\'],seqs))\nhist_prot[\'seq\']=seqs\n') # In[10]: len(hist_prot) # In[11]: len(hist_seq) # In[12]: hist_prot['RefSeq peptide ID'].value_counts() # In[13]: for i in ['NP_004884', 'NP_542160','NP_001299582','NP_001035248']: print( hist_prot['seq'].loc[hist_prot['RefSeq peptide ID'] == i].nunique() ) # не буду добавлять GI в описание # - https://ncbiinsights.ncbi.nlm.nih.gov/2016/12/06/converting-gi-numbers-to-accession-version/ # In[326]: #for hist_type in ['H1', 'H2A', 'H2B', 'H3', 'H4']: added_id = [] for hist_type in ['H1', 'H2A', 'H2B', 'H3', 'H4']: print('====================') for hist_var in var_type_dict[hist_type]: if hist_var!='?': print('---------------------') print(hist_var) hist_var_histdb = var_dict[hist_var] try: alignment = AlignIO.read(os.path.join(path,"static/browse/seeds/{}/".format(hist_type),"{}.fasta".format(hist_var_histdb)), "fasta") fasta_variant = [] for record in alignment: fasta_variant.append(record.id.split('|')[1].split('.')[0]) var_list = hist_prot['RefSeq peptide ID'].loc[hist_prot['Histone variant']=='{}'.format(hist_var)].values for refseq in var_list: if refseq in fasta_variant: print(f'{refseq} in file') else: if not pd.isna(refseq): print(f'{refseq} not in file, added') alignment.add_sequence(f'Homo|{refseq}|{hist_var_histdb}', hist_seq[refseq]) added_id.append(refseq) SeqIO.write(alignment, os.path.join(path,"static/browse/seeds/{}/".format(hist_type),"{}.fasta".format(hist_var_histdb)), "fasta") except: var_list = hist_prot['RefSeq peptide ID'].loc[hist_prot['Histone variant']=='{}'.format(hist_var)].values records = [] for refseq in var_list: if not pd.isna(refseq): print(f'{refseq} added to new file') record = SeqRecord(Seq(hist_seq[refseq]), id=f'Homo|{refseq}|{hist_var_histdb}', name=f'Homo|{refseq}|{hist_var_histdb}', description=f'Homo|{refseq}|{hist_var_histdb}') records.append(record) added_id.append(refseq) SeqIO.write(records, os.path.join(path,"static/browse/seeds/{}/".format(hist_type),"{}.fasta".format(hist_var_histdb)), "fasta") os.chdir(f'{path}/static/browse/seeds/{hist_type}/') os.system(f'muscle -in {hist_var_histdb}.fasta -out {hist_var_histdb}.fasta') # In[329]: len(added_id) # In[144]: # # Make Histone type fasta # In[330]: os.chdir(f'{path}/static/browse/seeds') # In[331]: get_ipython().run_cell_magic('bash', '', '\n\nfor var in H1 H2A H2B H3 H4\ndo\n echo $var\n for file in `ls ${var}/*.fasta`\n do\n cat $file >> ${var}_temp.fasta\n done\n \ndone;\n') # In[332]: for hist_type in ['H1', 'H2A', 'H2B', 'H3', 'H4']: with open(f"{hist_type}_ungap.fasta", "w") as o: for record in SeqIO.parse(f'{hist_type}_temp.fasta', 'fasta'): record.seq = record.seq.ungap("-") SeqIO.write(record, o, "fasta") output = subprocess.run(["muscle", "-in", f"{hist_type}_ungap.fasta", "-out", f"{hist_type}_with_human.fasta"], capture_output=True) #subprocess.run(["rm", f"{hist_type}_temp.fasta"]) # In[333]: get_ipython().run_cell_magic('bash', '', '\nfor hist_type in H1 H2A H2B H3 H4\ndo\n rm ${hist_type}_temp.fasta\n rm ${hist_type}_ungap.fasta\ndone\n') # In[ ]: # In[303]: temp = pd.read_csv(os.path.join(path, "static/human_hist/info/", "human_hist_genes.csv")) # In[310]: temp.loc[temp['Histone variant'].isin(['H1.1', 'H1.2', 'H1.3','H1.4', 'H1.5', 'H2A.J(?)', 'H2B.S(?)'])] # In[ ]: