Notebook to add human histones to DataBase, used once. To delete before final merge

In [44]:

import os
import pandas as pd
import requests

from Bio import AlignIO
from Bio import SeqIO
from Bio.Align.Applications import MuscleCommandline

from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

import subprocess

In [2]:

path = os.getcwd()

In [3]:

hist_prot = pd.read_csv(os.path.join(path, "static/human_hist/info/", "human_hist_proteins.csv"))
hist_genes = pd.read_csv(os.path.join(path, "static/human_hist/info/", "human_hist_genes.csv"), usecols=['Histone type', 'Histone variant', 'HGNC Symbol'])

hist_prot = pd.merge(hist_prot, hist_genes.drop_duplicates(), on='HGNC Symbol', how='left')

Add human histones to histone variant fasta¶

In [4]:

var_dict = {
'H1.0':'H1.0',
'H1.1':'H1.1',#new
'H1.2':'H1.2',#new
'H1.3':'H1.3',#new
'H1.4':'H1.4',#new
'H1.5':'H1.5',#new
'TS H1.6':'TS_H1.6',
'TS H1.7':'TS_H1.7',
'OO H1.8':'OO_H1.8',
'TS H1.9(?)':'TS_H1.9',
'H1.10':'H1.10',
'TS H2A.1':'H2A.1',
'canonical H2A':'canonical_H2A',
'H2A.J(?)':'H2A.J',#new
'H2A.X':'H2A.X',
'H2A.Z.1':'H2A.Z',
'H2A.Z.2':'H2A.Z',
'macroH2A.1':'macroH2A',
'macroH2A.2':'macroH2A',
'H2A.B.1':'H2A.B',
'H2A.B.2':'H2A.B',
'H2A.P':'H2A.P',
'TS H2B.1':'H2B.1', 
'canonical H2B':'canonical_H2B',
'canonical H2B(?)':'canonical_H2B',
'H2B.S(?)':'H2B.S',#new
'H2B.W':'H2B.W',
# '?':'', #new,
'canonical H3.1':'canonical_H3',
'canonical H3.2':'canonical_H3',
'H3.Y.1':'H3.Y',
'H3.Y.2':'H3.Y',
'canonical H3(?)':'canonical_H3',
'H3.3':'H3.3',
'TS H3.4':'TS_H3.4',
'H3.5':'H3.5',
'cenH3':'cenH3',
'canonical H4':'canonical_H4',
}

var_dict_reverse = {v: k for k, v in var_dict.items()}

In [324]:

os.system('>static/browse/seeds/H1/H1.1.fasta')
os.system('>static/browse/seeds/H1/H1.2.fasta')
os.system('>static/browse/seeds/H1/H1.3.fasta')
os.system('>static/browse/seeds/H1/H1.4.fasta')
os.system('>static/browse/seeds/H1/H1.5.fasta')
os.system('>static/browse/seeds/H2A/H2A.J.fasta')
os.system('>static/browse/seeds/H2B/H2B.S.fasta')

Out[324]:

In [5]:

var_type_dict = {}

for hist_type in hist_prot['Histone type'].unique():
    var_list = list(hist_prot['Histone variant'].loc[hist_prot['Histone type']==hist_type].unique())
    var_type_dict[hist_type]=var_list

In [6]:

var_type_dict

Out[6]:

{'H1': ['H1.0',
  'H1.1',
  'H1.2',
  'H1.3',
  'H1.4',
  'H1.5',
  'TS H1.6',
  'TS H1.7',
  'OO H1.8',
  'TS H1.9(?)',
  'H1.10'],
 'H2A': ['TS H2A.1',
  'canonical H2A',
  'H2A.J(?)',
  'H2A.X',
  'H2A.Z.1',
  'H2A.Z.2',
  'macroH2A.1',
  'macroH2A.2',
  'H2A.B.1',
  'H2A.B.2',
  'H2A.P'],
 'H2B': ['TS H2B.1',
  'canonical H2B',
  'canonical H2B(?)',
  'H2B.S(?)',
  'H2B.W',
  '?'],
 'H3': ['canonical H3.1',
  'canonical H3.2',
  'H3.Y.1',
  'H3.Y.2',
  'canonical H3(?)',
  'H3.3',
  'TS H3.4',
  'H3.5',
  'cenH3'],
 'H4': ['canonical H4']}

In [7]:

set(var_dict.keys()) - set(hist_prot['Histone variant'])

Out[7]:

set()

In [8]:

set(hist_prot['Histone variant']) - set(var_dict.keys())

Out[8]:

{'?'}

In [9]:

%%time

seqs=[]

for i in hist_prot['Protein stable ID']:
    seq=requests.get('http://rest.ensembl.org/sequence/id/%s?content-type=text/plain'%i).content
    seqs.append(seq.decode("utf-8"))
    
hist_seq = dict(zip(hist_prot['RefSeq peptide ID'],seqs))
hist_prot['seq']=seqs

CPU times: user 451 ms, sys: 69.2 ms, total: 521 ms
Wall time: 2min 43s

In [10]:

len(hist_prot)

Out[10]:

In [11]:

len(hist_seq)

Out[11]:

In [12]:

hist_prot['RefSeq peptide ID'].value_counts()

Out[12]:

NP_001157888    1
NP_003518       1
NP_003537       1
NP_734466       1
NP_005311       1
               ..
NP_066544       1
NP_001013721    1
NP_003514       1
NP_958925       1
NP_003534       1
Name: RefSeq peptide ID, Length: 104, dtype: int64

In [13]:

for i in ['NP_004884', 'NP_542160','NP_001299582','NP_001035248']:
    print( hist_prot['seq'].loc[hist_prot['RefSeq peptide ID'] == i].nunique() )

не буду добавлять GI в описание

https://ncbiinsights.ncbi.nlm.nih.gov/2016/12/06/converting-gi-numbers-to-accession-version/

In [326]:

#for hist_type in ['H1', 'H2A', 'H2B', 'H3', 'H4']:

added_id = []

for hist_type in ['H1', 'H2A', 'H2B', 'H3', 'H4']:

    print('====================')
    for hist_var in var_type_dict[hist_type]: 
        
        if hist_var!='?':
            
            print('---------------------')
            print(hist_var)

            hist_var_histdb = var_dict[hist_var] 

            try:
                alignment = AlignIO.read(os.path.join(path,"static/browse/seeds/{}/".format(hist_type),"{}.fasta".format(hist_var_histdb)), "fasta")

                fasta_variant = []

                for record in alignment:
                    fasta_variant.append(record.id.split('|')[1].split('.')[0])

                var_list = hist_prot['RefSeq peptide ID'].loc[hist_prot['Histone variant']=='{}'.format(hist_var)].values


                for refseq in var_list:
                    if refseq in fasta_variant:
                        print(f'{refseq} in file')
                    else:
                        if not pd.isna(refseq):
                            print(f'{refseq} not in file, added') 
                            alignment.add_sequence(f'Homo|{refseq}|{hist_var_histdb}', hist_seq[refseq])
                            added_id.append(refseq)

                SeqIO.write(alignment, os.path.join(path,"static/browse/seeds/{}/".format(hist_type),"{}.fasta".format(hist_var_histdb)), "fasta")



            except:
                var_list = hist_prot['RefSeq peptide ID'].loc[hist_prot['Histone variant']=='{}'.format(hist_var)].values
                records = []
                for refseq in var_list:
                    if not pd.isna(refseq):
                        print(f'{refseq} added to new file') 
                        record = SeqRecord(Seq(hist_seq[refseq]),
                           id=f'Homo|{refseq}|{hist_var_histdb}', name=f'Homo|{refseq}|{hist_var_histdb}',
                           description=f'Homo|{refseq}|{hist_var_histdb}')
                        records.append(record)
                        added_id.append(refseq)
                SeqIO.write(records, os.path.join(path,"static/browse/seeds/{}/".format(hist_type),"{}.fasta".format(hist_var_histdb)), "fasta")


            os.chdir(f'{path}/static/browse/seeds/{hist_type}/')
            os.system(f'muscle -in {hist_var_histdb}.fasta -out {hist_var_histdb}.fasta')

====================
---------------------
H1.0
NP_005309 in file
---------------------
H1.1
NP_005316 added to new file
---------------------
H1.2
NP_005310 added to new file
---------------------
H1.3
NP_005311 added to new file
---------------------
H1.4
NP_005312 added to new file
---------------------
H1.5
NP_005313 added to new file
---------------------
TS H1.6
NP_005314 in file
---------------------
TS H1.7
NP_861453 in file
---------------------
OO H1.8
NP_722575 in file
NP_001295191 not in file, added
---------------------
TS H1.9(?)
---------------------
H1.10
NP_006017 in file
====================
---------------------
TS H2A.1
NP_734466 in file
---------------------
canonical H2A
NP_003504 not in file, added
NP_003503 not in file, added
NP_066409 not in file, added
NP_066390 in file
NP_066408 not in file, added
NP_542163 not in file, added
NP_003500 not in file, added
NP_066544 not in file, added
NP_003501 not in file, added
NP_003502 not in file, added
NP_003505 not in file, added
NP_003507 not in file, added
NP_001035807 not in file, added
NP_003508 not in file, added
NP_778235 not in file, added
NP_254280 not in file, added
---------------------
H2A.J(?)
NP_808760 added to new file
---------------------
H2A.X
NP_002096 in file
---------------------
H2A.Z.1
NP_002097 in file
---------------------
H2A.Z.2
NP_619541 not in file, added
NP_958925 not in file, added
NP_036544 in file
NP_958844 not in file, added
NP_958924 not in file, added
---------------------
macroH2A.1
NP_613258 in file
NP_001035248 not in file, added
NP_613075 not in file, added
---------------------
macroH2A.2
NP_061119 in file
---------------------
H2A.B.1
NP_001017990 in file
---------------------
H2A.B.2
NP_001017991 not in file, added
NP_542451 in file
---------------------
H2A.P
NP_036406 in file
====================
---------------------
TS H2B.1
NP_733759 in file
---------------------
canonical H2B
NP_066406 not in file, added
NP_003517 not in file, added
NP_066407 not in file, added
NP_003514 not in file, added
NP_003513 not in file, added
NP_003516 not in file, added
NP_003515 not in file, added
NP_003509 not in file, added
NP_066402 in file
NP_001299582 not in file, added
NP_003510 not in file, added
NP_003512 not in file, added
NP_003511 not in file, added
NP_003518 not in file, added
NP_001019770 not in file, added
NP_001154806 not in file, added
NP_778225 not in file, added
---------------------
canonical H2B(?)
NP_003519 not in file, added
---------------------
H2B.S(?)
NP_059141 added to new file
---------------------
H2B.W
NP_001002916 not in file, added
====================
---------------------
canonical H3.1
NP_003520 in file
NP_003528 not in file, added
NP_003522 not in file, added
NP_003521 not in file, added
NP_003523 not in file, added
NP_066298 not in file, added
NP_003525 not in file, added
NP_003527 not in file, added
NP_003524 not in file, added
NP_003526 not in file, added
---------------------
canonical H3.2
NP_001116847 not in file, added
NP_066403 not in file, added
NP_001005464 not in file, added
---------------------
H3.Y.1
NP_001342187 not in file, added
---------------------
H3.Y.2
NP_001358848 not in file, added
---------------------
canonical H3(?)
NP_001342338 not in file, added
---------------------
H3.3
NP_002098 not in file, added
NP_005315 not in file, added
---------------------
TS H3.4
NP_003484 in file
---------------------
H3.5
NP_001013721 in file
---------------------
cenH3
NP_001800 in file
NP_001035891 not in file, added
====================
---------------------
canonical H4
NP_003529 not in file, added
NP_003535 not in file, added
NP_003533 not in file, added
NP_003530 not in file, added
NP_003536 not in file, added
NP_003531 not in file, added
NP_003538 in file
NP_003534 not in file, added
NP_003486 not in file, added
NP_068803 not in file, added
NP_003532 not in file, added
NP_003537 not in file, added
NP_003539 not in file, added
NP_001029249 in file
NP_778224 not in file, added

In [329]:

len(added_id)

Out[329]:

In [144]:

Make Histone type fasta¶

In [330]:

os.chdir(f'{path}/static/browse/seeds')

In [331]:

%%bash


for var in H1 H2A H2B H3 H4
do
    echo $var
    for file in `ls ${var}/*.fasta`
    do
        cat $file >> ${var}_temp.fasta
    done
    
done;

H1
H2A
H2B
H3
H4

In [332]:

for hist_type in ['H1', 'H2A', 'H2B', 'H3', 'H4']:
    with open(f"{hist_type}_ungap.fasta", "w") as o:
        for record in SeqIO.parse(f'{hist_type}_temp.fasta', 'fasta'):
            record.seq = record.seq.ungap("-")
            SeqIO.write(record, o, "fasta")
    output = subprocess.run(["muscle", "-in", f"{hist_type}_ungap.fasta", "-out", f"{hist_type}_with_human.fasta"], capture_output=True)
    #subprocess.run(["rm", f"{hist_type}_temp.fasta"])
    

In [333]:

%%bash

for hist_type in H1 H2A H2B H3 H4
do
    rm ${hist_type}_temp.fasta
    rm ${hist_type}_ungap.fasta
done

In [ ]:

In [303]:

temp = pd.read_csv(os.path.join(path, "static/human_hist/info/", "human_hist_genes.csv"))

In [310]:

temp.loc[temp['Histone variant'].isin(['H1.1', 'H1.2', 'H1.3','H1.4', 'H1.5', 'H2A.J(?)', 'H2B.S(?)'])]

Out[310]:

	Histone type	Histone variant	HGNC Symbol	NCBI gene ID	Ensembl gene ID	Expr. timing	Expr. pattern	Biotype	Bona fide canonical	PMIDs
1	H1	H1.1	H1-1	3024	ENSG00000124610	RD	NaN	COD	NaN	26689747
2	H1	H1.2	H1-2	3006	ENSG00000187837	Mixed	NaN	COD	NaN	26689747
3	H1	H1.3	H1-3	3007	ENSG00000124575	RD	NaN	COD	NaN	26689747
4	H1	H1.4	H1-4	3008	ENSG00000168298	RD	NaN	COD	NaN	26689747
5	H1	H1.5	H1-5	3009	ENSG00000184357	RD	NaN	COD	NaN	26689747
33	H2A	H2A.J(?)	H2AJ	55766	ENSG00000246705	RI	NaN	COD	NaN	25731851
77	H2B	H2B.S(?)	H2BS1	54145	ENSG00000234289	RI	NaN	COD	NaN	?

In [ ]: