Notebook to add human histones to DataBase, used once. To delete before final merge
import os
import pandas as pd
import requests
from Bio import AlignIO
from Bio import SeqIO
from Bio.Align.Applications import MuscleCommandline
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import subprocess
path = os.getcwd()
hist_prot = pd.read_csv(os.path.join(path, "static/human_hist/info/", "human_hist_proteins.csv"))
hist_genes = pd.read_csv(os.path.join(path, "static/human_hist/info/", "human_hist_genes.csv"), usecols=['Histone type', 'Histone variant', 'HGNC Symbol'])
hist_prot = pd.merge(hist_prot, hist_genes.drop_duplicates(), on='HGNC Symbol', how='left')
var_dict = {
'H1.0':'H1.0',
'H1.1':'H1.1',#new
'H1.2':'H1.2',#new
'H1.3':'H1.3',#new
'H1.4':'H1.4',#new
'H1.5':'H1.5',#new
'TS H1.6':'TS_H1.6',
'TS H1.7':'TS_H1.7',
'OO H1.8':'OO_H1.8',
'TS H1.9(?)':'TS_H1.9',
'H1.10':'H1.10',
'TS H2A.1':'H2A.1',
'canonical H2A':'canonical_H2A',
'H2A.J(?)':'H2A.J',#new
'H2A.X':'H2A.X',
'H2A.Z.1':'H2A.Z',
'H2A.Z.2':'H2A.Z',
'macroH2A.1':'macroH2A',
'macroH2A.2':'macroH2A',
'H2A.B.1':'H2A.B',
'H2A.B.2':'H2A.B',
'H2A.P':'H2A.P',
'TS H2B.1':'H2B.1',
'canonical H2B':'canonical_H2B',
'canonical H2B(?)':'canonical_H2B',
'H2B.S(?)':'H2B.S',#new
'H2B.W':'H2B.W',
# '?':'', #new,
'canonical H3.1':'canonical_H3',
'canonical H3.2':'canonical_H3',
'H3.Y.1':'H3.Y',
'H3.Y.2':'H3.Y',
'canonical H3(?)':'canonical_H3',
'H3.3':'H3.3',
'TS H3.4':'TS_H3.4',
'H3.5':'H3.5',
'cenH3':'cenH3',
'canonical H4':'canonical_H4',
}
var_dict_reverse = {v: k for k, v in var_dict.items()}
os.system('>static/browse/seeds/H1/H1.1.fasta')
os.system('>static/browse/seeds/H1/H1.2.fasta')
os.system('>static/browse/seeds/H1/H1.3.fasta')
os.system('>static/browse/seeds/H1/H1.4.fasta')
os.system('>static/browse/seeds/H1/H1.5.fasta')
os.system('>static/browse/seeds/H2A/H2A.J.fasta')
os.system('>static/browse/seeds/H2B/H2B.S.fasta')
512
var_type_dict = {}
for hist_type in hist_prot['Histone type'].unique():
var_list = list(hist_prot['Histone variant'].loc[hist_prot['Histone type']==hist_type].unique())
var_type_dict[hist_type]=var_list
var_type_dict
{'H1': ['H1.0', 'H1.1', 'H1.2', 'H1.3', 'H1.4', 'H1.5', 'TS H1.6', 'TS H1.7', 'OO H1.8', 'TS H1.9(?)', 'H1.10'], 'H2A': ['TS H2A.1', 'canonical H2A', 'H2A.J(?)', 'H2A.X', 'H2A.Z.1', 'H2A.Z.2', 'macroH2A.1', 'macroH2A.2', 'H2A.B.1', 'H2A.B.2', 'H2A.P'], 'H2B': ['TS H2B.1', 'canonical H2B', 'canonical H2B(?)', 'H2B.S(?)', 'H2B.W', '?'], 'H3': ['canonical H3.1', 'canonical H3.2', 'H3.Y.1', 'H3.Y.2', 'canonical H3(?)', 'H3.3', 'TS H3.4', 'H3.5', 'cenH3'], 'H4': ['canonical H4']}
set(var_dict.keys()) - set(hist_prot['Histone variant'])
set()
set(hist_prot['Histone variant']) - set(var_dict.keys())
{'?'}
%%time
seqs=[]
for i in hist_prot['Protein stable ID']:
seq=requests.get('http://rest.ensembl.org/sequence/id/%s?content-type=text/plain'%i).content
seqs.append(seq.decode("utf-8"))
hist_seq = dict(zip(hist_prot['RefSeq peptide ID'],seqs))
hist_prot['seq']=seqs
CPU times: user 451 ms, sys: 69.2 ms, total: 521 ms Wall time: 2min 43s
len(hist_prot)
120
len(hist_seq)
105
hist_prot['RefSeq peptide ID'].value_counts()
NP_001157888 1 NP_003518 1 NP_003537 1 NP_734466 1 NP_005311 1 .. NP_066544 1 NP_001013721 1 NP_003514 1 NP_958925 1 NP_003534 1 Name: RefSeq peptide ID, Length: 104, dtype: int64
for i in ['NP_004884', 'NP_542160','NP_001299582','NP_001035248']:
print( hist_prot['seq'].loc[hist_prot['RefSeq peptide ID'] == i].nunique() )
0 0 1 1
не буду добавлять GI в описание
#for hist_type in ['H1', 'H2A', 'H2B', 'H3', 'H4']:
added_id = []
for hist_type in ['H1', 'H2A', 'H2B', 'H3', 'H4']:
print('====================')
for hist_var in var_type_dict[hist_type]:
if hist_var!='?':
print('---------------------')
print(hist_var)
hist_var_histdb = var_dict[hist_var]
try:
alignment = AlignIO.read(os.path.join(path,"static/browse/seeds/{}/".format(hist_type),"{}.fasta".format(hist_var_histdb)), "fasta")
fasta_variant = []
for record in alignment:
fasta_variant.append(record.id.split('|')[1].split('.')[0])
var_list = hist_prot['RefSeq peptide ID'].loc[hist_prot['Histone variant']=='{}'.format(hist_var)].values
for refseq in var_list:
if refseq in fasta_variant:
print(f'{refseq} in file')
else:
if not pd.isna(refseq):
print(f'{refseq} not in file, added')
alignment.add_sequence(f'Homo|{refseq}|{hist_var_histdb}', hist_seq[refseq])
added_id.append(refseq)
SeqIO.write(alignment, os.path.join(path,"static/browse/seeds/{}/".format(hist_type),"{}.fasta".format(hist_var_histdb)), "fasta")
except:
var_list = hist_prot['RefSeq peptide ID'].loc[hist_prot['Histone variant']=='{}'.format(hist_var)].values
records = []
for refseq in var_list:
if not pd.isna(refseq):
print(f'{refseq} added to new file')
record = SeqRecord(Seq(hist_seq[refseq]),
id=f'Homo|{refseq}|{hist_var_histdb}', name=f'Homo|{refseq}|{hist_var_histdb}',
description=f'Homo|{refseq}|{hist_var_histdb}')
records.append(record)
added_id.append(refseq)
SeqIO.write(records, os.path.join(path,"static/browse/seeds/{}/".format(hist_type),"{}.fasta".format(hist_var_histdb)), "fasta")
os.chdir(f'{path}/static/browse/seeds/{hist_type}/')
os.system(f'muscle -in {hist_var_histdb}.fasta -out {hist_var_histdb}.fasta')
==================== --------------------- H1.0 NP_005309 in file --------------------- H1.1 NP_005316 added to new file --------------------- H1.2 NP_005310 added to new file --------------------- H1.3 NP_005311 added to new file --------------------- H1.4 NP_005312 added to new file --------------------- H1.5 NP_005313 added to new file --------------------- TS H1.6 NP_005314 in file --------------------- TS H1.7 NP_861453 in file --------------------- OO H1.8 NP_722575 in file NP_001295191 not in file, added --------------------- TS H1.9(?) --------------------- H1.10 NP_006017 in file ==================== --------------------- TS H2A.1 NP_734466 in file --------------------- canonical H2A NP_003504 not in file, added NP_003503 not in file, added NP_066409 not in file, added NP_066390 in file NP_066408 not in file, added NP_542163 not in file, added NP_003500 not in file, added NP_066544 not in file, added NP_003501 not in file, added NP_003502 not in file, added NP_003505 not in file, added NP_003507 not in file, added NP_001035807 not in file, added NP_003508 not in file, added NP_778235 not in file, added NP_254280 not in file, added --------------------- H2A.J(?) NP_808760 added to new file --------------------- H2A.X NP_002096 in file --------------------- H2A.Z.1 NP_002097 in file --------------------- H2A.Z.2 NP_619541 not in file, added NP_958925 not in file, added NP_036544 in file NP_958844 not in file, added NP_958924 not in file, added --------------------- macroH2A.1 NP_613258 in file NP_001035248 not in file, added NP_613075 not in file, added --------------------- macroH2A.2 NP_061119 in file --------------------- H2A.B.1 NP_001017990 in file --------------------- H2A.B.2 NP_001017991 not in file, added NP_542451 in file --------------------- H2A.P NP_036406 in file ==================== --------------------- TS H2B.1 NP_733759 in file --------------------- canonical H2B NP_066406 not in file, added NP_003517 not in file, added NP_066407 not in file, added NP_003514 not in file, added NP_003513 not in file, added NP_003516 not in file, added NP_003515 not in file, added NP_003509 not in file, added NP_066402 in file NP_001299582 not in file, added NP_003510 not in file, added NP_003512 not in file, added NP_003511 not in file, added NP_003518 not in file, added NP_001019770 not in file, added NP_001154806 not in file, added NP_778225 not in file, added --------------------- canonical H2B(?) NP_003519 not in file, added --------------------- H2B.S(?) NP_059141 added to new file --------------------- H2B.W NP_001002916 not in file, added ==================== --------------------- canonical H3.1 NP_003520 in file NP_003528 not in file, added NP_003522 not in file, added NP_003521 not in file, added NP_003523 not in file, added NP_066298 not in file, added NP_003525 not in file, added NP_003527 not in file, added NP_003524 not in file, added NP_003526 not in file, added --------------------- canonical H3.2 NP_001116847 not in file, added NP_066403 not in file, added NP_001005464 not in file, added --------------------- H3.Y.1 NP_001342187 not in file, added --------------------- H3.Y.2 NP_001358848 not in file, added --------------------- canonical H3(?) NP_001342338 not in file, added --------------------- H3.3 NP_002098 not in file, added NP_005315 not in file, added --------------------- TS H3.4 NP_003484 in file --------------------- H3.5 NP_001013721 in file --------------------- cenH3 NP_001800 in file NP_001035891 not in file, added ==================== --------------------- canonical H4 NP_003529 not in file, added NP_003535 not in file, added NP_003533 not in file, added NP_003530 not in file, added NP_003536 not in file, added NP_003531 not in file, added NP_003538 in file NP_003534 not in file, added NP_003486 not in file, added NP_068803 not in file, added NP_003532 not in file, added NP_003537 not in file, added NP_003539 not in file, added NP_001029249 in file NP_778224 not in file, added
len(added_id)
79
os.chdir(f'{path}/static/browse/seeds')
%%bash
for var in H1 H2A H2B H3 H4
do
echo $var
for file in `ls ${var}/*.fasta`
do
cat $file >> ${var}_temp.fasta
done
done;
H1 H2A H2B H3 H4
for hist_type in ['H1', 'H2A', 'H2B', 'H3', 'H4']:
with open(f"{hist_type}_ungap.fasta", "w") as o:
for record in SeqIO.parse(f'{hist_type}_temp.fasta', 'fasta'):
record.seq = record.seq.ungap("-")
SeqIO.write(record, o, "fasta")
output = subprocess.run(["muscle", "-in", f"{hist_type}_ungap.fasta", "-out", f"{hist_type}_with_human.fasta"], capture_output=True)
#subprocess.run(["rm", f"{hist_type}_temp.fasta"])
%%bash
for hist_type in H1 H2A H2B H3 H4
do
rm ${hist_type}_temp.fasta
rm ${hist_type}_ungap.fasta
done
temp = pd.read_csv(os.path.join(path, "static/human_hist/info/", "human_hist_genes.csv"))
temp.loc[temp['Histone variant'].isin(['H1.1', 'H1.2', 'H1.3','H1.4', 'H1.5', 'H2A.J(?)', 'H2B.S(?)'])]
Histone type | Histone variant | HGNC Symbol | NCBI gene ID | Ensembl gene ID | Expr. timing | Expr. pattern | Biotype | Bona fide canonical | PMIDs | |
---|---|---|---|---|---|---|---|---|---|---|
1 | H1 | H1.1 | H1-1 | 3024 | ENSG00000124610 | RD | NaN | COD | NaN | 26689747 |
2 | H1 | H1.2 | H1-2 | 3006 | ENSG00000187837 | Mixed | NaN | COD | NaN | 26689747 |
3 | H1 | H1.3 | H1-3 | 3007 | ENSG00000124575 | RD | NaN | COD | NaN | 26689747 |
4 | H1 | H1.4 | H1-4 | 3008 | ENSG00000168298 | RD | NaN | COD | NaN | 26689747 |
5 | H1 | H1.5 | H1-5 | 3009 | ENSG00000184357 | RD | NaN | COD | NaN | 26689747 |
33 | H2A | H2A.J(?) | H2AJ | 55766 | ENSG00000246705 | RI | NaN | COD | NaN | 25731851 |
77 | H2B | H2B.S(?) | H2BS1 | 54145 | ENSG00000234289 | RI | NaN | COD | NaN | ? |