from Bio import Entrez
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import re
import random
import pandas as pd
Entrez.email = 'sasha.grrshnova98@gmail.com'
#get B.burgdorferi genome
burgdorferi_genome = id_search('NC_001318.1')
bavariensis_genome=id_search('NC_006156.1')
afzelii_genome=id_search('NC_018887.1')
garenii_genome=id_search('NC_018747.1')
borrelia=['burgdorferi', 'bavariensis', 'afzelii', 'garenii']
def PAM_search(seq):
genome = str(seq)
PAM_positions = {}
PAM_pos = []
for m in re.finditer(r"[ACGT]GG", genome):
PAM_pos.append(m.start())
PAM_positions['SpCas9'] = PAM_pos
PAM_pos = []
for m in re.finditer(r"[ACGT]G[AG][AG]T", genome):
PAM_pos.append(m.start())
PAM_positions['SaCas9_1'] = PAM_pos
PAM_pos = []
for m in re.finditer(r"G[ACGT]G[AG][AG][ACTG]", genome):
PAM_pos.append(m.start())
PAM_positions['SaCas9_2'] = PAM_pos
PAM_pos = []
for m in re.finditer(r"[ACGT][ACGT][ACGT][ACGT][AG][CT]AC", genome):
PAM_pos.append(m.start())
PAM_positions['CjCas9'] = PAM_pos
return PAM_positions
#to search start inverted pam in complement DNA
def PAM_rev_search(seq):
genome = str(seq.complement())
PAM_positions = {}
PAM_pos = []
for m in re.finditer(r"GG[ACGT]", genome):
PAM_pos.append(m.start())
PAM_positions['SpCas9'] = PAM_pos
PAM_pos = []
for m in re.finditer(r"T[AG][AG]G[ACGT]", genome):
PAM_pos.append(m.start())
PAM_positions['SaCas9_1'] = PAM_pos
PAM_pos = []
for m in re.finditer(r"[ACGT][AG][AG]G[ACGT]", genome):
PAM_pos.append(m.start())
PAM_positions['SaCas9_2'] = PAM_pos
PAM_pos = []
for m in re.finditer(r"CA[CT][AG][ACGT][ACGT][ACGT][ACGT]", genome):
PAM_pos.append(m.start())
PAM_positions['CjCas9'] = PAM_pos
return PAM_positions
burgdorferi_PAM_rev_pos = PAM_rev_search(burgdorferi_genome)
burgdorferi_PAM_pos = PAM_search(burgdorferi_genome)
bavariensis_PAM_rev_pos = PAM_rev_search(bavariensis_genome)
bavariensis_PAM_pos = PAM_search(bavariensis_genome)
afzelii_PAM_rev_pos = PAM_rev_search(afzelii_genome)
afzelii_PAM_pos = PAM_search(afzelii_genome)
garenii_PAM_rev_pos = PAM_rev_search(garenii_genome)
garenii_PAM_pos = PAM_search(garenii_genome)
#to chose cas proteins, len first pam and len between 2 pams (here 24 for SpCas9)
def pam_pairs(PAM_pos, PAM_rev_pos, pos_cas, rev_cas):
PAM_pos_y = []
for el in PAM_pos[pos_cas]:
PAM_pos_y.append(el + 24)
PAM_pairs = {}
for el in PAM_pos_y:
if el in set(PAM_rev_pos[rev_cas]):
PAM_pairs[el-24] = el
return PAM_pairs
burgdorferi_pam_pairs=pam_pairs(burgdorferi_PAM_pos, burgdorferi_PAM_rev_pos, 'SpCas9', 'SpCas9')
bavariensis_pam_pairs=pam_pairs(bavariensis_PAM_pos, bavariensis_PAM_rev_pos, 'SpCas9', 'SpCas9')
afzelii_pam_pairs=pam_pairs(afzelii_PAM_pos, afzelii_PAM_rev_pos, 'SpCas9', 'SpCas9')
garenii_pam_pairs=pam_pairs(garenii_PAM_pos, garenii_PAM_rev_pos, 'SpCas9', 'SpCas9')
# 1 target + pam
burgdorferi_pam_targets_pairs_seq={}
for i in burgdorferi_pam_pairs.keys():
burgdorferi_pam_targets_pairs_seq.update({i:str(burgdorferi_genome[i-20:i+3])})
bavariensis_pam_targets_pairs_seq={}
for i in bavariensis_pam_pairs.keys():
bavariensis_pam_targets_pairs_seq.update({i:str(bavariensis_genome[i-20:i+3])})
afzelii_pam_targets_pairs_seq={}
for i in afzelii_pam_pairs.keys():
afzelii_pam_targets_pairs_seq.update({i:str(afzelii_genome[i-20:i+3])})
garenii_pam_targets_pairs_seq={}
for i in garenii_pam_pairs.keys():
garenii_pam_targets_pairs_seq.update({i:str(garenii_genome[i-20:i+3])})
# second target + pam
burgdorferi_pam_targets_pairs_seq_rev={}
for i in burgdorferi_pam_pairs.values():
burgdorferi_pam_targets_pairs_seq_rev.update({i: str(burgdorferi_genome[i:i+23])})
bavariensis_pam_targets_pairs_seq_rev={}
for i in bavariensis_pam_pairs.values():
bavariensis_pam_targets_pairs_seq_rev.update({i:str(bavariensis_genome[i:i+23])})
afzelii_pam_targets_pairs_seq_rev={}
for i in afzelii_pam_pairs.values():
afzelii_pam_targets_pairs_seq_rev.update({i:str(afzelii_genome[i:i+23])})
garenii_pam_targets_pairs_seq_rev={}
for i in garenii_pam_pairs.values():
garenii_pam_targets_pairs_seq_rev.update({i: str(garenii_genome[i:i+23])})
def sequence_compare(seq_a, seq_b):
len1= len(seq_a)
len2= len(seq_b)
matches = 0
for pos in range (0,min(len1,len2)) :
if seq_a[pos] != seq_b[pos]:
matches+=0
else:
matches+=1
return matches
# to search condervative sequnces (target1-pam1) among 4 borrelias,
# threshold is a number of same nucleotides in same positions
thres=22
common_seq_dict={}
common_seq=[]
n=0
for i in set(burgdorferi_pam_targets_pairs_seq.values()):
for k in set(bavariensis_pam_targets_pairs_seq.values()):
if sequence_compare(i,k) > thres:
for l in set(afzelii_pam_targets_pairs_seq.values()):
if sequence_compare(i,l) > thres:
for m in set(garenii_pam_targets_pairs_seq.values()):
if sequence_compare(i,m) > thres:
a= [key for (key, value) in burgdorferi_pam_targets_pairs_seq.items() if value == i]
b= [key for (key, value) in bavariensis_pam_targets_pairs_seq.items() if value == k]
c= [key for (key, value) in afzelii_pam_targets_pairs_seq.items() if value == l]
d= [key for (key, value) in garenii_pam_targets_pairs_seq.items() if value == m]
common_seq_dict.update({n: [a , b, c, d]})
n=n+1
common_seq_df=pd.DataFrame.from_dict(common_seq_dict)
common_seq_df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | [134376] | [378587] | [484274] | [444913] | [628385] | [443638] | [441401, 438156] | [438906, 435661] | [535898] | [445778] | [417622] | [345887] | [19925] | [481105] | [789106] | [436784, 440029] | [331095] | [871669] | [444366] |
1 | [134197] | [377346] | [486625] | [447105] | [630325] | [444930] | [438801, 442045] | [436306, 439550] | [538085] | [447970] | [418254] | [344628] | [19773] | [483428] | [791106] | [437429, 440673] | [329902] | [873604] | [446555] |
2 | [134240] | [377159] | [486357] | [446825] | [630240] | [444150] | [438413, 441649] | [435918, 439154] | [537885] | [447690] | [417870] | [344443] | [19926] | [483120] | [790823] | [440277, 437041] | [329753] | [873497] | [446273] |
3 | [134244] | [377626] | [483767] | [444253] | [627939] | [441997] | [439103] | [436608] | [535613] | [445118] | [418542] | [344906] | [19810] | [480567] | [788736] | [437731] | [330186] | [870992] | [443703] |
for index, column in common_seq_df.iteritems():
common_seq_df[index][0]=common_seq_df[index][0][0]
common_seq_df[index][1]=common_seq_df[index][1][0]
common_seq_df[index][2]=common_seq_df[index][2][0]
common_seq_df[index][3]=common_seq_df[index][3][0]
common_seq_rev_dict={}
#24 is len of pam1 and len between 2 pams
for index, column in common_seq_df.iteritems():
bur=burgdorferi_pam_targets_pairs_seq_rev[common_seq_df[index][0]+24]
bav=bavariensis_pam_targets_pairs_seq_rev[common_seq_df[index][1]+24]
afz=afzelii_pam_targets_pairs_seq_rev[common_seq_df[index][2]+24]
gar=garenii_pam_targets_pairs_seq_rev[common_seq_df[index][3]+24]
thres=22
if sequence_compare(bur,bav) > thres:
if sequence_compare(bur,afz) > thres:
if sequence_compare(bur,gar) > thres:
common_seq_rev_dict.update({index: [common_seq_df[index][0]+24, common_seq_df[index][1]+24,\
common_seq_df[index][2]+24, common_seq_df[index][3]+24]})
common_seq_rev_df=pd.DataFrame.from_dict(common_seq_rev_dict)
common_seq_rev_df=common_seq_rev_df.T
common_seq_df=common_seq_df.T
common_seq_df.columns=['bur_1', 'bav_1', 'afz_1', 'gar_1']
common_seq_rev_df.columns=['bur_2', 'bav_2', 'afz_2', 'gar_2']
common_seq_all=pd.merge(common_seq_df, common_seq_rev_df, how='outer', left_index=True, right_index=True)
common_seq_all.loc[~common_seq_all['bur_2'].isna()]
bur_1 | bav_1 | afz_1 | gar_1 | bur_2 | bav_2 | afz_2 | gar_2 | |
---|---|---|---|---|---|---|---|---|
3 | 444913 | 447105 | 446825 | 444253 | 444937.0 | 447129.0 | 446849.0 | 444277.0 |
5 | 443638 | 444930 | 444150 | 441997 | 443662.0 | 444954.0 | 444174.0 | 442021.0 |
6 | 441401 | 438801 | 438413 | 439103 | 441425.0 | 438825.0 | 438437.0 | 439127.0 |
7 | 438906 | 436306 | 435918 | 436608 | 438930.0 | 436330.0 | 435942.0 | 436632.0 |
9 | 445778 | 447970 | 447690 | 445118 | 445802.0 | 447994.0 | 447714.0 | 445142.0 |
10 | 417622 | 418254 | 417870 | 418542 | 417646.0 | 418278.0 | 417894.0 | 418566.0 |
14 | 789106 | 791106 | 790823 | 788736 | 789130.0 | 791130.0 | 790847.0 | 788760.0 |
15 | 436784 | 437429 | 440277 | 437731 | 436808.0 | 437453.0 | 440301.0 | 437755.0 |
burgdorferi_genome[444913-20:444913+3]
Seq('CGTGTGTAGCCCAGGACATAAGG', SingleLetterAlphabet())
bavariensis_genome[447105-20:447105+3]
Seq('CGTGTGTAGCCCAGGACATAAGG', SingleLetterAlphabet())
burgdorferi_genome[444937:444937+23]
Seq('CCTCACCTTCCTCCGACTTATCA', SingleLetterAlphabet())
bavariensis_genome[447129:447129+23]
Seq('CCTCACCTTCCTCCGACTTATCA', SingleLetterAlphabet())
#create df_PAM; 1 - Watson, 0 - Crick
import pandas as pd
df_PAM = pd.DataFrame({"PAM_pos" : PAM_SpCas9_pos, "Strand" : 0})
#create df_PAM_rev; 1 - Watson, 0 - Crick
df_PAM_rev = pd.DataFrame({"PAM_pos" : PAM_SpCas9_rev_pos, "Strand" : 1})
df_all_PAMs = pd.concat([df_PAM, df_PAM_rev])
handle = Entrez.esearch(db="probe", term="borrelia burgdorferi", retmode="text", retmax = 220)
record = Entrez.read(handle)
#print(record["IdList"])
records = record['IdList']
#print(records)
import requests
probes = []
for i in records:
id_dbprobe = i
link = 'https://www.ncbi.nlm.nih.gov/probe/' + id_dbprobe
f = requests.get(link)
result = f.text
res = re.findall(r'class="breakTxt">[ACTG]+', result)
for el in res:
probes.append(el[17:])
print(len(probes))
#print(probes)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) ~\Miniconda3\lib\site-packages\requests\packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw) 378 try: # Python 2.7, use buffering of HTTP responses --> 379 httplib_response = conn.getresponse(buffering=True) 380 except TypeError: # Python 2.6 and older, Python 3 TypeError: getresponse() got an unexpected keyword argument 'buffering' During handling of the above exception, another exception occurred: KeyboardInterrupt Traceback (most recent call last) <ipython-input-55-eaed8232975d> in <module> 5 id_dbprobe = i 6 link = 'https://www.ncbi.nlm.nih.gov/probe/' + id_dbprobe ----> 7 f = requests.get(link) 8 result = f.text 9 res = re.findall(r'class="breakTxt">[ACTG]+', result) ~\Miniconda3\lib\site-packages\requests\api.py in get(url, params, **kwargs) 70 71 kwargs.setdefault('allow_redirects', True) ---> 72 return request('get', url, params=params, **kwargs) 73 74 ~\Miniconda3\lib\site-packages\requests\api.py in request(method, url, **kwargs) 56 # cases, and look like a memory leak in others. 57 with sessions.Session() as session: ---> 58 return session.request(method=method, url=url, **kwargs) 59 60 ~\Miniconda3\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json) 516 } 517 send_kwargs.update(settings) --> 518 resp = self.send(prep, **send_kwargs) 519 520 return resp ~\Miniconda3\lib\site-packages\requests\sessions.py in send(self, request, **kwargs) 637 638 # Send the request --> 639 r = adapter.send(request, **kwargs) 640 641 # Total elapsed time of the request (approximately) ~\Miniconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies) 436 decode_content=False, 437 retries=self.max_retries, --> 438 timeout=timeout 439 ) 440 ~\Miniconda3\lib\site-packages\requests\packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw) 598 timeout=timeout_obj, 599 body=body, headers=headers, --> 600 chunked=chunked) 601 602 # If we're going to release the connection in ``finally:``, then ~\Miniconda3\lib\site-packages\requests\packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw) 380 except TypeError: # Python 2.6 and older, Python 3 381 try: --> 382 httplib_response = conn.getresponse() 383 except Exception as e: 384 # Remove the TypeError from the exception chain in Python 3; ~\Miniconda3\lib\http\client.py in getresponse(self) 1196 try: 1197 try: -> 1198 response.begin() 1199 except ConnectionError: 1200 self.close() ~\Miniconda3\lib\http\client.py in begin(self) 295 # read until we get a non-100 response 296 while True: --> 297 version, status, reason = self._read_status() 298 if status != CONTINUE: 299 break ~\Miniconda3\lib\http\client.py in _read_status(self) 256 257 def _read_status(self): --> 258 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1") 259 if len(line) > _MAXLINE: 260 raise LineTooLong("status line") ~\Miniconda3\lib\socket.py in readinto(self, b) 574 while True: 575 try: --> 576 return self._sock.recv_into(b) 577 except timeout: 578 self._timeout_occurred = True ~\Miniconda3\lib\site-packages\requests\packages\urllib3\contrib\pyopenssl.py in recv_into(self, *args, **kwargs) 275 def recv_into(self, *args, **kwargs): 276 try: --> 277 return self.connection.recv_into(*args, **kwargs) 278 except OpenSSL.SSL.SysCallError as e: 279 if self.suppress_ragged_eofs and e.args == (-1, 'Unexpected EOF'): ~\Miniconda3\lib\site-packages\OpenSSL\SSL.py in recv_into(self, buffer, nbytes, flags) 1332 result = _lib.SSL_peek(self._ssl, buf, nbytes) 1333 else: -> 1334 result = _lib.SSL_read(self._ssl, buf, nbytes) 1335 self._raise_ssl_error(self._ssl, result) 1336 KeyboardInterrupt:
f = open("BB_markers.txt", "a")
for k in probes:
a = str(k)+'\n'
f.write(a)
f.close()
#number of PAM for different Cas9 proteins in B.burgdorferi genome
Bburgdorferi_PAM_cnt = {}
for k in Bburgdorferi_PAM.keys():
Bburgdorferi_PAM_cnt[k] = len(Bburgdorferi_PAM.get(k))
print(Bburgdorferi_PAM_cnt)
#histogram - number of PAM for different Cas9 proteins in B.burgdorferi genome
import matplotlib.pyplot as plt
plt.bar(list(Bburgdorferi_PAM_cnt.keys()), Bburgdorferi_PAM_cnt.values())
plt.show()
burgdorferi_PAM_rev_pos = burgdorferi_PAM_rev_pos["SpCas9"]
burgdorferi_PAM_pos = burgdorferi_PAM_pos["SpCas9"]
bavariensis_PAM_rev_pos = bavariensis_PAM_rev_pos["SpCas9"]
bavariensis_PAM_pos = bavariensis_PAM_pos["SpCas9"]
afzelii_PAM_rev_pos = afzelii_PAM_rev_pos["SpCas9"]
afzelii_PAM_pos = afzelii_PAM_pos["SpCas9"]
garenii_PAM_rev_pos = garenii_PAM_rev_pos["SpCas9"]
garenii_PAM_pos = garenii_PAM_pos["SpCas9"]
thres=22
common_seq=[]
for i in set(burgdorferi_pam_targets_pairs_seq.values()):
for k in set(bavariensis_pam_targets_pairs_seq.values()):
if sequence_compare(i,k) > thres:
common_seq.append(i)
common_seq_2=[]
for i in common_seq:
for k in set(afzelii_pam_targets_pairs_seq.values()):
if sequence_compare(i,k) > thres:
common_seq_2.append(i)
common_seq=[]
for i in common_seq_2:
for k in set(garenii_pam_targets_pairs_seq.values()):
if sequence_compare(i,k) > thres:
common_seq.append(i)