import os
from Bio import SeqIO
import numpy as np
import pandas as pd
# Output directory for the empty per-species reference tables/pickles.
baseOutDir = '/cellar/users/btsui/Data/Project/Skymap/ChipSeq/empty_references/'
# Directory of cleaned per-species FASTA files (one <Species_name>.fa each).
fa_dir = '/cellar/users/btsui/Data/ensembl/clean/'
fnames = pd.Series(os.listdir(fa_dir))
# Map species name -> full FASTA path.  The species name is the filename up
# to the first '.'; use a raw string so the regex escape for a literal dot
# is not mangled by Python string escaping ('\.'  is an invalid escape).
myDict = pd.Series(data=(fa_dir + fnames).values,
                   index=fnames.str.split(r'\.').str[0])
!grep -rn clean ./../*/*.ipynb
./../Chip-seq/BuildEmptyPickles.ipynb:27: "#myDict={'Homo_sapiens':'/cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa'}" ./../Chip-seq/BuildEmptyPickles.ipynb:38: "fa_dir='/cellar/users/btsui/Data/ensembl/clean/'\n", ./../Chip-seq/BuildEmptyPickles.ipynb:52: "!grep -rn clean ./.../*.ipynb " ./../Chip-seq/BuildGenomes.ipynb:41: "clean_fa_dir='/cellar/users/btsui/Data/ensembl/clean/'\n", ./../Chip-seq/BuildGenomes.ipynb:42: "myFaNames=pd.Series(os.listdir(clean_fa_dir))\n", ./../Chip-seq/BuildGenomes.ipynb:44: " data=(clean_fa_dir+myFaNames).values)\n", ./../DownloadGenome/ExtractCleanFastas.ipynb:24: "clean_fa_out_dir='/cellar/users/btsui/Data/ensembl/clean/'" ./../DownloadGenome/ExtractCleanFastas.ipynb:60: " my_fa_out_dir=clean_fa_out_dir+my_specie+'.fa'\n", ./../XGS_WGS/old_MaskingGenomeWithSnp.ipynb:302: "#!ls /cellar/users/btsui/Data/ensembl/clean/Homo_sapiens.fa"
#from
### spit out the data
#binSize=20
for mySpecie, faDir in myDict.items():  # .iteritems() was removed in pandas 2.0
    # For each species: record the length of every sequence in its FASTA and
    # dump a <species>.size.tsv table of sequence-id -> length.
    with open(faDir) as handle:  # mode "rU" was removed in Python 3.11
        # NOTE: use a separate dict here — the original rebound `myDict`,
        # clobbering the species->path mapping that later cells iterate.
        chrSizes = {}
        for record in SeqIO.parse(handle, "fasta"):
            chrSizes[record.id] = len(record)
        pd.Series(chrSizes).to_csv(baseOutDir + mySpecie + '.size.tsv', sep='\t')
#myDict
# Example Epigenome Roadmap chromatin-accessibility bed file, loaded here
# (tab-separated, no header) only for interactive inspection — not used by
# the loops below.
tmpDf=pd.read_csv('/cellar/users/btsui/Data/Project/Skymap/ChipSeq/EpigenomeRoadmap_inter/GSM493384_UW.CD34.rep1.ChromatinAccessibility.CD34+-DS12274.bed.gz',sep='\t',header=None)
baseOutDir+mySpecie+'.size.csv'
'/cellar/users/btsui/Data/Project/Skymap/ChipSeq/empty_references/Homo_sapiens.size.csv'
!cat /cellar/users/btsui/Data/Project/Skymap/ChipSeq/empty_references/Homo_sapiens.size.csv
Homo_sapiens,/cellar/users/btsui/Data/Project/KangZhang/refFa/hg19.fa
binSize = 20  # genomic bin width (bp) of the empty coverage index
for mySpecie, faDir in myDict.items():  # .iteritems() was removed in pandas 2.0
    # For each species: build an all-zero int16 Series indexed by
    # (sequence-id, bin-start) covering every sequence, and pickle it as the
    # empty reference for downstream coverage counting.
    with open(faDir) as handle:  # mode "rU" was removed in Python 3.11
        myChrNames = []
        myChrBins = []
        for record in SeqIO.parse(handle, "fasta"):
            nBp = len(record)
            # pad past the sequence end so the final partial bin is included
            bins = np.arange(0, nBp + 100, binSize)
            myChrNames.append(record.id)
            # allocate zeros with the target dtype directly instead of the
            # NaN Series -> fillna(0) -> astype(int16) round trip
            myChrBins.append(pd.Series(0, index=bins, dtype=np.int16))
        if not myChrBins:
            continue  # empty FASTA: pd.concat([]) would raise
        myMergedS = pd.concat(myChrBins, keys=myChrNames).sort_index()
        outDir = baseOutDir + mySpecie + '.pickle'
        myMergedS.to_pickle(outDir)
'/cellar/users/btsui/Data/Project/Skymap/ChipSeq/empty_references/Homo_sapiens.pickle'
import pandas as pd
import os
base_dir = '/cellar/users/btsui/Data/Project/Skymap/ChipSeq/empty_references/'
# Read back only the *.size.tsv tables: the directory also holds the .pickle
# files written above, which pd.read_csv cannot parse.
fnames = [f for f in os.listdir(base_dir) if f.endswith('.size.tsv')]
myL = [pd.read_csv(base_dir + f, sep='\t', header=None) for f in fnames]
# Chromosome sizes for one species, keyed by its source file name.
pd.concat(myL, keys=fnames).loc['Bos_taurus.size.tsv']
0 | 1 | |
---|---|---|
0 | 1 | 158337067.0 |
1 | 10 | 104305016.0 |
2 | 11 | 107310763.0 |
3 | 12 | 91163125.0 |
4 | 13 | 84240350.0 |
5 | 14 | 84648390.0 |
6 | 15 | 85296676.0 |
7 | 16 | 81724687.0 |
8 | 17 | 75158596.0 |
9 | 18 | 66004023.0 |
10 | 19 | 64057457.0 |
11 | 2 | 137060424.0 |
12 | 20 | 72042655.0 |
13 | 21 | 71599096.0 |
14 | 22 | 61435874.0 |
15 | 23 | 52530062.0 |
16 | 24 | 62714930.0 |
17 | 25 | 42904170.0 |
18 | 26 | 51681464.0 |
19 | 27 | 45407902.0 |
20 | 28 | 46312546.0 |
21 | 29 | 51505224.0 |
22 | 3 | 121430405.0 |
23 | 4 | 120829699.0 |
24 | 5 | 121191424.0 |
25 | 6 | 119458736.0 |
26 | 7 | 112638659.0 |
27 | 8 | 113384836.0 |
28 | 9 | 105708250.0 |
29 | MT | 16338.0 |
30 | X | 148823899.0 |
#!rm /cellar/users/btsui/Data/Project/Skymap/ChipSeq/empty_references/Homo_sapiens.csv