# running in terminal is
# Patterns searched for in each log file. Every pattern has exactly one capture
# group; the captured text is the statistic that gets recorded. The pattern
# strings themselves are later used as keys/column labels, so their text must
# not change. Raw strings keep the backslashes literal without relying on
# Python passing unknown escapes (\d, \(, \s) through unchanged.

# Alignment summary lines (bowtie2).
bowtie2RegexL=[
    r"(\d+) reads; of these:",
    r'(\d+) \([0-9.]*%\) aligned exactly 1 time',
    r'(\d+) \([0-9.]*%\) aligned >1 times',
    r'(\d+) mates make up the pairs',
]
# Read/adapter trimming summary lines.
# NOTE(review): grouped under the label 'fastqc' downstream; the lines look like
# an adapter-trimming report rather than FastQC output -- confirm the source tool.
fastqcRegexL=[
    r'Total reads processed:\s+([0-9,]+)',
    r'Reads with adapters:\s+([0-9,]+)',
    r'Adapter sequence:(.*)',
    r'Total basepairs processed:\s+([0-9,]+)',
    r'Total written \(filtered\):\s+([0-9,]+)',
]
#%matplotlib inline
import pandas as pd
import os
from tqdm import tqdm
# Output: flat text file that collects one extracted statistic per line.
tmpFDir='/cellar/users/btsui/Data/SRA/DUMP/merged_variant_aligning_statistics.tsv'
# Input: directory holding one log file per processed run.
logInDir='/nrnb/users/btsui/Data/all_seq/log_snp/'
inFnames=os.listdir(logInDir)
snp_files=os.listdir('/nrnb/users/btsui/Data/all_seq/snp/')
inFnameS=pd.Series(inFnames)
# When False, the log-scanning step is skipped and an existing tmpFDir is reused.
rerun=True
# All patterns searched for in every log. The alignment patterns are the
# bowtie2 list defined above (the previous name, bamRegexL, is not defined
# anywhere in this file and the rows are later labelled via bowtie2RegexL).
RegexL=bowtie2RegexL+fastqcRegexL
# Field delimiter used when writing/reading tmpFDir.
# NOTE(review): presumably an arbitrary token chosen so it cannot collide with
# file names, pattern text or captured values -- confirm before changing.
spacer='asdfasdf'
# Scan every log file and write one line per (file, pattern) hit to tmpFDir in
# the form: <file name><spacer><pattern text><spacer><captured value>.
if rerun:
    import re

    # Compile each pattern once; keep the pattern text alongside because it is
    # written out as the record's key.
    compiledRegexL=[(regex,re.compile(regex)) for regex in RegexL]
    with open(tmpFDir,'w') as wf:
        for inFname in tqdm(inFnameS):
            with open(os.path.join(logInDir,inFname)) as f:
                logLines=f.readlines()
            for regex,pattern in compiledRegexL:
                # Only the first matching line per pattern is recorded; a log
                # with no lines (or no match) simply contributes nothing.
                for line in logLines:
                    match=pattern.search(line)
                    if match is None or match.group(1) is None:
                        continue
                    wf.write("{inFname}{spacer}{regex}{spacer}{linesWithRegex}\n".format(
                        inFname=inFname,regex=regex,linesWithRegex=match.group(1),
                        spacer=spacer))
                    break
#lineS.str.extract('Total reads processed:\s+([0-9,]+)',expand=False).dropna()
#!ls -lah /cellar/users/btsui/Data/SRA/DUMP/merged_variant_aligning_statistics.tsv
# Load the flat statistics file back into a three-column frame (columns 0, 1, 2).
tmpFDir='/cellar/users/btsui/Data/SRA/DUMP/merged_variant_aligning_statistics.tsv'
# The delimiter is longer than one character, so pandas treats it as a regex,
# which only the 'python' parser engine supports; naming the engine explicitly
# avoids the fallback ParserWarning. Column names are given as a plain list so
# this step does not depend on numpy having been imported yet.
# NOTE: `error_bad_lines=False` (skip malformed rows) is kept for the pandas
# version this was written against; newer pandas spells it on_bad_lines='skip'.
merged_variant_aligning_statistics=pd.read_csv(
    tmpFDir,
    sep=spacer,
    names=[0,1,2],
    engine='python',
    error_bad_lines=False,
)
# Output: /cellar/users/btsui/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'. """Entry point for launching an IPython kernel.
# Give the three parsed fields their names, then label each row with the tool
# whose pattern list its `regex` value belongs to. Rows whose pattern is in
# neither list are left without a `group` value.
merged_variant_aligning_statistics.columns=['inFname','regex','linesWithRegex']
regexCol=merged_variant_aligning_statistics['regex']
m_fastqc=regexCol.isin(fastqcRegexL)
m_bowtie=regexCol.isin(bowtie2RegexL)
for groupName,rowMask in (('fastqc',m_fastqc),('bowtie2',m_bowtie)):
    merged_variant_aligning_statistics.loc[rowMask,'group']=groupName
import pandas as pd
import numpy as np
# Location of the pickled SRA metadata table, and the sibling path (same name
# with '.fastqc.bowtie_algn' inserted before '.pickle') where the annotated
# table will be written.
sra_dump_pickle_dir='/cellar/users/btsui/Data/SRA/DUMP/sra_dump.pickle'
sra_dump_pickle_dir_annotated_dir=sra_dump_pickle_dir.replace(
    '.pickle',
    '.fastqc.bowtie_algn.pickle',
)
# Load the metadata table itself.
sra_dump_df=pd.read_pickle(sra_dump_pickle_dir)
# Pivot the long (file, pattern, value) table into one row per log file with
# (group, regex) column pairs, then attach it to the SRA metadata table.
g=['group','regex','inFname']
# Keep the last recorded value per (group, regex, file); unstack moves the file
# name into the columns and the transpose puts it back on the rows.
mergedAlignmentStat=merged_variant_aligning_statistics.groupby(g).last()['linesWithRegex'].unstack().T#.set_index(['group','inFname'])['linesWithRegex'].unstack()
# Drop only a literal trailing '.log'. An unanchored '.log' pattern can be read
# as a regex ('.' = any character) and would also hit 'log' inside the name.
mergedAlignmentStat.index=mergedAlignmentStat.index.str.replace(r'\.log$','',regex=True)
# Put the metadata columns under a top-level 'SRAmeta' key so both frames have
# two-level column labels.
sra_dump_dfMultI=pd.concat([sra_dump_df],axis=1,keys=['SRAmeta'])
# sort=True keeps the row-label sorting this step has relied on and makes it
# explicit instead of depending on the pandas default.
# NOTE(review): assumes sra_dump_df is indexed by the same identifiers as the
# log file names minus '.log' -- confirm.
mergedDf=pd.concat([sra_dump_dfMultI,mergedAlignmentStat],axis=1,sort=True)
# Output: /cellar/users/btsui/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass 'sort=True'. To retain the current behavior and silence the warning, pass sort=False """Entry point for launching an IPython kernel.
# Order the merged table by the 'Total basepairs processed' statistic and save
# it. The column label is the (group, pattern text) pair, so the pattern string
# here must match the one in fastqcRegexL exactly.
# NOTE(review): the stored values are captured text (digits and commas), so this
# ordering is presumably lexicographic rather than numeric -- confirm intent.
basepairsKey=('fastqc',r'Total basepairs processed:\s+([0-9,]+)')
mergedDf=mergedDf.sort_values(by=[basepairsKey])
mergedDf.to_pickle(sra_dump_pickle_dir_annotated_dir)
#valid_m=mergedDf[('fastqc','Total basepairs processed:\s+([0-9,]+)')].notnull()
#mergedDf
#valid_m.sum()