#reload library if modified
%load_ext autoreload
%autoreload 2
%matplotlib inline
import sys
sys.path.insert(0, 'E:/mylib')
from MSQC import external as qc
qc.__msms_columns
['Charge', 'Ion injection time', 'Retention time', 'Raw file', 'Identified', 'Total ion current', 'Sequence']
import os
RAW_FILE_PATH =os.path.join('E:', os.sep, 'jennie_tof')
TXT_PATH=os.path.join(RAW_FILE_PATH,'combined','txt')
#temp = pd.read_csv(os.path.join(TXT_PATH,'msms.txt'),sep='\t',nrows=10)
#list(temp.columns)
#temp[['Raw file','Retention time','Mass Deviations [ppm]']].head()
0 -0.11761901059797;3.63471439837559;-0.79452595... 1 4.93616882370446;1.10515936566935;5.7520492348... 2 -3.42869368801671;-7.34229843977299;-3.5385031... 3 0.753720075125852;2.3178770923802;-3.261371801... 4 1.45079243699959;-2.74118467027962;-4.38280074... Name: Mass Deviations [ppm], dtype: object
temp[['Raw file','Retention time','Mass Deviations [ppm]']].head()
Raw file | Retention time | Mass Deviations [ppm] | |
---|---|---|---|
0 | b008p086_EVN_1_rpt | 116.28 | -0.11761901059797;3.63471439837559;-0.79452595... |
1 | b008p086_EVN_2_rpt | 116.22 | 4.93616882370446;1.10515936566935;5.7520492348... |
2 | b008p086_EVN_3 | 116.85 | -3.42869368801671;-7.34229843977299;-3.5385031... |
3 | b008p086_EVN_4 | 117.14 | 0.753720075125852;2.3178770923802;-3.261371801... |
4 | b008p086_EVN_5 | 116.63 | 1.45079243699959;-2.74118467027962;-4.38280074... |
df_summary, msScans, msmsIdentified, msmsScans, msmsmsScans, msms= qc.qc_pipline(TXT_PATH,parse_msms=True)
summary shape: (10, 52) parse msmsScans 237046 lines in E:\jennie_tof\combined\txt\msmsScans.txt 1 expected chunks of 237046.0 rows find_chunks ran in 0.23s
HBox(children=(IntProgress(value=0, max=1), HTML(value='')))
(237045, 7) parse msmsIdentified
D:\michele\miniconda3\envs\prediction\lib\site-packages\pandas\core\indexing.py:362: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy self.obj[key] = _infer_fill_value(value) D:\michele\miniconda3\envs\prediction\lib\site-packages\pandas\core\indexing.py:543: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy self.obj[item] = s
parse msScans 31927 lines in E:\jennie_tof\combined\txt\msScans.txt 1 expected chunks of 31927.0 rows find_chunks ran in 0.03s
HBox(children=(IntProgress(value=0, max=1), HTML(value='')))
(31926, 5) 85391 lines in E:\jennie_tof\combined\txt\msms.txt 1 expected chunks of 85391.0 rows find_chunks ran in 0.51s parse msms ['Raw file', 'Retention time', 'Mass deviations [ppm]', 'Mass deviations [Da]']
HBox(children=(IntProgress(value=0, max=1), HTML(value='')))
(85390, 4) compute msms fragment median errror parse Mass deviations [ppm] error
HBox(children=(IntProgress(value=0, max=85390), HTML(value='')))
parse Mass deviations [Da] error
HBox(children=(IntProgress(value=0, max=85390), HTML(value='')))
msms.columns
Index(['Raw file', 'Retention time', 'Mass deviations [Da]', 'Mass deviations [ppm]', 'RT_round', 'RT_bin_qcut', 'median_MSMS_error_ppm', 'median_MSMS_error_dalton'], dtype='object')
raw_list = os.listdir(RAW_FILE_PATH)
raw_list = [n for n in raw_list if n.endswith('.raw')]
raw_list
['JG_24hrs1.raw', 'JG_24hrs2.raw', 'JG_24hrs3.raw', 'JG_48hrs1.raw', 'JG_48hrs2.raw', 'JG_48hrs3.raw', 'JG_CTRL1.raw', 'JG_CTRL2.raw', 'JG_CTRL3.raw']
import pandas as pd
import matplotlib.pyplot as plt
import MSFileReader
import seaborn as sns
#raw_list.remove('Total')
for raw_file_name in raw_list:
print(raw_file_name)
raw_file=MSFileReader.ThermoRawfile(os.path.join(RAW_FILE_PATH, raw_file_name))
minutes, b_values = qc.extract_gradient(os.path.join(RAW_FILE_PATH, raw_file_name))
#tempMSdf=msScans[msScans['Raw file']==raw_file_name.split('.')[0]]
tempMSMSdf=msmsScans[msmsScans['Raw file']==raw_file_name.split('.')[0]]
#print(tempMSMSdf.head())
tempMSdf=msScans[msScans['Raw file']==raw_file_name.split('.')[0]]
#tempMSMSMSdf = msmsmsScans[msmsmsScans['Raw file']==raw_file_name.split('.')[0]]
qc.plot_raw_file(tempMSdf, tempMSMSdf, pd.DataFrame(), raw_file_name, gradient = (minutes, b_values))
plt.tight_layout()
title = 'Raw file: {rfn} - Instrument: {Inst} - ID: {ID}'.format(rfn=raw_file_name,
Inst=raw_file.GetInstModel(),
ID=raw_file.GetInstSerialNumber())
plt.suptitle(title ,fontsize=16, y=1.05)
plt.savefig(os.path.join(RAW_FILE_PATH, 'images', raw_file_name+'.png'))
plt.show()
sns.jointplot(x='Retention time', y="median_MSMS_error_ppm", kind='hex',
data=msms[msms['Raw file']==raw_file_name.split('.')[0]],
height=10,ratio=5)
plt.savefig(os.path.join(RAW_FILE_PATH, 'images', raw_file_name+'MSMS_error_ppm.png'))
plt.show()
sns.jointplot(x='Retention time', y="median_MSMS_error_dalton", kind='hex',
data=msms[msms['Raw file']==raw_file_name.split('.')[0]],
height=10,ratio=5)
plt.savefig(os.path.join(RAW_FILE_PATH, 'images', raw_file_name+'MSMS_error_da.png'))
plt.show()
#break
JG_24hrs1.raw
JG_24hrs2.raw
JG_24hrs3.raw
JG_48hrs1.raw
JG_48hrs2.raw
JG_48hrs3.raw
JG_CTRL1.raw
JG_CTRL2.raw
JG_CTRL3.raw
msmsScans.head()
Raw file | Retention time | Ion injection time | Total ion current | Identified | Sequence | Charge | RT_round | RT_bin_qcut | |
---|---|---|---|---|---|---|---|---|---|
0 | JG_24hrs1 | 0.97638 | 100.0 | 7099.200195 | - | KLSLLKVGTK | 0 | 0 | 1.0 |
1 | JG_24hrs1 | 4.15670 | 100.0 | 7348.399902 | - | GFPGDAEGAM | 0 | 4 | 1.0 |
2 | JG_24hrs1 | 7.43520 | 100.0 | 9539.599609 | - | 0 | 7 | 1.0 | |
3 | JG_24hrs1 | 13.03900 | 100.0 | 12034.000000 | - | NVVFSPYGVSSVLAMLQMTTAGK | 5 | 13 | 1.0 |
4 | JG_24hrs1 | 13.06900 | 100.0 | 24490.000000 | - | SGHSGSHHSHTTSQGRSDASHGQSGSR | 5 | 13 | 1.0 |
msmsScans.head(),raw_list[0]
( Raw file Retention time Ion injection time Total ion current \ 0 JG_24hrs1 0.97638 100.0 7099.200195 1 JG_24hrs1 4.15670 100.0 7348.399902 2 JG_24hrs1 7.43520 100.0 9539.599609 3 JG_24hrs1 13.03900 100.0 12034.000000 4 JG_24hrs1 13.06900 100.0 24490.000000 Identified Sequence Charge RT_round RT_bin_qcut 0 - KLSLLKVGTK 0 0 1.0 1 - GFPGDAEGAM 0 4 1.0 2 - 0 7 1.0 3 - NVVFSPYGVSSVLAMLQMTTAGK 5 13 1.0 4 - SGHSGSHHSHTTSQGRSDASHGQSGSR 5 13 1.0 , 'JG_24hrs1.raw')
import matplotlib.pyplot as plt
fig,ax=plt.subplots(figsize=(12,8),ncols=1,nrows=2)
temp = msmsScans[(msmsScans['Identified']=='-') &
(msmsScans['Raw file']==raw_list[0].split('.')[0])]
for charge in range(2,4):
temp_ = temp[temp['Charge']==charge]
temp_ = temp_.groupby('RT_round')['Retention time'].size()
ax[0].plot(temp_.index.values, temp_.values,'-', label=charge)
temp_ = temp[temp['Charge']<=1]
temp_ = temp_.groupby('RT_round')['Retention time'].size()
ax[0].plot(temp_.index.values, temp_.values,'-', label='1,0')
temp_ = temp[temp['Charge']>=4]
temp_ = temp_.groupby('RT_round')['Retention time'].size()
ax[0].plot(temp_.index.values, temp_.values,'-', label='4+')
ax[0].set_title('not identified')
ax[0].legend()
temp = msmsScans[(msmsScans['Identified']=='+') &
(msmsScans['Raw file']==raw_list[0].split('.')[0]) ]
for charge in range(2,4):
temp_ = temp[temp['Charge']==charge]
temp_ = temp_.groupby('RT_round')['Retention time'].size()
ax[1].plot(temp_.index.values, temp_.values,'-', label=charge)
temp_ = temp[temp['Charge']==charge]
temp_ = temp[temp['Charge']<=1]
temp_ = temp_.groupby('RT_round')['Retention time'].size()
ax[1].plot(temp_.index.values, temp_.values,'-', label='1,0')
temp_ = temp[temp['Charge']>=4]
temp_ = temp_.groupby('RT_round')['Retention time'].size()
ax[1].plot(temp_.index.values, temp_.values,'-', label='4+')
ax[1].set_title('identified')
ax[1].legend()
plt.show()
df_summary.loc[df_summary.norm_retention_spread.sort_values().tail().index.values][['Raw file','norm_retention_spread']]
Raw file | norm_retention_spread | |
---|---|---|
8 | JG_CTRL3 | 0.903149 |
3 | JG_48hrs1 | 0.910619 |
0 | JG_24hrs1 | 0.910767 |
2 | JG_24hrs3 | 0.912309 |
9 | Total | NaN |
df_summary.loc[df_summary.norm_retention_spread.sort_values().head().index.values][['Raw file','norm_retention_spread']]
Raw file | norm_retention_spread | |
---|---|---|
5 | JG_48hrs3 | 0.889369 |
1 | JG_24hrs2 | 0.897215 |
7 | JG_CTRL2 | 0.898525 |
6 | JG_CTRL1 | 0.898730 |
4 | JG_48hrs2 | 0.899600 |
infile = os.path.join(TXT_PATH,'ms3Scans.txt')
infile = pd.read_csv(infile,sep='\t')
infile.head()
--------------------------------------------------------------------------- EmptyDataError Traceback (most recent call last) <ipython-input-27-94aed7a73fc8> in <module> 1 infile = os.path.join(TXT_PATH,'ms3Scans.txt') ----> 2 infile = pd.read_csv(infile,sep='\t') 3 infile.head() D:\michele\miniconda3\envs\prediction\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision) 700 skip_blank_lines=skip_blank_lines) 701 --> 702 return _read(filepath_or_buffer, kwds) 703 704 parser_f.__name__ = name D:\michele\miniconda3\envs\prediction\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds) 427 428 # Create the parser. --> 429 parser = TextFileReader(filepath_or_buffer, **kwds) 430 431 if chunksize or iterator: D:\michele\miniconda3\envs\prediction\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds) 893 self.options['has_index_names'] = kwds['has_index_names'] 894 --> 895 self._make_engine(self.engine) 896 897 def close(self): D:\michele\miniconda3\envs\prediction\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine) 1120 def _make_engine(self, engine='c'): 1121 if engine == 'c': -> 1122 self._engine = CParserWrapper(self.f, **self.options) 1123 else: 1124 if engine == 'python': D:\michele\miniconda3\envs\prediction\lib\site-packages\pandas\io\parsers.py in __init__(self, src, **kwds) 1851 kwds['usecols'] = self.usecols 1852 -> 1853 self._reader = parsers.TextReader(src, **kwds) 1854 self.unnamed_cols = self._reader.unnamed_cols 1855 pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__() EmptyDataError: No columns to parse from file
infile.describe()
Scan number | Retention time | Ion injection time | Total ion current | |
---|---|---|---|---|
count | 3.100391e+07 | 3.100391e+07 | 5.982516e+06 | 3.100391e+07 |
mean | 6.336763e+04 | 1.336692e+02 | 1.278913e+02 | 1.942007e+06 |
std | 3.707307e+04 | 6.459373e+01 | 1.032622e+02 | 1.095132e+07 |
min | 4.000000e+00 | 9.487000e-03 | 6.800000e-02 | 5.555700e+03 |
25% | 3.248500e+04 | 7.964500e+01 | 4.316800e+01 | 1.766300e+05 |
50% | 6.089000e+04 | 1.319300e+02 | 8.731700e+01 | 5.606600e+05 |
75% | 9.090300e+04 | 1.866300e+02 | 2.115000e+02 | 1.566200e+06 |
max | 1.680000e+05 | 2.650400e+02 | 3.000000e+02 | 7.168800e+09 |
#rawfile = MSFileReader("myfile.raw")
from tqdm import tqdm_notebook
from pymsfilereader import MSFileReader
#rawfile = MSFileReader("myfile.raw")
#RawReader.py
rawfile=MSFileReader.MSFileReader(os.path.join(RAW_FILE_PATH, raw_file_name))
with open('test.tsv', 'wt') as f:
print('\t'.join(map(str, ('scan_number',
'RetentionTime',
'scan_number',
'GetFilterForScanNum(i)',
'GetMSOrderForScanNum(i)',
'GetNumberOfMSOrdersFromScanNum(i)',
'GetScanTypeForScanNum(i)',
'GetDetectorTypeForScanNum(i)',
'GetMassAnalyzerTypeForScanNum(i)',
'GetActivationTypeForScanNum(i,MSOrder=2)',
'IsProfileScanForScanNum(i)',
'IsCentroidScanForScanNum(i)',
'GetIsolationWidthForScanNum(i,MSOrder=1)',
'GetCollisionEnergyForScanNum(i,MSOrder=1)',
'GetPrecursorInfoFromScanNum(i)',
'GetMassCalibrationValueFromScanNum(i,massCalibrationIndex=0)',
'GetScanEventForScanNum(i)',
'GetSegmentAndEventForScanNum(i)',
'GetCycleNumberFromScanNumber(i)',
'GetAValueFromScanNum(i)',
'GetBValueFromScanNum(i)',
'GetKValueFromScanNum(i)',
'GetRValueFromScanNum(i)',
'GetVValueFromScanNum(i)',
'GetMSXMultiplexValueFromScanNum(i)',
'GetCompoundNameFromScanNum(i)',
'GetNumberOfMassRangesFromScanNum(i)',
'GetMassRangeFromScanNum(i,0)',
'GetMassRangeFromScanNum(i,1)',
'GetNumberOfSourceFragmentsFromScanNum(i)',
'GetSourceFragmentValueFromScanNum(i,0)',
'GetNumberOfSourceFragmentationMassRangesFromScanNum(i)'
))), file=f)
for i in tqdm_notebook(range(rawfile.FirstSpectrumNumber, rawfile.LastSpectrumNumber + 1)):
print('\t'.join(map(str, (i,
rawfile.RTFromScanNum(i),
rawfile.ScanNumFromRT(rawfile.RTFromScanNum(i)),
rawfile.GetFilterForScanNum(i),
rawfile.GetMSOrderForScanNum(i),
rawfile.GetNumberOfMSOrdersFromScanNum(i),
rawfile.GetScanTypeForScanNum(i),
rawfile.GetDetectorTypeForScanNum(i),
rawfile.GetMassAnalyzerTypeForScanNum(i),
rawfile.GetActivationTypeForScanNum(i, MSOrder=2),
rawfile.IsProfileScanForScanNum(i),
rawfile.IsCentroidScanForScanNum(i),
'none',
#rawfile.GetIsolationWidthForScanNum(i, MSOrder=1),
'none',
#rawfile.GetCollisionEnergyForScanNum(i, MSOrder=1),
rawfile.GetPrecursorInfoFromScanNum(i),
rawfile.GetMassCalibrationValueFromScanNum(i, massCalibrationIndex=0),
rawfile.GetScanEventForScanNum(i),
rawfile.GetSegmentAndEventForScanNum(i),
rawfile.GetCycleNumberFromScanNumber(i),
rawfile.GetAValueFromScanNum(i),
rawfile.GetBValueFromScanNum(i),
rawfile.GetKValueFromScanNum(i),
rawfile.GetRValueFromScanNum(i),
rawfile.GetVValueFromScanNum(i),
rawfile.GetMSXMultiplexValueFromScanNum(i),
rawfile.GetCompoundNameFromScanNum(i),
rawfile.GetNumberOfMassRangesFromScanNum(i),
rawfile.GetMassRangeFromScanNum(i, 0),
rawfile.GetMassRangeFromScanNum(i, 1),
rawfile.GetNumberOfSourceFragmentsFromScanNum(i),
rawfile.GetSourceFragmentValueFromScanNum(i, 0),
rawfile.GetNumberOfSourceFragmentationMassRangesFromScanNum(i)
))), file=f)
rawfile.Close()
E:\miniconda3\envs\myEnv\lib\site-packages\ipykernel_launcher.py:42: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0 Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
HBox(children=(FloatProgress(value=0.0, max=125873.0), HTML(value='')))