#!mv /cellar/users/btsui/0.pickle.gz .
import pandas as pd
import numpy as np
#!mv /cellar/users/btsui/1000.pickle.gz .
# Load the pickled DataFrame from the working directory.
# NOTE(review): unpickling executes whatever the file contains -- only load
# pickles that come from a trusted source.
tmpDf = pd.read_pickle('1000.pickle.gz')
#resetDf.Run.value_counts()
# Turn the index levels back into ordinary columns so they can be re-typed.
resetDf = tmpDf.reset_index()  # append .iloc[:100] for a quick trial run
#resetDf.columns.astype('category')
# Cast only the 'Chr' column to str.  The earlier `resetDf.astype(np.str)`
# converted every column of the whole frame (the call that was interrupted in
# the traceback recorded for this cell) and its result is a full DataFrame,
# not one column's worth of data.
# NOTE(review): assumes 'Chr' is one of the columns produced by reset_index().
resetDf['Chr'] = resetDf['Chr'].astype(str)
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-5-91814d716c5b> in <module>() ----> 1 resetDf['Chr']=resetDf.astype(np.str) /cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/util/_decorators.pyc in wrapper(*args, **kwargs) 89 else: 90 kwargs[new_arg_name] = new_arg_value ---> 91 return func(*args, **kwargs) 92 return wrapper 93 return _deprecate_kwarg /cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/core/generic.pyc in astype(self, dtype, copy, errors, **kwargs) 3408 # else, only a single dtype is given 3409 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors, -> 3410 **kwargs) 3411 return self._constructor(new_data).__finalize__(self) 3412 /cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/core/internals.pyc in astype(self, dtype, **kwargs) 3222 3223 def astype(self, dtype, **kwargs): -> 3224 return self.apply('astype', dtype=dtype, **kwargs) 3225 3226 def convert(self, **kwargs): /cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/core/internals.pyc in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs) 3089 3090 kwargs['mgr'] = self -> 3091 applied = getattr(b, f)(**kwargs) 3092 result_blocks = _extend_blocks(applied, result_blocks) 3093 /cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/core/internals.pyc in astype(self, dtype, copy, errors, values, **kwargs) 469 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs): 470 return self._astype(dtype, copy=copy, errors=errors, values=values, --> 471 **kwargs) 472 473 def _astype(self, dtype, copy=False, errors='raise', values=None, /cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/core/internals.pyc in _astype(self, dtype, copy, errors, values, klass, mgr, raise_on_error, **kwargs) 519 520 # _astype_nansafe works fine with 1-d only --> 521 values = astype_nansafe(values.ravel(), 
dtype, copy=True) 522 values = values.reshape(self.shape) 523 /cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/core/dtypes/cast.pyc in astype_nansafe(arr, dtype, copy) 583 return lib.astype_unicode(arr.ravel()).reshape(arr.shape) 584 elif issubclass(dtype.type, string_types): --> 585 return lib.astype_str(arr.ravel()).reshape(arr.shape) 586 elif is_datetime64_dtype(arr): 587 if dtype == object: pandas/_libs/lib.pyx in pandas._libs.lib.astype_str() pandas/_libs/lib.pyx in pandas._libs.lib.astype_str() /cellar/users/btsui/anaconda2/lib/python2.7/site-packages/numpy/core/numeric.pyc in array_str(a, max_line_width, precision, suppress_small) 1867 1868 """ -> 1869 return array2string(a, max_line_width, precision, suppress_small, ' ', "", str) 1870 1871 def set_string_function(f, repr=True): /cellar/users/btsui/anaconda2/lib/python2.7/site-packages/numpy/core/arrayprint.pyc in array2string(a, max_line_width, precision, suppress_small, separator, prefix, style, formatter) 436 437 if a.shape == (): --> 438 x = a.item() 439 if isinstance(x, tuple): 440 x = _convert_arrays(x) KeyboardInterrupt:
# Store positions as unsigned 32-bit integers.
resetDf['Pos'] = resetDf['Pos'].astype(np.uint32)
# Number of rows per 'Chr' label.
VC = resetDf['Chr'].value_counts()
# Distinct chromosome labels as strings, sorted lexicographically.
# Builtin `str` is used instead of the `np.str` alias, which NumPy deprecated
# in 1.20 and removed in 1.24; the two are the same object where the alias
# still exists, so the result is unchanged.
myChrFactors = sorted(VC.index.astype(str).unique())
len(myChrFactors)
25
# Display the sorted list of chromosome labels.
myChrFactors
['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '3', '4', '5', '6', '7', '8', '9', 'MT', 'X', 'Y']
###
# Convert the label columns to pandas categoricals, one column at a time.
# (The loop body had lost its indentation, which is a syntax error as written.)
for attrib in ('Chr', 'base', 'Run'):
    resetDf[attrib] = resetDf[attrib].astype('category')
# Rebuild the four-level (Chr, Pos, base, Run) index once all three columns
# have been converted.
cleanMultiDf = resetDf.set_index([u'Chr', u'Pos', u'base', u'Run'])
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-55-65df979f9bd2> in <module>() 1 ### 2 resetDf=tmpDf.reset_index()#.iloc[:100] ----> 3 resetDf['Chr']=resetDf.astype('str') 4 resetDf['Pos']=resetDf['Pos'].astype(np.uint32) 5 for attrib in ['Chr','base','Run']: /cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/util/_decorators.pyc in wrapper(*args, **kwargs) 89 else: 90 kwargs[new_arg_name] = new_arg_value ---> 91 return func(*args, **kwargs) 92 return wrapper 93 return _deprecate_kwarg /cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/core/generic.pyc in astype(self, dtype, copy, errors, **kwargs) 3408 # else, only a single dtype is given 3409 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors, -> 3410 **kwargs) 3411 return self._constructor(new_data).__finalize__(self) 3412 /cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/core/internals.pyc in astype(self, dtype, **kwargs) 3222 3223 def astype(self, dtype, **kwargs): -> 3224 return self.apply('astype', dtype=dtype, **kwargs) 3225 3226 def convert(self, **kwargs): /cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/core/internals.pyc in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs) 3089 3090 kwargs['mgr'] = self -> 3091 applied = getattr(b, f)(**kwargs) 3092 result_blocks = _extend_blocks(applied, result_blocks) 3093 /cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/core/internals.pyc in astype(self, dtype, copy, errors, values, **kwargs) 469 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs): 470 return self._astype(dtype, copy=copy, errors=errors, values=values, --> 471 **kwargs) 472 473 def _astype(self, dtype, copy=False, errors='raise', values=None, /cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/core/internals.pyc in _astype(self, dtype, copy, errors, values, 
klass, mgr, raise_on_error, **kwargs) 519 520 # _astype_nansafe works fine with 1-d only --> 521 values = astype_nansafe(values.ravel(), dtype, copy=True) 522 values = values.reshape(self.shape) 523 /cellar/users/btsui/anaconda2/lib/python2.7/site-packages/pandas/core/dtypes/cast.pyc in astype_nansafe(arr, dtype, copy) 583 return lib.astype_unicode(arr.ravel()).reshape(arr.shape) 584 elif issubclass(dtype.type, string_types): --> 585 return lib.astype_str(arr.ravel()).reshape(arr.shape) 586 elif is_datetime64_dtype(arr): 587 if dtype == object: pandas/_libs/lib.pyx in pandas._libs.lib.astype_str() pandas/_libs/lib.pyx in pandas._libs.lib.astype_str() /cellar/users/btsui/anaconda2/lib/python2.7/site-packages/numpy/core/numeric.pyc in array_str(a, max_line_width, precision, suppress_small) 1833 return cName + "(%s, %sdtype=%s)" % (lst, lf, typename) 1834 -> 1835 def array_str(a, max_line_width=None, precision=None, suppress_small=None): 1836 """ 1837 Return a string representation of the data in an array. KeyboardInterrupt:
# Inspect the per-column dtypes of the flattened frame.
resetDf.dtypes
# Number of rows per 'Chr' value.
resetDf['Chr'].value_counts()
resetDf.dtypes
# Keep the per-'Chr' counts so their index can be examined below.
VC=resetDf.Chr.value_counts()
#pd.Series(['a', 'b', 'c'], dtype='|S1')
# The distinct 'Chr' labels that carry a count.
VC.index
2 4603018 1 3739335 17 3591853 6 3325063 11 3025210 16 2528363 3 2484969 5 2444334 7 2313831 X 2184393 12 2147656 19 2143306 10 1850144 9 1849974 13 1795397 15 1534249 8 1366216 14 1238826 4 1225138 22 927637 20 794814 18 723331 MT 701180 21 528329 Y 53161 Name: Chr, dtype: int64
# Re-index the flattened frame by (Chr, Pos, base, Run).
cleanMultiDf = resetDf.set_index(keys=[u'Chr', u'Pos', u'base', u'Run'])
# Frame to persist; slice it with .iloc[:1000] for a quick trial.
testSubDf = cleanMultiDf
# Reminder of an alternative call shape:
#   'nameoffile.h5', 'key_to_store', table=True, mode='a'
# Write the same frame to ./0.h5 twice: first under key '0' with mode='w',
# then under key '1' with mode='a'.
# NOTE(review): no complevel is passed -- confirm that complib='zlib' alone
# actually enables compression on the pandas version in use.
testSubDf.to_hdf(
    './0.h5',
    key='0',
    mode='w',
    format='fixed',
    complib='zlib',
)
testSubDf.to_hdf(
    './0.h5',
    key='1',
    mode='a',
    format='fixed',
    complib='zlib',
)
#244/3
#create a query for chunks
#129K
#1.7GB for table
#565M for unfixed
#85M for the input pandas pickle
#!rm ./0.h5.gz
#!zip
#when increasing the size, does it help improving the speed?
#163M for combining two times
!ls -lah ./0.h5
-rw-r--r-- 1 btsui users 163M Jan 3 16:56 ./0.h5
!ls -lah 1000.pickle.gz
-rw-r--r-- 1 btsui users 85M Jan 3 16:16 1000.pickle.gz
"""
is hdf5 appendtable,
It's already at 1.6gb
1.7 gb for the data
"""
(49119727, 2)
"""
how big is the h5 object?
"""
# Read back only the rows whose 'Run' equals ERR243068 from ./0.h5.
# This rebinds tmpDf, replacing the frame that was loaded from the pickle.
# NOTE(review): no key is passed and the filter is a `where` string --
# presumably this needs a store holding a single dataset written in table
# format with 'Run' queryable; confirm against the cell that wrote the file.
tmpDf=pd.read_hdf('./0.h5',where='Run=ERR243068')
# Display the filtered frame.
tmpDf
ReadDepth | AverageBaseQuality | ||||
---|---|---|---|---|---|
Chr | Pos | base | Run | ||
1 | 14727.0 | A | ERR243068 | 1 | 22 |
630825.0 | T | ERR243068 | 7 | 37 |
#4:23
!grep -rn multiprocessing ./*.ipynb
./ParseBamReadCount_base_case.ipynb:27: "from multiprocessing import Pool\n", ./ParseBamReadCount_base_case.ipynb:167: " from multiprocessing import Pool\n", ./ParseBamReadCount_base_case.ipynb:592: "from multiprocessing import Pool\n",