Yeah, the line that failed was "%time tmpDf=pd.read_pickle('{}/{}.pickle.gz'.format(base_mergedBySRR_dir,chunkSize))".
import pandas as pd
import numpy as np
from tqdm import tqdm
uploadDir='/cellar/users/btsui/Data/merged/snp/hg38/mergedBySRR/'  # directory holding the mergedBySRR pickle chunks
# RuntimeWarning emitted on import:
# /cellar/users/btsui/anaconda3/envs/deep_nlp_cpu/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
#   return f(*args, **kwds)
tmpDir='/cellar/users/btsui/Data/merged/snp/hg38/mergedBySRR/100000.pickle.gz'
tmpDf=pd.read_pickle(tmpDir)  # load the 100000 chunk directly by its full path
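Before converting formats it's worth a quick sanity check that the chunk loaded intact; this is just a sketch and assumes nothing about the column names.
print(tmpDf.shape)                                    # rows x columns in this 100000 chunk
print(tmpDf.dtypes)                                   # dtypes matter for the HDF5/parquet export
print(tmpDf.memory_usage(deep=True).sum()/1e9, 'GB')  # rough in-memory footprint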
"""
library(rhdf5)
mydata <- h5read("/tmp/100000.hdf5", "/mygroup/mydata")
"""
#!rm -f /tmp/100000.hdf5   # drop any stale copy first
tmpDf.to_hdf('/tmp/100000.hdf5',format='fixed',key='chunk')  # export the chunk as HDF5 so R can read it with rhdf5
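To confirm which paths the R side should ask for, the file that was just written can be walked with h5py (a sketch; assumes h5py is available in the same env).
import h5py
with h5py.File('/tmp/100000.hdf5','r') as f:
    f.visititems(lambda name, obj: print(name, obj))  # expect chunk/axis0, chunk/axis1, chunk/block0_values, ...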
#!chmod 400 /tmp/100000.hdf5   # optionally make the export read-only
#pd.read_hdf('/tmp/100000.hdf5',key='chunk')   # sanity check: read it back in pandas
#tmpDf.to_parquet('/tmp/100000.parquet',engine='pyarrow')   # parquet alternative; see the sketch below
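If the fixed-format HDF5 layout turns out awkward on the R side, the commented-out parquet route is easy to check; a sketch of the round trip, assuming pyarrow is installed (R could then read the file with the arrow package):
tmpDf.to_parquet('/tmp/100000.parquet', engine='pyarrow')
roundTrip=pd.read_parquet('/tmp/100000.parquet', engine='pyarrow')
pd.testing.assert_frame_equal(tmpDf, roundTrip)  # raises if anything changed in the round trip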
#https://stackoverflow.com/questions/37010212/what-is-the-fastest-way-to-upload-a-big-csv-file-in-notebook-to-work-with-python/37012035#37012035