import sys
# Put the parent directory first on the module search path before importing
# zarr -- presumably so the local development checkout is picked up instead
# of an installed release (TODO confirm against the repository layout).
sys.path.insert(0, '..')
import zarr
zarr.__version__
'2.0.2.dev0+dirty'
# Open the zipped haplotypes archive read-only, wrap it in a zarr group and
# pick out the genotype call array stored under 3L/calldata/genotype.
zip_path = '/data/coluzzi/ag1000g/data/phase1/release/AR3.1/haplotypes/main/zarr2/zstd/ag1000g.phase1.ar3.1.haplotypes.zip'
store = zarr.ZipStore(zip_path, mode='r')
grp = zarr.Group(store)
z = grp['3L/calldata/genotype']
z
Array(/3L/calldata/genotype, (7449486, 773, 2), int8, chunks=(13107, 40, 2), order=C) nbytes: 10.7G; nbytes_stored: 193.5M; ratio: 56.7; initialized: 11380/11380 compressor: Blosc(cname='zstd', clevel=1, shuffle=2) store: ZipStore
import cProfile
# Profile reading the first 10 rows of the ZipStore-backed array, with the
# report ordered by cumulative time so the costliest call paths are listed
# first.
cProfile.run('z[:10]', sort='cumtime')
1832 function calls in 0.024 seconds Ordered by: cumulative time ncalls tottime percall cumtime percall filename:lineno(function) 1 0.000 0.000 0.024 0.024 {built-in method builtins.exec} 1 0.000 0.000 0.024 0.024 <string>:1(<module>) 1 0.000 0.000 0.024 0.024 core.py:292(__getitem__) 20 0.000 0.000 0.023 0.001 core.py:539(_chunk_getitem) 20 0.000 0.000 0.020 0.001 core.py:679(_decode_chunk) 20 0.000 0.000 0.020 0.001 codecs.py:355(decode) 20 0.020 0.001 0.020 0.001 {zarr.blosc.decompress} 20 0.000 0.000 0.002 0.000 storage.py:766(__getitem__) 20 0.000 0.000 0.001 0.000 zipfile.py:1235(open) 20 0.000 0.000 0.001 0.000 zipfile.py:821(read) 20 0.000 0.000 0.001 0.000 zipfile.py:901(_read1) 80 0.000 0.000 0.001 0.000 zipfile.py:660(read) 20 0.000 0.000 0.000 0.000 zipfile.py:854(_update_crc) 40 0.000 0.000 0.000 0.000 {built-in method zlib.crc32} 80 0.000 0.000 0.000 0.000 {method 'read' of '_io.BufferedReader' objects} 20 0.000 0.000 0.000 0.000 zipfile.py:937(_read2) 80 0.000 0.000 0.000 0.000 core.py:390(<genexpr>) 20 0.000 0.000 0.000 0.000 zipfile.py:953(close) 20 0.000 0.000 0.000 0.000 {method 'reshape' of 'numpy.ndarray' objects} 20 0.000 0.000 0.000 0.000 util.py:106(is_total_slice) 20 0.000 0.000 0.000 0.000 zipfile.py:708(__init__) 20 0.000 0.000 0.000 0.000 {method 'decode' of 'bytes' objects} 20 0.000 0.000 0.000 0.000 core.py:676(_chunk_key) 80 0.000 0.000 0.000 0.000 {method 'seek' of '_io.BufferedReader' objects} 20 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.frombuffer} 80 0.000 0.000 0.000 0.000 core.py:398(<genexpr>) 20 0.000 0.000 0.000 0.000 {method 'join' of 'str' objects} 20 0.000 0.000 0.000 0.000 core.py:386(<listcomp>) 20 0.000 0.000 0.000 0.000 {built-in method builtins.all} 40 0.000 0.000 0.000 0.000 util.py:121(<genexpr>) 231 0.000 0.000 0.000 0.000 {built-in method builtins.isinstance} 20 0.000 0.000 0.000 0.000 cp437.py:14(decode) 80 0.000 0.000 0.000 0.000 {method 'tell' of '_io.BufferedReader' objects} 20 0.000 0.000 
0.000 0.000 zipfile.py:667(close) 20 0.000 0.000 0.000 0.000 {built-in method _struct.unpack} 140 0.000 0.000 0.000 0.000 {built-in method builtins.max} 20 0.000 0.000 0.000 0.000 {function ZipExtFile.close at 0x7f8cd5ca2048} 20 0.000 0.000 0.000 0.000 zipfile.py:1194(getinfo) 140 0.000 0.000 0.000 0.000 {built-in method builtins.min} 20 0.000 0.000 0.000 0.000 threading.py:1224(current_thread) 20 0.000 0.000 0.000 0.000 zipfile.py:654(__init__) 1 0.000 0.000 0.000 0.000 util.py:195(get_chunk_range) 20 0.000 0.000 0.000 0.000 {built-in method _codecs.charmap_decode} 1 0.000 0.000 0.000 0.000 util.py:166(normalize_array_selection) 1 0.000 0.000 0.000 0.000 util.py:198(<listcomp>) 20 0.000 0.000 0.000 0.000 zipfile.py:1715(_fpclose) 20 0.000 0.000 0.000 0.000 {method 'get' of 'dict' objects} 63 0.000 0.000 0.000 0.000 {built-in method builtins.len} 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty} 2 0.000 0.000 0.000 0.000 util.py:182(<genexpr>) 20 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr} 20 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident} 1 0.000 0.000 0.000 0.000 util.py:130(normalize_axis_selection) 20 0.000 0.000 0.000 0.000 zipfile.py:636(_get_decompressor) 20 0.000 0.000 0.000 0.000 threading.py:1298(main_thread) 4 0.000 0.000 0.000 0.000 core.py:373(<genexpr>) 3 0.000 0.000 0.000 0.000 util.py:187(<genexpr>) 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
import dask
import dask.array as da
dask.__version__
'0.11.0'
# Wrap the ZipStore-backed zarr array as a dask array, reusing the zarr chunk
# shape as the dask chunk shape.
d = da.from_array(z, chunks=z.chunks)
d
dask.array<array-f..., shape=(7449486, 773, 2), dtype=int8, chunksize=(13107, 40, 2)>
%time d.sum(axis=1).compute()
CPU times: user 3min 35s, sys: 4.36 s, total: 3min 40s Wall time: 29.5 s
array([[3, 0], [1, 0], [2, 0], ..., [2, 8], [8, 8], [0, 1]])
# Baseline for comparison: open the same data through a DirectoryStore
# and select the same 3L/calldata/genotype array.
dir_path = '/data/coluzzi/ag1000g/data/phase1/release/AR3.1/haplotypes/main/zarr2/zstd/ag1000g.phase1.ar3.1.haplotypes'
store_dir = zarr.DirectoryStore(dir_path)
grp_dir = zarr.Group(store_dir)
z_dir = grp_dir['3L/calldata/genotype']
z_dir
Array(/3L/calldata/genotype, (7449486, 773, 2), int8, chunks=(13107, 40, 2), order=C) nbytes: 10.7G; nbytes_stored: 193.5M; ratio: 56.7; initialized: 11380/11380 compressor: Blosc(cname='zstd', clevel=1, shuffle=2) store: DirectoryStore
# Wrap the DirectoryStore-backed zarr array as a dask array, reusing the zarr
# chunk shape as the dask chunk shape.
d_dir = da.from_array(z_dir, chunks=z_dir.chunks)
d_dir
dask.array<array-7..., shape=(7449486, 773, 2), dtype=int8, chunksize=(13107, 40, 2)>
%time d_dir.sum(axis=1).compute()
CPU times: user 3min 39s, sys: 4.91 s, total: 3min 44s Wall time: 31.1 s
array([[3, 0], [1, 0], [2, 0], ..., [2, 8], [8, 8], [0, 1]])