%autosave 10
Autosaving every 10 seconds
import numpy as np
import biggus

# Wrap an ordinary in-memory NumPy array in a lazy biggus adapter.
# np.int32: the original bare `int32` is undefined (NameError).
np_array = np.empty((700, 200), dtype=np.int32)
arr = biggus.NumpyArrayAdapter(np_array)
print(arr)
<NumpyArrayAdapter shape=(700, 200) dtype=dtype('int32')>
# np.concatenate equivalent: join two lazy arrays along axis 0,
# still without touching the underlying data.
bigger_arr = biggus.LinearMosaic([arr, arr], axis=0)
print(bigger_arr)
<LinearMosaic shape=(1400, 200) dtype=dtype('int32')>
# no memory copying: a mosaic of 40 views stays lazy,
# so building a (28000, 200) array costs nothing up front.
print(biggus.LinearMosaic([arr, arr] * 20, axis=0))
<LinearMosaic shape=(28000, 200) dtype=dtype('int32')>
# new dimension
# ArrayStack joins arrays along a *new* leading axis (cf. np.stack);
# note it takes an object ndarray of biggus arrays, not a plain list.
biggus.ArrayStack(np.array([arr, arr]))
<ArrayStack shape=(2, 700, 200) dtype=dtype('int32')>
import h5py

# mode='r': we only read; h5py's default mode may create/append.
hdf_dataset = h5py.File('data.hdf5', mode='r')['arange']
# this is lazy; no data is loaded
arr_hdf = biggus.NumpyArrayAdapter(hdf_dataset)
# NOTE(review): the original re-printed `arr`; printing the newly
# created HDF5-backed adapter matches the narrative — confirm.
print(arr_hdf)
# LinearMosaic can combine HDF5-backed and regular in-memory arrays.
bigger_arr = biggus.LinearMosaic([bigger_arr, arr_hdf], axis=0)
print bigger_arr
The ndarray() method realizes arrays and brings them into memory.
type(bigger_arr.ndarray()), bigger_arr.ndarray().shape
(numpy.ndarray, (1400, 200))
You can do basic processing on massive arrays in chunks.
# These operations don't run when you do this —
# aggregations are built lazily; nothing is computed yet.
mean = biggus.mean(bigger_arr, axis=0)
std = biggus.std(bigger_arr, axis=0)
print(mean)
<_Aggregation shape=(200,) dtype=dtype('float64')>
# _now_ we realize it: mean.ndarray() triggers the calculation.
# Done in chunks, so the data is never all in memory at once.
print(np.all(mean.ndarray() == bigger_arr.ndarray().mean(axis=0)))
True
Really, though, since you go chunk-by-chunk, you want to perform many operations in the same pass.
# This realizes both results in a single pass: biggus chunks the
# array into sub-arrays and aggregates mean and std together.
mean_np, std_np = biggus.ndarrays([mean, std])
print(type(mean_np))
<type 'numpy.ndarray'>
import h5py

# Stream the lazy result straight into an HDF5 dataset, chunk by
# chunk. The statements below must be indented inside the with-block
# (the original had them at top level — a syntax error).
with h5py.File('result.hdf5', mode='w') as f_out:
    df = f_out.create_dataset('my_result', mean.shape, mean.dtype)
    biggus.save([mean], [df])
!!AI: see the accompanying video for the rest of the demo.