%pylab inline
Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.kernel.zmq.pylab.backend_inline]. For more information, type 'help(pylab)'.
import numpy as np
import pandas as pd
import pylab as pl
from sklearn.utils import murmurhash3_32
# Synthetic dataset: n_samples normally distributed values, each tagged with a
# random integer group id drawn from [0, n_groups) and stored as int32.
n_groups = 100
n_samples = 100000
group_id = np.asarray(np.random.randint(n_groups, size=n_samples), dtype=np.int32)
data = np.random.normal(size=n_samples)
df = pd.DataFrame({'group_id': group_id, 'data': data})
# Time the groupby construction and the per-group sum as two separate steps.
%time grouped = df.groupby('group_id')
%time aggregate = grouped.sum()
# Display the first ten rows of the per-group sums.
aggregate[:10]
CPU times: user 253 µs, sys: 16 µs, total: 269 µs Wall time: 271 µs CPU times: user 4.9 ms, sys: 474 µs, total: 5.37 ms Wall time: 4.86 ms
data | |
---|---|
group_id | |
0 | 4.974845 |
1 | -50.376857 |
2 | 5.168091 |
3 | 6.355739 |
4 | 22.481601 |
5 | -0.748311 |
6 | -73.982681 |
7 | 4.263864 |
8 | 43.841003 |
9 | 14.636269 |
# Map every row to one of 4 buckets by hashing its group id with seed 0.
# The underlying int32 ndarray is passed explicitly: murmurhash3_32 handles
# ndarray keys, and a pandas Series is not guaranteed to be an ndarray subclass.
murmurhash3_32(df.group_id.values, 0) % 4
array([1, 2, 3, ..., 0, 1, 2], dtype=int32)
from IPython.parallel import Client
# Create an IPython.parallel client using its default arguments.
# NOTE(review): len(client) presumably reports the number of connected
# engines — confirm against the IPython.parallel Client documentation.
client = Client()
len(client)
4
from sklearn.utils import gen_even_slices
# Split range(100) into 4 even slices and pick the third one (index 2).
list(gen_even_slices(100, 4))[2]
slice(50, 75, None)
import numpy as np
# Draw ten million normally distributed samples and time an in-place sort.
n_samples = int(1e7)
a = np.random.normal(size=n_samples)
%time a.sort()
CPU times: user 1.02 s, sys: 1.87 ms, total: 1.02 s Wall time: 1.02 s
# Plot the now-sorted values of a, then draw a second sample b of the same
# size and time its in-place sort as well.
_ = pl.plot(a)
b = np.random.normal(size=n_samples)
%time b.sort()
CPU times: user 1e+03 ms, sys: 1.43 ms, total: 1 s Wall time: 1 s
# Concatenate the first half of each sorted array into c. Floor division keeps
# the slice bound an integer; under true division (Python 3, or
# `from __future__ import division`) `n_samples / 2` is a float, which is not
# a valid slice index.
c = np.r_[a[:n_samples // 2], b[:n_samples // 2]]
_ = pl.plot(c)
c.shape
(10000000,)
# Memory footprint of c expressed in megabytes (10**6 bytes).
c.nbytes / 1e6
80.0
# c consists of two already-sorted runs placed back to back; the timing
# recorded for this sort is lower than for the fully random arrays a and b.
%time c.sort()
CPU times: user 664 ms, sys: 1.61 ms, total: 666 ms Wall time: 665 ms
# NOTE(review): the recorded output shows this extension failing to load. The
# Cython magics may be registered under a different extension name (for
# example `cythonmagic` or `Cython`) depending on the installed IPython and
# Cython versions — confirm before relying on this cell.
%load_ext cython
The cython module is not an IPython extension.
# TODO: implement merge of sorted collections