In [1]:
import pandas as pd
import numpy as np
In [2]:
# Benchmark sizes, kept as Python ints: the float literals 10e6 / 1e6 are
# rejected by current NumPy when used as array sizes.
n = 10 * 10**6  # number of rows in the main frame
m = 10**6       # exclusive upper bound of the integer keys in column "x"
In [3]:
%time  d = pd.DataFrame({"x": np.random.randint(0,m,n), "y": np.random.random(n)})
CPU times: user 336 ms, sys: 144 ms, total: 480 ms
Wall time: 482 ms

Filter

In [4]:
%time  dd = d[(d.x>=10) & (d.x<20)]
CPU times: user 96 ms, sys: 20 ms, total: 116 ms
Wall time: 115 ms

Sort

In [5]:
%time  dd = d.sort("x")
CPU times: user 3.49 s, sys: 264 ms, total: 3.76 s
Wall time: 3.77 s

New column

In [6]:
%time  dd = d.copy()
CPU times: user 24 ms, sys: 72 ms, total: 96 ms
Wall time: 96.9 ms
In [7]:
%time  dd["y2"] = 2*dd["y"]
CPU times: user 60 ms, sys: 88 ms, total: 148 ms
Wall time: 64.9 ms

Aggregate

In [8]:
%time  dd = d.groupby("x")["y"].mean()
CPU times: user 1.3 s, sys: 156 ms, total: 1.46 s
Wall time: 1.46 s
In [9]:
# Inspect the result type: aggregating a single selected column gives a Series here.
type(dd)
Out[9]:
pandas.core.series.Series
In [10]:
%time  dd = d.groupby("x", as_index = False)["y"].mean()
CPU times: user 1.39 s, sys: 140 ms, total: 1.53 s
Wall time: 1.53 s
In [11]:
# Inspect the result type: with as_index=False the aggregation comes back as a DataFrame.
type(dd)
Out[11]:
pandas.core.frame.DataFrame
In [12]:
%time  dk = d.sort_index(by = "x")
CPU times: user 3.48 s, sys: 240 ms, total: 3.72 s
Wall time: 3.72 s
In [13]:
%time  dd = dk.groupby("x")["y"].mean()
CPU times: user 284 ms, sys: 120 ms, total: 404 ms
Wall time: 402 ms
In [14]:
%time  dd = dk.groupby("x", as_index = False)["y"].mean()
CPU times: user 352 ms, sys: 132 ms, total: 484 ms
Wall time: 485 ms

Join

In [15]:
%time dm = pd.DataFrame({"x": np.random.permutation(np.arange(m))})
CPU times: user 176 ms, sys: 4 ms, total: 180 ms
Wall time: 179 ms
In [16]:
%time dd = pd.merge(d, dm)
CPU times: user 5.38 s, sys: 504 ms, total: 5.88 s
Wall time: 5.89 s
In [17]:
%time  dmk = dm.sort_index(by = "x")
CPU times: user 212 ms, sys: 4 ms, total: 216 ms
Wall time: 217 ms
In [18]:
%time dd = pd.merge(dk, dmk)
CPU times: user 1.78 s, sys: 380 ms, total: 2.16 s
Wall time: 2.16 s