import numpy as np
import pandas as pd
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
pd.options.display.max_columns = 20
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc("figure", figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)
data = pd.Series(np.random.uniform(size=9),
index=[["a", "a", "a", "b", "b", "c", "c", "d", "d"],
[1, 2, 3, 1, 3, 1, 2, 2, 3]])
data
data.index
data["b"]
data["b":"c"]
data.loc[["b", "d"]]
data.loc[:, 2]
data.unstack()
data.unstack().stack()
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
index=[["a", "a", "b", "b"], [1, 2, 1, 2]],
columns=[["Ohio", "Ohio", "Colorado"],
["Green", "Red", "Green"]])
frame
frame.index.names = ["key1", "key2"]
frame.columns.names = ["state", "color"]
frame
frame.index.nlevels
frame["Ohio"]
frame.swaplevel("key1", "key2")
frame.sort_index(level=1)
frame.swaplevel(0, 1).sort_index(level=0)
frame.groupby(level="key2").sum()
frame.groupby(level="color", axis="columns").sum()
frame = pd.DataFrame({"a": range(7), "b": range(7, 0, -1),
"c": ["one", "one", "one", "two", "two",
"two", "two"],
"d": [0, 1, 2, 0, 1, 2, 3]})
frame
frame2 = frame.set_index(["c", "d"])
frame2
frame.set_index(["c", "d"], drop=False)
frame2.reset_index()
df1 = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "a", "b"],
"data1": pd.Series(range(7), dtype="Int64")})
df2 = pd.DataFrame({"key": ["a", "b", "d"],
"data2": pd.Series(range(3), dtype="Int64")})
df1
df2
pd.merge(df1, df2)
pd.merge(df1, df2, on="key")
df3 = pd.DataFrame({"lkey": ["b", "b", "a", "c", "a", "a", "b"],
"data1": pd.Series(range(7), dtype="Int64")})
df4 = pd.DataFrame({"rkey": ["a", "b", "d"],
"data2": pd.Series(range(3), dtype="Int64")})
pd.merge(df3, df4, left_on="lkey", right_on="rkey")
pd.merge(df1, df2, how="outer")
pd.merge(df3, df4, left_on="lkey", right_on="rkey", how="outer")
df1 = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"],
"data1": pd.Series(range(6), dtype="Int64")})
df2 = pd.DataFrame({"key": ["a", "b", "a", "b", "d"],
"data2": pd.Series(range(5), dtype="Int64")})
df1
df2
pd.merge(df1, df2, on="key", how="left")
pd.merge(df1, df2, how="inner")
left = pd.DataFrame({"key1": ["foo", "foo", "bar"],
"key2": ["one", "two", "one"],
"lval": pd.Series([1, 2, 3], dtype='Int64')})
right = pd.DataFrame({"key1": ["foo", "foo", "bar", "bar"],
"key2": ["one", "one", "one", "two"],
"rval": pd.Series([4, 5, 6, 7], dtype='Int64')})
pd.merge(left, right, on=["key1", "key2"], how="outer")
pd.merge(left, right, on="key1")
pd.merge(left, right, on="key1", suffixes=("_left", "_right"))
left1 = pd.DataFrame({"key": ["a", "b", "a", "a", "b", "c"],
"value": pd.Series(range(6), dtype="Int64")})
right1 = pd.DataFrame({"group_val": [3.5, 7]}, index=["a", "b"])
left1
right1
pd.merge(left1, right1, left_on="key", right_index=True)
pd.merge(left1, right1, left_on="key", right_index=True, how="outer")
lefth = pd.DataFrame({"key1": ["Ohio", "Ohio", "Ohio",
"Nevada", "Nevada"],
"key2": [2000, 2001, 2002, 2001, 2002],
"data": pd.Series(range(5), dtype="Int64")})
righth_index = pd.MultiIndex.from_arrays(
[
["Nevada", "Nevada", "Ohio", "Ohio", "Ohio", "Ohio"],
[2001, 2000, 2000, 2000, 2001, 2002]
]
)
righth = pd.DataFrame({"event1": pd.Series([0, 2, 4, 6, 8, 10], dtype="Int64",
index=righth_index),
"event2": pd.Series([1, 3, 5, 7, 9, 11], dtype="Int64",
index=righth_index)})
lefth
righth
pd.merge(lefth, righth, left_on=["key1", "key2"], right_index=True)
pd.merge(lefth, righth, left_on=["key1", "key2"],
right_index=True, how="outer")
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
index=["a", "c", "e"],
columns=["Ohio", "Nevada"]).astype("Int64")
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
index=["b", "c", "d", "e"],
columns=["Missouri", "Alabama"]).astype("Int64")
left2
right2
pd.merge(left2, right2, how="outer", left_index=True, right_index=True)
left2.join(right2, how="outer")
left1.join(right1, on="key")
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
index=["a", "c", "e", "f"],
columns=["New York", "Oregon"])
another
left2.join([right2, another])
left2.join([right2, another], how="outer")
arr = np.arange(12).reshape((3, 4))
arr
np.concatenate([arr, arr], axis=1)
s1 = pd.Series([0, 1], index=["a", "b"], dtype="Int64")
s2 = pd.Series([2, 3, 4], index=["c", "d", "e"], dtype="Int64")
s3 = pd.Series([5, 6], index=["f", "g"], dtype="Int64")
s1
s2
s3
pd.concat([s1, s2, s3])
pd.concat([s1, s2, s3], axis="columns")
s4 = pd.concat([s1, s3])
s4
pd.concat([s1, s4], axis="columns")
pd.concat([s1, s4], axis="columns", join="inner")
result = pd.concat([s1, s1, s3], keys=["one", "two", "three"])
result
result.unstack()
pd.concat([s1, s2, s3], axis="columns", keys=["one", "two", "three"])
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=["a", "b", "c"],
columns=["one", "two"])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=["a", "c"],
columns=["three", "four"])
df1
df2
pd.concat([df1, df2], axis="columns", keys=["level1", "level2"])
pd.concat({"level1": df1, "level2": df2}, axis="columns")
pd.concat([df1, df2], axis="columns", keys=["level1", "level2"],
names=["upper", "lower"])
df1 = pd.DataFrame(np.random.standard_normal((3, 4)),
columns=["a", "b", "c", "d"])
df2 = pd.DataFrame(np.random.standard_normal((2, 3)),
columns=["b", "d", "a"])
df1
df2
pd.concat([df1, df2], ignore_index=True)
a = pd.Series([np.nan, 2.5, 0.0, 3.5, 4.5, np.nan],
index=["f", "e", "d", "c", "b", "a"])
b = pd.Series([0., np.nan, 2., np.nan, np.nan, 5.],
index=["a", "b", "c", "d", "e", "f"])
a
b
np.where(pd.isna(a), b, a)
a.combine_first(b)
df1 = pd.DataFrame({"a": [1., np.nan, 5., np.nan],
"b": [np.nan, 2., np.nan, 6.],
"c": range(2, 18, 4)})
df2 = pd.DataFrame({"a": [5., 4., np.nan, 3., 7.],
"b": [np.nan, 3., 4., 6., 8.]})
df1
df2
df1.combine_first(df2)
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
index=pd.Index(["Ohio", "Colorado"], name="state"),
columns=pd.Index(["one", "two", "three"],
name="number"))
data
result = data.stack()
result
result.unstack()
result.unstack(level=0)
result.unstack(level="state")
s1 = pd.Series([0, 1, 2, 3], index=["a", "b", "c", "d"], dtype="Int64")
s2 = pd.Series([4, 5, 6], index=["c", "d", "e"], dtype="Int64")
data2 = pd.concat([s1, s2], keys=["one", "two"])
data2
data2.unstack()
data2.unstack().stack()
data2.unstack().stack(dropna=False)
df = pd.DataFrame({"left": result, "right": result + 5},
columns=pd.Index(["left", "right"], name="side"))
df
df.unstack(level="state")
df.unstack(level="state").stack(level="side")
data = pd.read_csv("examples/macrodata.csv")
data = data.loc[:, ["year", "quarter", "realgdp", "infl", "unemp"]]
data.head()
periods = pd.PeriodIndex(year=data.pop("year"),
quarter=data.pop("quarter"),
name="date")
periods
data.index = periods.to_timestamp("D")
data.head()
data = data.reindex(columns=["realgdp", "infl", "unemp"])
data.columns.name = "item"
data.head()
long_data = (data.stack()
.reset_index()
.rename(columns={0: "value"}))
long_data[:10]
pivoted = long_data.pivot(index="date", columns="item",
values="value")
pivoted.head()
long_data.index.name = None
long_data["value2"] = np.random.standard_normal(len(long_data))
long_data[:10]
pivoted = long_data.pivot(index="date", columns="item")
pivoted.head()
pivoted["value"].head()
unstacked = long_data.set_index(["date", "item"]).unstack(level="item")
unstacked.head()
df = pd.DataFrame({"key": ["foo", "bar", "baz"],
"A": [1, 2, 3],
"B": [4, 5, 6],
"C": [7, 8, 9]})
df
melted = pd.melt(df, id_vars="key")
melted
reshaped = melted.pivot(index="key", columns="variable",
values="value")
reshaped
reshaped.reset_index()
pd.melt(df, id_vars="key", value_vars=["A", "B"])
pd.melt(df, value_vars=["A", "B", "C"])
pd.melt(df, value_vars=["key", "A", "B"])