import pandas as pd
from pandas import DataFrame, Series
import numpy as np
A Series is a one-dimensional, array-like object that holds a sequence of values together with an associated index of labels.
obj = Series([1, 2, 3, 4])
obj
0 1 1 2 2 3 3 4 dtype: int64
obj.values
array([1, 2, 3, 4])
obj.index
RangeIndex(start=0, stop=4, step=1)
obj2 = Series([1, 2, 3, 4], index = ["a", "b", "c", "d"])
obj2
a 1 b 2 c 3 d 4 dtype: int64
obj2["b"]
2
obj2[["a", "d"]]
a 1 d 4 dtype: int64
obj2["c"] = 100
obj2
a 1 b 2 c 100 d 4 dtype: int64
obj2[obj2.values > 50]
c 100 dtype: int64
np.exp(obj2)
a 2.718282e+00 b 7.389056e+00 c 2.688117e+43 d 5.459815e+01 dtype: float64
obj2[obj2 > 50]
c 100 dtype: int64
obj2 * 2
a 2 b 4 c 200 d 8 dtype: int64
pop_data = {"nevada" : 1200, "texas" : 1450, "newyork" : 20000, "ohio" : 9500}
pop_data
{'nevada': 1200, 'newyork': 20000, 'ohio': 9500, 'texas': 1450}
obj3 = pd.Series(pop_data)
obj3
nevada 1200 newyork 20000 ohio 9500 texas 1450 dtype: int64
obj3.index
Index(['nevada', 'newyork', 'ohio', 'texas'], dtype='object')
obj3.values
array([ 1200, 20000, 9500, 1450])
pd.isnull(obj3)
nevada False newyork False ohio False texas False dtype: bool
obj3.isnull()
nevada False newyork False ohio False texas False dtype: bool
obj3 + obj3
nevada 2400 newyork 40000 ohio 19000 texas 2900 dtype: int64
obj3.name = "population"
obj3.index.name = "state"
obj3
state nevada 1200 newyork 20000 ohio 9500 texas 1450 Name: population, dtype: int64
obj3.index = ["oregon", "virginia", "ohio", "texas"]
obj3
oregon 1200 virginia 20000 ohio 9500 texas 1450 Name: population, dtype: int64
A pandas DataFrame is a two-dimensional table that has both a row index and a column index.
data = {"state" : ["ohio", "oregon", "texas", "oregon", "texas"],
"pop" : [2000, 2002, 2003, 2002, 2003],
"year" : [1990, 1991, 1992, 1991, 1992]}
data
{'pop': [2000, 2002, 2003, 2002, 2003], 'state': ['ohio', 'oregon', 'texas', 'oregon', 'texas'], 'year': [1990, 1991, 1992, 1991, 1992]}
df = pd.DataFrame(data)
df
pop | state | year | |
---|---|---|---|
0 | 2000 | ohio | 1990 |
1 | 2002 | oregon | 1991 |
2 | 2003 | texas | 1992 |
3 | 2002 | oregon | 1991 |
4 | 2003 | texas | 1992 |
df.rename(columns = {"pop" : "population"})
population | state | year | |
---|---|---|---|
0 | 2000 | ohio | 1990 |
1 | 2002 | oregon | 1991 |
2 | 2003 | texas | 1992 |
3 | 2002 | oregon | 1991 |
4 | 2003 | texas | 1992 |
df.columns = ["popn", "states", "years"]
df
popn | states | years | |
---|---|---|---|
0 | 2000 | ohio | 1990 |
1 | 2002 | oregon | 1991 |
2 | 2003 | texas | 1992 |
3 | 2002 | oregon | 1991 |
4 | 2003 | texas | 1992 |
states = df["states"]
states
0 ohio 1 oregon 2 texas 3 oregon 4 texas Name: states, dtype: object
3 ** 2
9
states1 = df.states
states1
0 ohio 1 oregon 2 texas 3 oregon 4 texas Name: states, dtype: object
df2 = pd.DataFrame(data, columns = ["state", "pop", "year", "rainfall"])
df2
state | pop | year | rainfall | |
---|---|---|---|---|
0 | ohio | 2000 | 1990 | NaN |
1 | oregon | 2002 | 1991 | NaN |
2 | texas | 2003 | 1992 | NaN |
3 | oregon | 2002 | 1991 | NaN |
4 | texas | 2003 | 1992 | NaN |
df2.rainfall = 20
df2
state | pop | year | rainfall | |
---|---|---|---|---|
0 | ohio | 2000 | 1990 | 20 |
1 | oregon | 2002 | 1991 | 20 |
2 | texas | 2003 | 1992 | 20 |
3 | oregon | 2002 | 1991 | 20 |
4 | texas | 2003 | 1992 | 20 |
df2["rainfall"] = 15
df2
state | pop | year | rainfall | |
---|---|---|---|---|
0 | ohio | 2000 | 1990 | 15 |
1 | oregon | 2002 | 1991 | 15 |
2 | texas | 2003 | 1992 | 15 |
3 | oregon | 2002 | 1991 | 15 |
4 | texas | 2003 | 1992 | 15 |
df2.rainfall = np.arange(5)
df2
state | pop | year | rainfall | |
---|---|---|---|---|
0 | ohio | 2000 | 1990 | 0 |
1 | oregon | 2002 | 1991 | 1 |
2 | texas | 2003 | 1992 | 2 |
3 | oregon | 2002 | 1991 | 3 |
4 | texas | 2003 | 1992 | 4 |
# retrieve a row by its index label (.loc is label-based; use .iloc for positional access)
df2.loc[3]
state oregon pop 2002 year 1991 rainfall 3 Name: 3, dtype: object
val = Series([50, 60, 70], index = [0, 2, 4])
val
0 50 2 60 4 70 dtype: int64
df2.rainfall = val
df2
state | pop | year | rainfall | |
---|---|---|---|---|
0 | ohio | 2000 | 1990 | 50.0 |
1 | oregon | 2002 | 1991 | NaN |
2 | texas | 2003 | 1992 | 60.0 |
3 | oregon | 2002 | 1991 | NaN |
4 | texas | 2003 | 1992 | 70.0 |
df2["temperature"] = ""
df2
state | pop | year | rainfall | temperature | |
---|---|---|---|---|---|
0 | ohio | 2000 | 1990 | 50.0 | |
1 | oregon | 2002 | 1991 | NaN | |
2 | texas | 2003 | 1992 | 60.0 | |
3 | oregon | 2002 | 1991 | NaN | |
4 | texas | 2003 | 1992 | 70.0 | |
df2.temperature = df2.rainfall
df2
state | pop | year | rainfall | temperature | |
---|---|---|---|---|---|
0 | ohio | 2000 | 1990 | 50.0 | 50.0 |
1 | oregon | 2002 | 1991 | NaN | NaN |
2 | texas | 2003 | 1992 | 60.0 | 60.0 |
3 | oregon | 2002 | 1991 | NaN | NaN |
4 | texas | 2003 | 1992 | 70.0 | 70.0 |
del df2["temperature"]
df2
state | pop | year | rainfall | |
---|---|---|---|---|
0 | ohio | 2000 | 1990 | 50.0 |
1 | oregon | 2002 | 1991 | NaN |
2 | texas | 2003 | 1992 | 60.0 |
3 | oregon | 2002 | 1991 | NaN |
4 | texas | 2003 | 1992 | 70.0 |
# transpose
df2.T
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
state | ohio | oregon | texas | oregon | texas |
pop | 2000 | 2002 | 2003 | 2002 | 2003 |
year | 1990 | 1991 | 1992 | 1991 | 1992 |
rainfall | 50 | NaN | 60 | NaN | 70 |
df2.index
RangeIndex(start=0, stop=5, step=1)
df2.values
array([['ohio', 2000, 1990, 50.0], ['oregon', 2002, 1991, nan], ['texas', 2003, 1992, 60.0], ['oregon', 2002, 1991, nan], ['texas', 2003, 1992, 70.0]], dtype=object)
# from nested dictionaries
# pandas will interpret the outer dict keys as the columns and the inner keys as the row indices
nested_dict = {"nevada" : {"rainfall" : 20, "temp" : 30, "vote" : "republican"},
"ohio" : {},
"oregon" : {}}
nested_df = pd.DataFrame(nested_dict)
nested_df
nevada | ohio | oregon | |
---|---|---|---|
rainfall | 20 | NaN | NaN |
temp | 30 | NaN | NaN |
vote | republican | NaN | NaN |
nested_df.T
rainfall | temp | vote | |
---|---|---|---|
nevada | 20 | 30 | republican |
ohio | NaN | NaN | NaN |
oregon | NaN | NaN | NaN |
nested_df.index.name = "states"; nested_df.columns.name = "conditions"
nested_df
conditions | nevada | ohio | oregon |
---|---|---|---|
states | |||
rainfall | 20 | NaN | NaN |
temp | 30 | NaN | NaN |
vote | republican | NaN | NaN |
nested_df.columns
Index(['nevada', 'ohio', 'oregon'], dtype='object', name='conditions')
nested_df.values
array([[20, nan, nan], [30, nan, nan], ['republican', nan, nan]], dtype=object)
obj = pd.Series(range(4), index = ["a", "b", "c", "d"])
obj
a 0 b 1 c 2 d 3 dtype: int64
index = obj.index
index
Index(['a', 'b', 'c', 'd'], dtype='object')
index[1:]
Index(['b', 'c', 'd'], dtype='object')
np.arange(4)
array([0, 1, 2, 3])
index = pd.Index(np.arange(4))
index
Int64Index([0, 1, 2, 3], dtype='int64')
df2
state | pop | year | rainfall | |
---|---|---|---|---|
0 | ohio | 2000 | 1990 | 50.0 |
1 | oregon | 2002 | 1991 | NaN |
2 | texas | 2003 | 1992 | 60.0 |
3 | oregon | 2002 | 1991 | NaN |
4 | texas | 2003 | 1992 | 70.0 |
df2.describe()
pop | year | rainfall | |
---|---|---|---|
count | 5.000000 | 5.00000 | 3.0 |
mean | 2002.000000 | 1991.20000 | 60.0 |
std | 1.224745 | 0.83666 | 10.0 |
min | 2000.000000 | 1990.00000 | 50.0 |
25% | 2002.000000 | 1991.00000 | 55.0 |
50% | 2002.000000 | 1991.00000 | 60.0 |
75% | 2003.000000 | 1992.00000 | 65.0 |
max | 2003.000000 | 1992.00000 | 70.0 |
df2.sum()
state ohiooregontexasoregontexas pop 10010 year 9956 rainfall 180 dtype: object
df2.sum(axis = 1)
0 4040.0 1 3993.0 2 4055.0 3 3993.0 4 4065.0 dtype: float64
df2.mean()
pop 2002.0 year 1991.2 rainfall 60.0 dtype: float64
df2.min()
state ohio pop 2000 year 1990 rainfall 50 dtype: object
df2.max()
state texas pop 2003 year 1992 rainfall 70 dtype: object
df2.rainfall.median()
60.0