import pandas as pd
import numpy as np
string_data = pd.Series(["aardvark", "artichoke", np.nan, "avocado"])
string_data
0 aardvark 1 artichoke 2 NaN 3 avocado dtype: object
# check for NAs i.e "not available" or NaN i.e "not a number"
string_data.isnull()
0 False 1 False 2 True 3 False dtype: bool
string_data[0] = None
string_data
0 None 1 artichoke 2 NaN 3 avocado dtype: object
string_data.dropna()
1 artichoke 3 avocado dtype: object
from numpy import nan as NA
df = pd.DataFrame([[1, 2, 3, 4, 5, 7],
[4, 5, 6, None, NA, 7],
[NA, NA, NA, NA, NA, NA],
[4, 5, NA, NA, None, 6]])
df
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 7.0 |
1 | 4.0 | 5.0 | 6.0 | NaN | NaN | 7.0 |
2 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 4.0 | 5.0 | NaN | NaN | NaN | 6.0 |
df.dropna()
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 7.0 |
df.dropna(how = "all")
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 7.0 |
1 | 4.0 | 5.0 | 6.0 | NaN | NaN | 7.0 |
3 | 4.0 | 5.0 | NaN | NaN | NaN | 6.0 |
df[6] = NA
df
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
---|---|---|---|---|---|---|---|
0 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 7.0 | NaN |
1 | 4.0 | 5.0 | 6.0 | NaN | NaN | 7.0 | NaN |
2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 4.0 | 5.0 | NaN | NaN | NaN | 6.0 | NaN |
df.dropna(axis = 1, how = "all")
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 7.0 |
1 | 4.0 | 5.0 | 6.0 | NaN | NaN | 7.0 |
2 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 4.0 | 5.0 | NaN | NaN | NaN | 6.0 |
df2 = df.fillna(0)
df2
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
---|---|---|---|---|---|---|---|
0 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 7.0 | 0.0 |
1 | 4.0 | 5.0 | 6.0 | 0.0 | 0.0 | 7.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 4.0 | 5.0 | 0.0 | 0.0 | 0.0 | 6.0 | 0.0 |
df3 = df.fillna(df.mean())
df3
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
---|---|---|---|---|---|---|---|
0 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 7.000000 | NaN |
1 | 4.0 | 5.0 | 6.0 | 4.0 | 5.0 | 7.000000 | NaN |
2 | 3.0 | 4.0 | 4.5 | 4.0 | 5.0 | 6.666667 | NaN |
3 | 4.0 | 5.0 | 4.5 | 4.0 | 5.0 | 6.000000 | NaN |
df = pd.DataFrame({"h1" : ["one", "two", "three"] * 3,
"h2" : [1, 2, 2, 1, 5, 5, 2, 2, 5]})
df
h1 | h2 | |
---|---|---|
0 | one | 1 |
1 | two | 2 |
2 | three | 2 |
3 | one | 1 |
4 | two | 5 |
5 | three | 5 |
6 | one | 2 |
7 | two | 2 |
8 | three | 5 |
df.duplicated()
0 False 1 False 2 False 3 True 4 False 5 False 6 False 7 True 8 True dtype: bool
df2 = df.drop_duplicates()
df2
h1 | h2 | |
---|---|---|
0 | one | 1 |
1 | two | 2 |
2 | three | 2 |
4 | two | 5 |
5 | three | 5 |
6 | one | 2 |
df2 = df["h1"].drop_duplicates()
df2
0 one 1 two 2 three Name: h1, dtype: object
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon','Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],
'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
food | ounces | |
---|---|---|
0 | bacon | 4.0 |
1 | pulled pork | 3.0 |
2 | bacon | 12.0 |
3 | Pastrami | 6.0 |
4 | corned beef | 7.5 |
5 | Bacon | 8.0 |
6 | pastrami | 3.0 |
7 | honey ham | 5.0 |
8 | nova lox | 6.0 |
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'}
lowered = data.food.str.lower()
lowered
0 bacon 1 pulled pork 2 bacon 3 pastrami 4 corned beef 5 bacon 6 pastrami 7 honey ham 8 nova lox Name: food, dtype: object
data["animal"] = lowered.map(meat_to_animal)
data
food | ounces | animal | |
---|---|---|---|
0 | bacon | 4.0 | pig |
1 | pulled pork | 3.0 | pig |
2 | bacon | 12.0 | pig |
3 | Pastrami | 6.0 | cow |
4 | corned beef | 7.5 | cow |
5 | Bacon | 8.0 | pig |
6 | pastrami | 3.0 | cow |
7 | honey ham | 5.0 | pig |
8 | nova lox | 6.0 | salmon |
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data
0 1.0 1 -999.0 2 2.0 3 -999.0 4 -1000.0 5 3.0 dtype: float64
data.replace(-999, np.nan)
0 1.0 1 NaN 2 2.0 3 NaN 4 -1000.0 5 3.0 dtype: float64
data.replace([-999, -1000], np.nan)
0 1.0 1 NaN 2 2.0 3 NaN 4 NaN 5 3.0 dtype: float64
data.replace({-999 : np.nan,
-1000: 0})
0 1.0 1 NaN 2 2.0 3 NaN 4 0.0 5 3.0 dtype: float64
df = pd.DataFrame(np.arange(start = 1, stop = 13, step = 1).reshape(3, 4))
df.index = ["ohio", "colorado", "ny"]
df.columns = ["one", "two", "three", "four"]
df
one | two | three | four | |
---|---|---|---|---|
ohio | 1 | 2 | 3 | 4 |
colorado | 5 | 6 | 7 | 8 |
ny | 9 | 10 | 11 | 12 |
to_upper = lambda x: x.upper()
df.index.map(to_upper)
Index(['OHIO', 'COLORADO', 'NY'], dtype='object')
df.index = df.index.map(to_upper)
df
one | two | three | four | |
---|---|---|---|---|
OHIO | 1 | 2 | 3 | 4 |
COLORADO | 5 | 6 | 7 | 8 |
NY | 9 | 10 | 11 | 12 |
df.rename(index = str.upper, columns = str.title)
One | Two | Three | Four | |
---|---|---|---|---|
OHIO | 1 | 2 | 3 | 4 |
COLORADO | 5 | 6 | 7 | 8 |
NY | 9 | 10 | 11 | 12 |
age = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
age
[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins =[18, 25, 35, 60, 100]
cats = pd.cut(age, bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]] Length: 12 Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats.categories
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]] closed='right', dtype='interval[int64]')
cats.codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
pd.value_counts(cats)
(18, 25] 5 (35, 60] 3 (25, 35] 3 (60, 100] 1 dtype: int64
df = pd.DataFrame(np.random.randn(1000, 4))
df.head()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | -1.060885 | 0.257677 | -0.439057 | -0.585580 |
1 | 0.992414 | 0.848633 | 0.279456 | 1.804831 |
2 | -0.465408 | -1.573807 | 0.313331 | 0.939957 |
3 | 0.247694 | -0.232562 | -0.763437 | 3.137660 |
4 | -0.455083 | 1.724922 | -0.467447 | 0.595734 |
df.describe()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | 0.024221 | -0.005284 | -0.007271 | 0.058116 |
std | 0.999047 | 0.981500 | 1.027020 | 0.993744 |
min | -3.060966 | -3.082180 | -3.907713 | -3.536450 |
25% | -0.651549 | -0.667142 | -0.709838 | -0.641517 |
50% | 0.028659 | 0.023977 | -0.028669 | 0.048662 |
75% | 0.683996 | 0.725258 | 0.675920 | 0.739192 |
max | 3.759436 | 2.994700 | 3.263231 | 3.197917 |
df[2][np.abs(df[2]) > 3]
145 -3.907713 971 3.263231 Name: 2, dtype: float64
df.sample(frac = 0.75).head()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
555 | -0.055178 | -0.704377 | 1.067797 | -0.264149 |
295 | -0.338811 | -0.589736 | -0.562751 | -0.357064 |
329 | 0.634184 | 0.472945 | -0.562836 | -0.461018 |
727 | -0.735358 | -2.652772 | -2.111481 | 0.254709 |
484 | 1.133932 | 0.770841 | 1.053564 | -1.719157 |
val = "a, b, guide"
val
'a, b, guide'
val.split(",")
['a', ' b', ' guide']
pieces = [x.strip() for x in val.split(",")]
pieces
['a', 'b', 'guide']
"__".join(pieces)
'a__b__guide'
val.count("a")
1
Flexible way to search or match string patterns
import re
text = "foo bar\t baz \tqux"
text
'foo bar\t baz \tqux'
re.split("\s+", text)
['foo', 'bar', 'baz', 'qux']
regex = re.compile("\s+")
regex.split(text)
['foo', 'bar', 'baz', 'qux']