Pandas is a Python library built on top of NumPy that is used for cleaning, analysing and visualising data.
import pandas as pd
import numpy as np
Pandas' Series
is a one-dimensional labelled array built on top of NumPy's ndarray
(note: in modern pandas, Series no longer subclasses ndarray). The key to using Series is to understand how to use its index.
# Create a Series with auto-generated indices
pd.Series(data=[100, 101, 110, 111], dtype=np.int8)
0 100 1 101 2 110 3 111 dtype: int8
# Create a Series with custom indices
pd.Series(data=[100, 101, 110, 111], index=['a', 'b', 'c', 'd'], dtype=np.int8)
a 100 b 101 c 110 d 111 dtype: int8
# Create a Series using a dictionary
d = {'a' : 100, 'b': 101, 'c': 110, 'd': 111}
pd.Series(data=d, dtype=np.int8)
a 100 b 101 c 110 d 111 dtype: int8
day1 = pd.Series(data=[400, 600, 400], index=['breakfast', 'lunch', 'dinner'], dtype=np.int16)
day1
breakfast 400 lunch 600 dinner 400 dtype: int16
day2 = pd.Series(data=[350, 500, 150], index=['breakfast', 'lunch', 'snack'], dtype=np.int16)
day2
breakfast 350 lunch 500 snack 150 dtype: int16
# Note that only values of matched indices are added together.
day1 + day2
breakfast 750.0 dinner NaN lunch 1100.0 snack NaN dtype: float64
A DataFrame is a container for tabular data. Basically, a DataFrame is just a collection of Series that share the same index.
def init_df():
    """Return a fresh 4x4 demo DataFrame.

    The values 1..16 are laid out row by row, with row labels
    'w'..'z' and column labels 'A'..'D'.
    """
    values = np.arange(1, 17).reshape(4, 4)
    return pd.DataFrame(
        data=values,
        index=list('wxyz'),
        columns=list('ABCD'),
    )
df = init_df()
df
A | B | C | D | |
---|---|---|---|---|
w | 1 | 2 | 3 | 4 |
x | 5 | 6 | 7 | 8 |
y | 9 | 10 | 11 | 12 |
z | 13 | 14 | 15 | 16 |
# Create a new column based on another column
df['E'] = df['A'] ** 2
df
A | B | C | D | E | |
---|---|---|---|---|---|
w | 1 | 2 | 3 | 4 | 1 |
x | 5 | 6 | 7 | 8 | 25 |
y | 9 | 10 | 11 | 12 | 81 |
z | 13 | 14 | 15 | 16 | 169 |
# Create a new DataFrame, where certain columns are excluded.
df.drop(['A', 'E'], axis=1)
B | C | D | |
---|---|---|---|
w | 2 | 3 | 4 |
x | 6 | 7 | 8 |
y | 10 | 11 | 12 |
z | 14 | 15 | 16 |
# Remove a column permanently
df.drop('E', axis=1, inplace=True)
df
A | B | C | D | |
---|---|---|---|---|
w | 1 | 2 | 3 | 4 |
x | 5 | 6 | 7 | 8 |
y | 9 | 10 | 11 | 12 |
z | 13 | 14 | 15 | 16 |
# Select column 'A'
df['A']
w 1 x 5 y 9 z 13 Name: A, dtype: int32
# Note that all columns are stored as Series objects
type(df['A'])
pandas.core.series.Series
# Selecting multiple columns, we get a new DataFrame object
df[['A', 'D']]
A | D | |
---|---|---|
w | 1 | 4 |
x | 5 | 8 |
y | 9 | 12 |
z | 13 | 16 |
# Select a row by its label
df.loc['x']
A 5 B 6 C 7 D 8 Name: x, dtype: int32
# Select a row by its numerical index position
df.iloc[0]
A 1 B 2 C 3 D 4 Name: w, dtype: int32
# Select the value of the first cell
df.loc['w', 'A']
1
# Select a subset of the DataFrame
df.loc[['x', 'y'], ['B', 'C']]
B | C | |
---|---|---|
x | 6 | 7 |
y | 10 | 11 |
# Conditional selection
df[df > 10]
A | B | C | D | |
---|---|---|---|---|
w | NaN | NaN | NaN | NaN |
x | NaN | NaN | NaN | NaN |
y | NaN | NaN | 11.0 | 12.0 |
z | 13.0 | 14.0 | 15.0 | 16.0 |
# Note that the conditional selection only
# returns cells whose boolean value is True
# in the following DataFrame
df > 10
A | B | C | D | |
---|---|---|---|---|
w | False | False | False | False |
x | False | False | False | False |
y | False | False | True | True |
z | True | True | True | True |
# Select the rows where column A is larger or equal to 9
df[df['A'] >= 9]
A | B | C | D | |
---|---|---|---|---|
y | 9 | 10 | 11 | 12 |
z | 13 | 14 | 15 | 16 |
# Note that we use `&` as conjunction since Python's `and` operator
# can only deal with single Boolean values e.g. `True and True`
df[(df['A'] >= 9) & (df['C'] == 11)]
A | B | C | D | |
---|---|---|---|---|
y | 9 | 10 | 11 | 12 |
df[(df['A'] >= 9) | (df['C'] == 3)]
A | B | C | D | |
---|---|---|---|---|
w | 1 | 2 | 3 | 4 |
y | 9 | 10 | 11 | 12 |
z | 13 | 14 | 15 | 16 |
# Reset the index to a numerical value
# Note that the old index will become
# a column in our DataFrame.
df.reset_index()
index | A | B | C | D | |
---|---|---|---|---|---|
0 | w | 1 | 2 | 3 | 4 |
1 | x | 5 | 6 | 7 | 8 |
2 | y | 9 | 10 | 11 | 12 |
3 | z | 13 | 14 | 15 | 16 |
# Set a new index.
df['Country'] = 'CA DE DK NO'.split()
df.set_index('Country')
# To override the old index, use the following line instead:
# df.set_index('Country', inplace=True)
A | B | C | D | |
---|---|---|---|---|
Country | ||||
CA | 1 | 2 | 3 | 4 |
DE | 5 | 6 | 7 | 8 |
DK | 9 | 10 | 11 | 12 |
NO | 13 | 14 | 15 | 16 |
outside = 'p p p q q q'.split()
inside = [1, 2, 3, 1, 2, 3]
hierarchical_index = list(zip(outside, inside))
multi_index = pd.MultiIndex.from_tuples(hierarchical_index, names='outside inside'.split())
multi_index
MultiIndex(levels=[['p', 'q'], [1, 2, 3]], labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], names=['outside', 'inside'])
df = pd.DataFrame(data=np.random.randn(6,2), index=multi_index, columns=['Column 1', 'Column 2'])
df
Column 1 | Column 2 | ||
---|---|---|---|
outside | inside | ||
p | 1 | 1.982499 | 0.240303 |
2 | -0.744630 | -0.233123 | |
3 | -0.127521 | -0.899706 | |
q | 1 | -0.978671 | 2.158961 |
2 | 0.476506 | 0.681723 | |
3 | -1.201109 | 0.632565 |
# Select using the outer index
df.loc['p']
Column 1 | Column 2 | |
---|---|---|
inside | ||
1 | 1.982499 | 0.240303 |
2 | -0.744630 | -0.233123 |
3 | -0.127521 | -0.899706 |
# Select using the inside index
df.loc['p'].loc[2]
Column 1 -0.744630 Column 2 -0.233123 Name: 2, dtype: float64
# Select a specific cell
df.loc['p'].loc[2]['Column 1']
-0.74462953774755614
# Rename index names
df.index.names = ['O', 'I']
df
Column 1 | Column 2 | ||
---|---|---|---|
O | I | ||
p | 1 | 1.982499 | 0.240303 |
2 | -0.744630 | -0.233123 | |
3 | -0.127521 | -0.899706 | |
q | 1 | -0.978671 | 2.158961 |
2 | 0.476506 | 0.681723 | |
3 | -1.201109 | 0.632565 |
Cross section is used when we need to select data at a particular level.
# Select rows whose inside index is equal 1
df.xs(1, level='I')
Column 1 | Column 2 | |
---|---|---|
O | ||
p | 1.982499 | 0.240303 |
q | -0.978671 | 2.158961 |
d = {'A': [1, 2, np.nan], 'B': [1, np.nan, np.nan], 'C': [1, 2, 3]}
df = pd.DataFrame(d)
df
A | B | C | |
---|---|---|---|
0 | 1.0 | 1.0 | 1 |
1 | 2.0 | NaN | 2 |
2 | NaN | NaN | 3 |
# Drop any rows with missing values
df.dropna()
A | B | C | |
---|---|---|---|
0 | 1.0 | 1.0 | 1 |
# Keep only the rows with at least 2 non-na values:
df.dropna(thresh=2)
A | B | C | |
---|---|---|---|
0 | 1.0 | 1.0 | 1 |
1 | 2.0 | NaN | 2 |
The subset
parameter can be used to specify which columns an action should apply to instead of all columns. For instance, when dropping rows with missing values, subset
specifies the list of columns to consider.
For instance, df.dropna(thresh=1, subset=['A','B'])
will drop all rows that have fewer than 1 non-NA value when only columns A and B are considered (rather than counting non-NA values across all columns).
The line df.dropna(how='all', subset=['A','B'])
will drop all rows whose values are all NA in columns A and B.
# Drop any columns with missing values
df.dropna(axis=1)
C | |
---|---|
0 | 1 |
1 | 2 |
2 | 3 |
# Replace missing values
df.fillna(0)
A | B | C | |
---|---|---|---|
0 | 1.0 | 1.0 | 1 |
1 | 2.0 | 0.0 | 2 |
2 | 0.0 | 0.0 | 3 |
# Replace missing values with the mean of the column
df['A'].fillna(value=df['A'].mean())
0 1.0 1 2.0 2 1.5 Name: A, dtype: float64
columns = 'Id EmployeeName JobTitle TotalPay Year'.split()
salaries_df = pd.read_csv('data/sf-salaries-subset.csv', index_col='Id', usecols=columns)
salaries_df.head()
EmployeeName | JobTitle | TotalPay | Year | |
---|---|---|---|---|
Id | ||||
1 | NATHANIEL FORD | GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY | 567595.43 | 2011 |
2 | GARY JIMENEZ | CAPTAIN III (POLICE DEPARTMENT) | 538909.28 | 2011 |
3 | ALBERT PARDINI | CAPTAIN III (POLICE DEPARTMENT) | 335279.91 | 2011 |
4 | CHRISTOPHER CHONG | WIRE ROPE CABLE MAINTENANCE MECHANIC | 332343.61 | 2011 |
5 | PATRICK GARDNER | DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT) | 326373.19 | 2011 |
# Group by job title
salaries_by_job_df = salaries_df.groupby('JobTitle')
# Get some statistics on the TotalPay column
salaries_by_job_df['TotalPay'].describe()
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
JobTitle | ||||||||
ASSISTANT CHIEF OF DEPARTMENT, (FIRE DEPARTMENT) | 1.0 | 299494.1700 | NaN | 299494.17 | 299494.1700 | 299494.170 | 299494.1700 | 299494.17 |
ASSISTANT DEPUTY CHIEF II | 1.0 | 316285.7400 | NaN | 316285.74 | 316285.7400 | 316285.740 | 316285.7400 | 316285.74 |
ASSISTANT MEDICAL EXAMINER | 1.0 | 274550.2500 | NaN | 274550.25 | 274550.2500 | 274550.250 | 274550.2500 | 274550.25 |
BATTALION CHIEF, (FIRE DEPARTMENT) | 4.0 | 295547.4675 | 17602.943941 | 276434.22 | 283868.8425 | 294887.300 | 306565.9250 | 315981.05 |
CAPTAIN III (POLICE DEPARTMENT) | 3.0 | 390599.3700 | 129813.897272 | 297608.92 | 316444.4150 | 335279.910 | 437094.5950 | 538909.28 |
CAPTAIN, EMERGENCYCY MEDICAL SERVICES | 1.0 | 278569.2100 | NaN | 278569.21 | 278569.2100 | 278569.210 | 278569.2100 | 278569.21 |
CHIEF OF DEPARTMENT, (FIRE DEPARTMENT) | 1.0 | 302377.7300 | NaN | 302377.73 | 302377.7300 | 302377.730 | 302377.7300 | 302377.73 |
COMMANDER III, (POLICE DEPARTMENT) | 1.0 | 286213.8600 | NaN | 286213.86 | 286213.8600 | 286213.860 | 286213.8600 | 286213.86 |
DEPARTMENT HEAD V | 2.0 | 288696.0250 | 5622.340368 | 284720.43 | 286708.2275 | 288696.025 | 290683.8225 | 292671.62 |
DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT) | 1.0 | 326373.1900 | NaN | 326373.19 | 326373.1900 | 326373.190 | 326373.1900 | 326373.19 |
DEPUTY DIRECTOR OF INVESTMENTS | 1.0 | 307899.4600 | NaN | 307899.46 | 307899.4600 | 307899.460 | 307899.4600 | 307899.46 |
EXECUTIVE CONTRACT EMPLOYEE | 1.0 | 294580.0200 | NaN | 294580.02 | 294580.0200 | 294580.020 | 294580.0200 | 294580.02 |
GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY | 1.0 | 567595.4300 | NaN | 567595.43 | 567595.4300 | 567595.430 | 567595.4300 | 567595.43 |
WIRE ROPE CABLE MAINTENANCE MECHANIC | 1.0 | 332343.6100 | NaN | 332343.61 | 332343.6100 | 332343.610 | 332343.6100 | 332343.61 |
# Get some statistics on all numeric columns
salaries_by_job_df.describe()
TotalPay | Year | |||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
JobTitle | ||||||||||||||||
ASSISTANT CHIEF OF DEPARTMENT, (FIRE DEPARTMENT) | 1.0 | 299494.1700 | NaN | 299494.17 | 299494.1700 | 299494.170 | 299494.1700 | 299494.17 | 1.0 | 2011.0 | NaN | 2011.0 | 2011.0 | 2011.0 | 2011.0 | 2011.0 |
ASSISTANT DEPUTY CHIEF II | 1.0 | 316285.7400 | NaN | 316285.74 | 316285.7400 | 316285.740 | 316285.7400 | 316285.74 | 1.0 | 2011.0 | NaN | 2011.0 | 2011.0 | 2011.0 | 2011.0 | 2011.0 |
ASSISTANT MEDICAL EXAMINER | 1.0 | 274550.2500 | NaN | 274550.25 | 274550.2500 | 274550.250 | 274550.2500 | 274550.25 | 1.0 | 2011.0 | NaN | 2011.0 | 2011.0 | 2011.0 | 2011.0 | 2011.0 |
BATTALION CHIEF, (FIRE DEPARTMENT) | 4.0 | 295547.4675 | 17602.943941 | 276434.22 | 283868.8425 | 294887.300 | 306565.9250 | 315981.05 | 4.0 | 2011.0 | 0.0 | 2011.0 | 2011.0 | 2011.0 | 2011.0 | 2011.0 |
CAPTAIN III (POLICE DEPARTMENT) | 3.0 | 390599.3700 | 129813.897272 | 297608.92 | 316444.4150 | 335279.910 | 437094.5950 | 538909.28 | 3.0 | 2011.0 | 0.0 | 2011.0 | 2011.0 | 2011.0 | 2011.0 | 2011.0 |
CAPTAIN, EMERGENCYCY MEDICAL SERVICES | 1.0 | 278569.2100 | NaN | 278569.21 | 278569.2100 | 278569.210 | 278569.2100 | 278569.21 | 1.0 | 2011.0 | NaN | 2011.0 | 2011.0 | 2011.0 | 2011.0 | 2011.0 |
CHIEF OF DEPARTMENT, (FIRE DEPARTMENT) | 1.0 | 302377.7300 | NaN | 302377.73 | 302377.7300 | 302377.730 | 302377.7300 | 302377.73 | 1.0 | 2011.0 | NaN | 2011.0 | 2011.0 | 2011.0 | 2011.0 | 2011.0 |
COMMANDER III, (POLICE DEPARTMENT) | 1.0 | 286213.8600 | NaN | 286213.86 | 286213.8600 | 286213.860 | 286213.8600 | 286213.86 | 1.0 | 2011.0 | NaN | 2011.0 | 2011.0 | 2011.0 | 2011.0 | 2011.0 |
DEPARTMENT HEAD V | 2.0 | 288696.0250 | 5622.340368 | 284720.43 | 286708.2275 | 288696.025 | 290683.8225 | 292671.62 | 2.0 | 2011.0 | 0.0 | 2011.0 | 2011.0 | 2011.0 | 2011.0 | 2011.0 |
DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT) | 1.0 | 326373.1900 | NaN | 326373.19 | 326373.1900 | 326373.190 | 326373.1900 | 326373.19 | 1.0 | 2011.0 | NaN | 2011.0 | 2011.0 | 2011.0 | 2011.0 | 2011.0 |
DEPUTY DIRECTOR OF INVESTMENTS | 1.0 | 307899.4600 | NaN | 307899.46 | 307899.4600 | 307899.460 | 307899.4600 | 307899.46 | 1.0 | 2011.0 | NaN | 2011.0 | 2011.0 | 2011.0 | 2011.0 | 2011.0 |
EXECUTIVE CONTRACT EMPLOYEE | 1.0 | 294580.0200 | NaN | 294580.02 | 294580.0200 | 294580.020 | 294580.0200 | 294580.02 | 1.0 | 2011.0 | NaN | 2011.0 | 2011.0 | 2011.0 | 2011.0 | 2011.0 |
GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY | 1.0 | 567595.4300 | NaN | 567595.43 | 567595.4300 | 567595.430 | 567595.4300 | 567595.43 | 1.0 | 2011.0 | NaN | 2011.0 | 2011.0 | 2011.0 | 2011.0 | 2011.0 |
WIRE ROPE CABLE MAINTENANCE MECHANIC | 1.0 | 332343.6100 | NaN | 332343.61 | 332343.6100 | 332343.610 | 332343.6100 | 332343.61 | 1.0 | 2011.0 | NaN | 2011.0 | 2011.0 | 2011.0 | 2011.0 | 2011.0 |
# Present statistics in a different way
salaries_by_job_df.describe().transpose()
JobTitle | ASSISTANT CHIEF OF DEPARTMENT, (FIRE DEPARTMENT) | ASSISTANT DEPUTY CHIEF II | ASSISTANT MEDICAL EXAMINER | BATTALION CHIEF, (FIRE DEPARTMENT) | CAPTAIN III (POLICE DEPARTMENT) | CAPTAIN, EMERGENCYCY MEDICAL SERVICES | CHIEF OF DEPARTMENT, (FIRE DEPARTMENT) | COMMANDER III, (POLICE DEPARTMENT) | DEPARTMENT HEAD V | DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT) | DEPUTY DIRECTOR OF INVESTMENTS | EXECUTIVE CONTRACT EMPLOYEE | GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY | WIRE ROPE CABLE MAINTENANCE MECHANIC | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
TotalPay | count | 1.00 | 1.00 | 1.00 | 4.000000 | 3.000000 | 1.00 | 1.00 | 1.00 | 2.000000 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 |
mean | 299494.17 | 316285.74 | 274550.25 | 295547.467500 | 390599.370000 | 278569.21 | 302377.73 | 286213.86 | 288696.025000 | 326373.19 | 307899.46 | 294580.02 | 567595.43 | 332343.61 | |
std | NaN | NaN | NaN | 17602.943941 | 129813.897272 | NaN | NaN | NaN | 5622.340368 | NaN | NaN | NaN | NaN | NaN | |
min | 299494.17 | 316285.74 | 274550.25 | 276434.220000 | 297608.920000 | 278569.21 | 302377.73 | 286213.86 | 284720.430000 | 326373.19 | 307899.46 | 294580.02 | 567595.43 | 332343.61 | |
25% | 299494.17 | 316285.74 | 274550.25 | 283868.842500 | 316444.415000 | 278569.21 | 302377.73 | 286213.86 | 286708.227500 | 326373.19 | 307899.46 | 294580.02 | 567595.43 | 332343.61 | |
50% | 299494.17 | 316285.74 | 274550.25 | 294887.300000 | 335279.910000 | 278569.21 | 302377.73 | 286213.86 | 288696.025000 | 326373.19 | 307899.46 | 294580.02 | 567595.43 | 332343.61 | |
75% | 299494.17 | 316285.74 | 274550.25 | 306565.925000 | 437094.595000 | 278569.21 | 302377.73 | 286213.86 | 290683.822500 | 326373.19 | 307899.46 | 294580.02 | 567595.43 | 332343.61 | |
max | 299494.17 | 316285.74 | 274550.25 | 315981.050000 | 538909.280000 | 278569.21 | 302377.73 | 286213.86 | 292671.620000 | 326373.19 | 307899.46 | 294580.02 | 567595.43 | 332343.61 | |
Year | count | 1.00 | 1.00 | 1.00 | 4.000000 | 3.000000 | 1.00 | 1.00 | 1.00 | 2.000000 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 |
mean | 2011.00 | 2011.00 | 2011.00 | 2011.000000 | 2011.000000 | 2011.00 | 2011.00 | 2011.00 | 2011.000000 | 2011.00 | 2011.00 | 2011.00 | 2011.00 | 2011.00 | |
std | NaN | NaN | NaN | 0.000000 | 0.000000 | NaN | NaN | NaN | 0.000000 | NaN | NaN | NaN | NaN | NaN | |
min | 2011.00 | 2011.00 | 2011.00 | 2011.000000 | 2011.000000 | 2011.00 | 2011.00 | 2011.00 | 2011.000000 | 2011.00 | 2011.00 | 2011.00 | 2011.00 | 2011.00 | |
25% | 2011.00 | 2011.00 | 2011.00 | 2011.000000 | 2011.000000 | 2011.00 | 2011.00 | 2011.00 | 2011.000000 | 2011.00 | 2011.00 | 2011.00 | 2011.00 | 2011.00 | |
50% | 2011.00 | 2011.00 | 2011.00 | 2011.000000 | 2011.000000 | 2011.00 | 2011.00 | 2011.00 | 2011.000000 | 2011.00 | 2011.00 | 2011.00 | 2011.00 | 2011.00 | |
75% | 2011.00 | 2011.00 | 2011.00 | 2011.000000 | 2011.000000 | 2011.00 | 2011.00 | 2011.00 | 2011.000000 | 2011.00 | 2011.00 | 2011.00 | 2011.00 | 2011.00 | |
max | 2011.00 | 2011.00 | 2011.00 | 2011.000000 | 2011.000000 | 2011.00 | 2011.00 | 2011.00 | 2011.000000 | 2011.00 | 2011.00 | 2011.00 | 2011.00 | 2011.00 |
# Count number of rows in each group
salaries_by_job_df.count()
EmployeeName | TotalPay | Year | |
---|---|---|---|
JobTitle | |||
ASSISTANT CHIEF OF DEPARTMENT, (FIRE DEPARTMENT) | 1 | 1 | 1 |
ASSISTANT DEPUTY CHIEF II | 1 | 1 | 1 |
ASSISTANT MEDICAL EXAMINER | 1 | 1 | 1 |
BATTALION CHIEF, (FIRE DEPARTMENT) | 4 | 4 | 4 |
CAPTAIN III (POLICE DEPARTMENT) | 3 | 3 | 3 |
CAPTAIN, EMERGENCYCY MEDICAL SERVICES | 1 | 1 | 1 |
CHIEF OF DEPARTMENT, (FIRE DEPARTMENT) | 1 | 1 | 1 |
COMMANDER III, (POLICE DEPARTMENT) | 1 | 1 | 1 |
DEPARTMENT HEAD V | 2 | 2 | 2 |
DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT) | 1 | 1 | 1 |
DEPUTY DIRECTOR OF INVESTMENTS | 1 | 1 | 1 |
EXECUTIVE CONTRACT EMPLOYEE | 1 | 1 | 1 |
GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY | 1 | 1 | 1 |
WIRE ROPE CABLE MAINTENANCE MECHANIC | 1 | 1 | 1 |
# Find the mean of numeric columns
# (in newer pandas versions, pass numeric_only=True to skip non-numeric columns)
salaries_by_job_df.mean()
TotalPay | Year | |
---|---|---|
JobTitle | ||
ASSISTANT CHIEF OF DEPARTMENT, (FIRE DEPARTMENT) | 299494.1700 | 2011 |
ASSISTANT DEPUTY CHIEF II | 316285.7400 | 2011 |
ASSISTANT MEDICAL EXAMINER | 274550.2500 | 2011 |
BATTALION CHIEF, (FIRE DEPARTMENT) | 295547.4675 | 2011 |
CAPTAIN III (POLICE DEPARTMENT) | 390599.3700 | 2011 |
CAPTAIN, EMERGENCYCY MEDICAL SERVICES | 278569.2100 | 2011 |
CHIEF OF DEPARTMENT, (FIRE DEPARTMENT) | 302377.7300 | 2011 |
COMMANDER III, (POLICE DEPARTMENT) | 286213.8600 | 2011 |
DEPARTMENT HEAD V | 288696.0250 | 2011 |
DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT) | 326373.1900 | 2011 |
DEPUTY DIRECTOR OF INVESTMENTS | 307899.4600 | 2011 |
EXECUTIVE CONTRACT EMPLOYEE | 294580.0200 | 2011 |
GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY | 567595.4300 | 2011 |
WIRE ROPE CABLE MAINTENANCE MECHANIC | 332343.6100 | 2011 |
# Get the highest pay
salaries_df['TotalPay'].max()
567595.43000000005
# Get the index label of the highest pay
# NOTE: Series.argmax historically returned the index *label* (here: Id 1),
# not the integer position; it is deprecated in favour of idxmax().
salaries_df['TotalPay'].argmax()
1
# Get the person with the highest pay
# BUG: argmax() returns the label 1, which iloc treats as *position* 1 —
# so this selects GARY JIMENEZ instead of the actual top earner
# (NATHANIEL FORD). Correct: salaries_df.loc[salaries_df['TotalPay'].idxmax()]
salaries_df.iloc[salaries_df['TotalPay'].argmax()]
EmployeeName GARY JIMENEZ JobTitle CAPTAIN III (POLICE DEPARTMENT) TotalPay 538909 Year 2011 Name: 2, dtype: object
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
'B': ['B8', 'B9', 'B10', 'B11'],
'C': ['C8', 'C9', 'C10', 'C11'],
'D': ['D8', 'D9', 'D10', 'D11']},
index=[8, 9, 10, 11])
# Combine along the rows
pd.concat([df1, df2, df3])
A | B | C | D | |
---|---|---|---|---|
0 | A0 | B0 | C0 | D0 |
1 | A1 | B1 | C1 | D1 |
2 | A2 | B2 | C2 | D2 |
3 | A3 | B3 | C3 | D3 |
4 | A4 | B4 | C4 | D4 |
5 | A5 | B5 | C5 | D5 |
6 | A6 | B6 | C6 | D6 |
7 | A7 | B7 | C7 | D7 |
8 | A8 | B8 | C8 | D8 |
9 | A9 | B9 | C9 | D9 |
10 | A10 | B10 | C10 | D10 |
11 | A11 | B11 | C11 | D11 |
# Combine along the columns
# Note that Pandas fills cells that do not align correctly with NaN
pd.concat([df1, df2, df3], axis=1)
A | B | C | D | A | B | C | D | A | B | C | D | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A0 | B0 | C0 | D0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | A1 | B1 | C1 | D1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | A2 | B2 | C2 | D2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | A3 | B3 | C3 | D3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | NaN | NaN | NaN | NaN | A4 | B4 | C4 | D4 | NaN | NaN | NaN | NaN |
5 | NaN | NaN | NaN | NaN | A5 | B5 | C5 | D5 | NaN | NaN | NaN | NaN |
6 | NaN | NaN | NaN | NaN | A6 | B6 | C6 | D6 | NaN | NaN | NaN | NaN |
7 | NaN | NaN | NaN | NaN | A7 | B7 | C7 | D7 | NaN | NaN | NaN | NaN |
8 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | A8 | B8 | C8 | D8 |
9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | A9 | B9 | C9 | D9 |
10 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | A10 | B10 | C10 | D10 |
11 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | A11 | B11 | C11 | D11 |
The merge
function is useful if we want to combine DataFrames like we join tables using SQL.
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
pd.merge(left, right, how='inner', on=['key1', 'key2'])
A | B | key1 | key2 | C | D | |
---|---|---|---|---|---|---|
0 | A0 | B0 | K0 | K0 | C0 | D0 |
1 | A2 | B2 | K1 | K0 | C1 | D1 |
2 | A2 | B2 | K1 | K0 | C2 | D2 |
The join
function is used to combine the columns of DataFrames that may have different indices. It works exactly like the merge
function except the keys that we join on are on the indices instead of the columns.
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
left.join(right)
A | B | C | D | |
---|---|---|---|---|
K0 | A0 | B0 | C0 | D0 |
K1 | A1 | B1 | NaN | NaN |
K2 | A2 | B2 | C2 | D2 |
left.join(right, how='outer')
A | B | C | D | |
---|---|---|---|---|
K0 | A0 | B0 | C0 | D0 |
K1 | A1 | B1 | NaN | NaN |
K2 | A2 | B2 | C2 | D2 |
K3 | NaN | NaN | C3 | D3 |
df = pd.DataFrame({'col1':[1,2,3,4],
'col2':[444,555,666,444],
'col3':['abc','def','ghi','xyz']})
df.head()
col1 | col2 | col3 | |
---|---|---|---|
0 | 1 | 444 | abc |
1 | 2 | 555 | def |
2 | 3 | 666 | ghi |
3 | 4 | 444 | xyz |
# Find the unique values in col2
df['col2'].unique()
array([444, 555, 666], dtype=int64)
# Find the number of unique values in col2
df['col2'].nunique()
3
# Find the unique values in col2
df['col2'].value_counts()
444 2 555 1 666 1 Name: col2, dtype: int64
# The value_counts() can be used to find top X row most common value
df['col2'].value_counts().head(1)
444 2 Name: col2, dtype: int64
# Apply custom function to each element of a column
df['col1'].apply(lambda element_value: element_value**2)
0 1 1 4 2 9 3 16 Name: col1, dtype: int64
# Find the names of all the columns in the DataFrame
df.columns
Index(['col1', 'col2', 'col3'], dtype='object')
# Sort data
df.sort_values(by='col2')
col1 | col2 | col3 | |
---|---|---|---|
0 | 1 | 444 | abc |
3 | 4 | 444 | xyz |
1 | 2 | 555 | def |
2 | 3 | 666 | ghi |
# Find null values
df.isnull()
col1 | col2 | col3 | |
---|---|---|---|
0 | False | False | False |
1 | False | False | False |
2 | False | False | False |
3 | False | False | False |
data = pd.read_html('https://borsen.dk/kurser/danske_aktier/c20_cap.html', thousands='.', decimal=',')
df = data[0]
# Show information about the data
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20 entries, 0 to 19 Data columns (total 8 columns): Aktie 20 non-null object % 20 non-null float64 +/- 20 non-null float64 Kurs 20 non-null float64 ÅTD% 20 non-null float64 Bud 20 non-null float64 Udbud 20 non-null float64 Omsætning 20 non-null float64 dtypes: float64(7), object(1) memory usage: 1.3+ KB
df.columns
Index(['\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tAktie\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t', '\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t%\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t', '\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\t+/-\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t', '\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tKurs\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t', '\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tÅTD%\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t', '\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tBud\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t', '\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tUdbud\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t', '\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\t\tOmsætning\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t'], dtype='object')
df.columns = ['Akie', '%', '+/-', 'Kurs', 'ATD%', 'Bud', 'Udbud', 'Omsætning']
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20 entries, 0 to 19 Data columns (total 8 columns): Akie 20 non-null object % 20 non-null float64 +/- 20 non-null float64 Kurs 20 non-null float64 ATD% 20 non-null float64 Bud 20 non-null float64 Udbud 20 non-null float64 Omsætning 20 non-null float64 dtypes: float64(7), object(1) memory usage: 1.3+ KB
df['Omsætning'][0]
82104014.530000001