import pandas as pd

# Read the flights CSV into a DataFrame.
# NOTE: use a raw string for Windows paths — in a plain string, "\P" and "\F"
# are treated as (invalid) escape sequences, which raises a SyntaxWarning on
# modern Python and would silently corrupt paths containing e.g. "\t" or "\n".
flights = pd.read_csv(r"D:\PythonPandas\FlightsNew.csv")
flights.head()
year | month | day | dep_time | dep_delay | arr_time | arr_delay | cancelled | carrier | tailnum | flight | origin | dest | air_time | distance | hour | min | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2014 | 1 | 1 | 914 | 14 | 1238 | 13 | 0 | AA | N338AA | 1 | JFK | LAX | 359 | 2475 | 9 | 14 |
1 | 2014 | 1 | 1 | 1157 | -3 | 1523 | 13 | 0 | AA | N335AA | 3 | JFK | LAX | 363 | 2475 | 11 | 57 |
2 | 2014 | 1 | 1 | 1902 | 2 | 2224 | 9 | 0 | AA | N327AA | 21 | JFK | LAX | 351 | 2475 | 19 | 2 |
3 | 2014 | 1 | 1 | 722 | -8 | 1014 | -26 | 0 | AA | N3EHAA | 29 | LGA | PBI | 157 | 1035 | 7 | 22 |
4 | 2014 | 1 | 1 | 1347 | 2 | 1706 | 1 | 0 | AA | N319AA | 117 | JFK | LAX | 350 | 2475 | 13 | 47 |
In this notebook, we will:
start with the basics — how to subset rows, select and compute on columns —
and then look at performing data aggregations by group.
# Rows departing from JFK in March. reset_index() keeps the original row
# labels in a new "index" column, exactly as the chained form did.
jfk_mask = flights["origin"].isin(["JFK"])
march_mask = flights["month"] == 3
ans = flights.loc[jfk_mask & march_mask].reset_index()
ans.head()
index | year | month | day | dep_time | dep_delay | arr_time | arr_delay | cancelled | carrier | tailnum | flight | origin | dest | air_time | distance | hour | min | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 43609 | 2014 | 3 | 1 | 849 | -11 | 1306 | 36 | 0 | AA | N784AA | 1 | JFK | LAX | 375 | 2475 | 8 | 49 |
1 | 43610 | 2014 | 3 | 1 | 1157 | -3 | 1529 | 14 | 0 | AA | N787AA | 3 | JFK | LAX | 368 | 2475 | 11 | 57 |
2 | 43611 | 2014 | 3 | 1 | 652 | -3 | 1019 | 4 | 0 | AA | N785AA | 9 | JFK | LAX | 366 | 2475 | 6 | 52 |
3 | 43612 | 2014 | 3 | 1 | 1117 | 47 | 1502 | 77 | 0 | AA | N788AA | 19 | JFK | LAX | 379 | 2475 | 11 | 17 |
4 | 43613 | 2014 | 3 | 1 | 1950 | 50 | 2321 | 66 | 0 | AA | N336AA | 21 | JFK | LAX | 350 | 2475 | 19 | 50 |
# Label-based slice: .loc[0:1] is INCLUSIVE of both endpoints, so this keeps
# the rows labelled 0 and 1 (the first two rows of the default RangeIndex).
flights_tworows = flights.loc[0:1, :]
flights_tworows.head()
year | month | day | dep_time | dep_delay | arr_time | arr_delay | cancelled | carrier | tailnum | flight | origin | dest | air_time | distance | hour | min | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2014 | 1 | 1 | 914 | 14 | 1238 | 13 | 0 | AA | N338AA | 1 | JFK | LAX | 359 | 2475 | 9 | 14 |
1 | 2014 | 1 | 1 | 1157 | -3 | 1523 | 13 | 0 | AA | N335AA | 3 | JFK | LAX | 363 | 2475 | 11 | 57 |
# Sort by origin ascending, then dest descending; reset_index() preserves the
# pre-sort row labels in a new "index" column.
# Use real booleans for `ascending` — the original passed [1, 0], which pandas
# happens to coerce, but the documented API expects bool / list of bool and
# the magic numbers obscure the sort direction.
flights_sort = flights.sort_values(["origin", "dest"], ascending=[True, False]).reset_index()
flights_sort.head()
index | year | month | day | dep_time | dep_delay | arr_time | arr_delay | cancelled | carrier | tailnum | flight | origin | dest | air_time | distance | hour | min | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2790 | 2014 | 1 | 5 | 836 | 6 | 1151 | 49 | 0 | EV | N12175 | 4419 | EWR | XNA | 195 | 1131 | 8 | 36 |
1 | 3351 | 2014 | 1 | 6 | 833 | 7 | 1111 | 13 | 0 | EV | N24128 | 4419 | EWR | XNA | 190 | 1131 | 8 | 33 |
2 | 4066 | 2014 | 1 | 7 | 811 | -6 | 1035 | -13 | 0 | EV | N12142 | 4419 | EWR | XNA | 179 | 1131 | 8 | 11 |
3 | 4825 | 2014 | 1 | 8 | 810 | -7 | 1036 | -12 | 0 | EV | N11193 | 4419 | EWR | XNA | 184 | 1131 | 8 | 10 |
4 | 5658 | 2014 | 1 | 9 | 833 | 16 | 1055 | 7 | 0 | EV | N14198 | 4419 | EWR | XNA | 181 | 1131 | 8 | 33 |
# Take an independent copy of the two delay columns so later edits to this
# frame cannot affect (or warn about) the parent `flights` DataFrame.
delay_cols = ["arr_delay", "dep_delay"]
arr_dep_delay = flights[delay_cols].copy()
arr_dep_delay.head()
arr_delay | dep_delay | |
---|---|---|
0 | 13 | 14 |
1 | 13 | -3 |
2 | 9 | 2 |
3 | -26 | -8 |
4 | 1 | 2 |
# Select the two delay columns and give them "delay_*" names. Driving the
# column selection from the rename mapping keeps the two lists in sync.
new_names = {"arr_delay": "delay_arr", "dep_delay": "delay_dep"}
delay_arr_dep = flights[list(new_names)].rename(columns=new_names)
delay_arr_dep.head()
delay_arr | delay_dep | |
---|---|---|
0 | 13 | 14 |
1 | 13 | -3 |
2 | 9 | 2 |
3 | -26 | -8 |
4 | 1 | 2 |
# Flag flights whose combined (arrival + departure) delay is negative,
# then count how many were flagged.
# Step 1: create a "delay" indicator column in `flights`, initialised to 0.
flights["delay"] = 0
# Step 2: set the flag to 1 where arr_delay + dep_delay < 0.
flights.loc[(flights["arr_delay"] + flights["dep_delay"]) < 0, "delay"] = 1
# Step 3: count the flagged rows. A single .loc with BOTH a row mask and a
# column label replaces the original chained indexing
# (flights.loc[mask]["delay"]), which materialises an intermediate frame and
# is the pattern behind SettingWithCopy warnings.
flights.loc[flights["delay"] == 1, "delay"].count()
141814
# Mean arrival/departure delay for JFK departures. One .loc call with a row
# mask and a column list replaces the original's chained indexing
# (.loc[mask][[cols]]), which built an intermediate DataFrame first.
flights.loc[flights["origin"].isin(["JFK"]), ["arr_delay", "dep_delay"]].mean()
arr_delay 7.731465 dep_delay 11.446167 dtype: float64
# Number of JFK flights in June. len() on the filtered frame replaces the
# original len(... ["origin"].tolist()), which built a throwaway Python list
# of every "origin" value just to measure its length.
len(flights.loc[flights["origin"].isin(["JFK"]) & (flights["month"] == 6)])
8422
# Trips per origin airport. The `aggregations` spec was defined but never
# used in the original (the same literal dict was repeated inline in .agg);
# passing the variable keeps the two in sync.
aggregations = {"origin": "count"}
flights.groupby(["origin"]).agg(aggregations).rename(columns={"origin": "number_trips"}).reset_index()
origin | number_trips | |
---|---|---|
0 | EWR | 87400 |
1 | JFK | 81483 |
2 | LGA | 84433 |
# Trips per origin airport for carrier AA, as a two-column frame.
aa_flights = flights.loc[flights["carrier"].isin(["AA"])]
per_origin = aa_flights.groupby(["origin"]).count().reset_index()
per_origin[["origin", "flight"]].rename(columns={"flight": "number_trips"})
origin | number_trips | |
---|---|---|
0 | EWR | 2649 |
1 | JFK | 11923 |
2 | LGA | 11730 |
# Trips per (origin, dest) pair for carrier AA.
aa_only = flights.loc[flights["carrier"].isin(["AA"])]
pair_counts = aa_only.groupby(["origin", "dest"]).count().reset_index()
pair_counts[["origin", "dest", "flight"]].rename(columns={"flight": "number_trips"}).head()
origin | dest | number_trips | |
---|---|---|---|
0 | EWR | DFW | 1618 |
1 | EWR | LAX | 62 |
2 | EWR | MIA | 848 |
3 | EWR | PHX | 121 |
4 | JFK | AUS | 297 |
# Mean arr/dep delay per (origin, dest, month) for carrier AA.
aa_subset = flights.loc[flights["carrier"].isin(["AA"])]
grouped = aa_subset.groupby(["origin", "dest", "month"])[["arr_delay", "dep_delay"]]
grouped.mean().reset_index().head()
origin | dest | month | arr_delay | dep_delay | |
---|---|---|---|---|---|
0 | EWR | DFW | 1 | 6.427673 | 10.012579 |
1 | EWR | DFW | 2 | 10.536765 | 11.345588 |
2 | EWR | DFW | 3 | 12.865031 | 8.079755 |
3 | EWR | DFW | 4 | 17.792683 | 12.920732 |
4 | EWR | DFW | 5 | 18.487805 | 18.682927 |
# Toy wide-format table: one row per family, one DOB column per child.
# NOTE(review): 'NA' and "" here are literal strings, NOT missing values —
# pandas will not treat them as NaN unless explicitly converted.
_columns = ["family_id", "age_mother", "dob_child1", "dob_child2", "dob_child3"]
family_data = pd.DataFrame(
    {
        "family_id": [1, 2, 3, 4, 5],
        "age_mother": [30, 27, 26, 32, 29],
        "dob_child1": ["1998-11-26", "1996-06-22", "2002-07-11", "2004-10-10", "2000-12-05"],
        "dob_child2": ["NA", "2004-04-05", "2004-04-05", "2009-08-27", "2005-02-28"],
        "dob_child3": ["NA", "NA", "2007-09-02", "2012-07-21", ""],
    }
)[_columns]
family_data
family_id | age_mother | dob_child1 | dob_child2 | dob_child3 | |
---|---|---|---|---|---|
0 | 1 | 30 | 1998-11-26 | NA | NA |
1 | 2 | 27 | 1996-06-22 | 2004-04-05 | NA |
2 | 3 | 26 | 2002-07-11 | 2004-04-05 | 2007-09-02 |
3 | 4 | 32 | 2004-10-10 | 2009-08-27 | 2012-07-21 |
4 | 5 | 29 | 2000-12-05 | 2005-02-28 |
# Reshape wide -> long: one row per (family, child) with the DOB in "dob".
# var_name takes a scalar when the columns are single-level; the original
# passed the one-element list ['dob_child'], which only works incidentally
# (lists are meant for MultiIndex columns).
family_melt = pd.melt(
    family_data,
    id_vars=["family_id", "age_mother"],
    value_vars=["dob_child1", "dob_child2", "dob_child3"],
    var_name="dob_child",
    value_name="dob",
)
family_melt
family_id | age_mother | dob_child | dob | |
---|---|---|---|---|
0 | 1 | 30 | dob_child1 | 1998-11-26 |
1 | 2 | 27 | dob_child1 | 1996-06-22 |
2 | 3 | 26 | dob_child1 | 2002-07-11 |
3 | 4 | 32 | dob_child1 | 2004-10-10 |
4 | 5 | 29 | dob_child1 | 2000-12-05 |
5 | 1 | 30 | dob_child2 | NA |
6 | 2 | 27 | dob_child2 | 2004-04-05 |
7 | 3 | 26 | dob_child2 | 2004-04-05 |
8 | 4 | 32 | dob_child2 | 2009-08-27 |
9 | 5 | 29 | dob_child2 | 2005-02-28 |
10 | 1 | 30 | dob_child3 | NA |
11 | 2 | 27 | dob_child3 | NA |
12 | 3 | 26 | dob_child3 | 2007-09-02 |
13 | 4 | 32 | dob_child3 | 2012-07-21 |
14 | 5 | 29 | dob_child3 |
# Reshape long -> wide (the pandas equivalent of data.table's dcast).
# aggfunc='first' just picks the single value per cell — each
# (family_id, age_mother, dob_child) combination occurs at most once here,
# so no real aggregation takes place.
# (Removed a stale commented-out set_index/unstack experiment.)
family_dcast = family_melt.pivot_table(
    index=["family_id", "age_mother"],
    columns="dob_child",
    values="dob",
    aggfunc="first",
)
family_dcast
dob_child | dob_child1 | dob_child2 | dob_child3 | |
---|---|---|---|---|
family_id | age_mother | |||
1 | 30 | 1998-11-26 | NA | NA |
2 | 27 | 1996-06-22 | 2004-04-05 | NA |
3 | 26 | 2002-07-11 | 2004-04-05 | 2007-09-02 |
4 | 32 | 2004-10-10 | 2009-08-27 | 2012-07-21 |
5 | 29 | 2000-12-05 | 2005-02-28 |
# Inspect the pivoted frame's column Index — note its .name is "dob_child",
# carried over from the melt step (removed in the next cell).
family_dcast.columns
Index(['dob_child1', 'dob_child2', 'dob_child3'], dtype='object', name='dob_child')
# Flatten back to a plain table: move the (family_id, age_mother) row
# MultiIndex into ordinary columns, then blank out the "dob_child" name on
# the columns axis.
flat = family_dcast.reset_index()
flat.rename_axis("", axis="columns")
family_id | age_mother | dob_child1 | dob_child2 | dob_child3 | |
---|---|---|---|---|---|
0 | 1 | 30 | 1998-11-26 | NA | NA |
1 | 2 | 27 | 1996-06-22 | 2004-04-05 | NA |
2 | 3 | 26 | 2002-07-11 | 2004-04-05 | 2007-09-02 |
3 | 4 | 32 | 2004-10-10 | 2009-08-27 | 2012-07-21 |
4 | 5 | 29 | 2000-12-05 | 2005-02-28 |