#!/usr/bin/env python # coding: utf-8 # # Data Manupaltion using Python Pandas # ## The following operations in Pandas are done to achieve the same results shown in vignette of data.table package in R. # In[1]: import pandas as pd # ## 1. Basics # In[2]: # Read csv file in pandas: flights = pd.read_csv("D:\PythonPandas\FlightsNew.csv") flights.head() #

In this notebook, we will :

start with basics - how to subset rows, select and compute on columns

and then we will look at performing data aggregations by group.

# #### Get all the flights with “JFK” as the origin airport in the month of March. # In[3]: ans=flights.loc[(flights["origin"].isin(["JFK"]))&(flights["month"]==3)].reset_index() ans.head() # #### Get the first two rows from flights. # In[4]: flights_tworows=flights.loc[0:1] flights_tworows.head() # #### Sort flights first by column origin in ascending order, and then by dest in descending order: # In[5]: flights_sort=flights.sort_values(["origin","dest"],ascending=[1,0]).reset_index() flights_sort.head() # #### Select both arr_delay and dep_delay columns. # In[6]: arr_dep_delay=flights[["arr_delay","dep_delay"]].copy() arr_dep_delay.head() # #### Select both arr_delay and dep_delay columns and rename them to delay_arr and delay_dep. # In[7]: delay_arr_dep=flights[["arr_delay","dep_delay"]].rename(columns={"arr_delay":"delay_arr","dep_delay":"delay_dep"}) delay_arr_dep.head() # #### How many trips have had total delay < 0? # In[8]: #Step1: Create a column called "delay" in "flights" dataframe and assign 0 to it. flights["delay"]=0 #Step2: Check for cases where arr_delay + dep_delay is less than 0 flights.loc[(flights["arr_delay"]+flights["dep_delay"])<0,"delay"]=1 #Step3: Count the number of 1 in the delay column flights.loc[flights["delay"]==1]["delay"].count() # #### Calculate the average arrival and departure delay for all flights with “JFK” as the origin airport in the month of June. # In[9]: flights.loc[flights["origin"].isin(["JFK"])][["arr_delay","dep_delay"]].mean() # #### How many trips have been made in 2014 from “JFK” airport in the month of June? # In[10]: len(flights.loc[(flights["origin"].isin(["JFK"]))&(flights["month"]==6)]["origin"].tolist()) # ## 2. Aggregations # #### How can we get the number of trips corresponding to each origin airport? # In[11]: aggregations={"origin":"count"} # In[12]: flights.groupby(["origin"]).agg({"origin":"count"}).rename(columns={"origin":"number_trips"}).reset_index() # #### How can we calculate the number of trips for each origin airport for carrier code “AA”? # In[13]: flights.loc[flights["carrier"].isin(["AA"])].groupby(["origin"]).count().reset_index()[['origin','flight']].rename(columns={"flight":"number_trips"}) # #### How can we get the total number of trips for each origin, dest pair for carrier code “AA”? # In[14]: flights.loc[flights["carrier"].isin(["AA"])].groupby(["origin","dest"]).count().reset_index()[["origin","dest","flight"]].rename(columns={"flight":"number_trips"}).head() # #### How can we get the average arrival and departure delay for each orig,dest pair for each month for carrier code “AA”? # In[15]: flights.loc[flights["carrier"].isin(["AA"])].groupby(["origin","dest","month"])[["arr_delay","dep_delay"]].mean().reset_index().head() # ## 3. Melting Operation # #### Melting operation is converting from wide form to long form # In[16]: family_data=pd.DataFrame({'family_id':[1,2,3,4,5], 'age_mother':[30,27,26,32,29], 'dob_child1':['1998-11-26','1996-06-22','2002-07-11','2004-10-10','2000-12-05'], 'dob_child2':['NA','2004-04-05','2004-04-05','2009-08-27','2005-02-28'], 'dob_child3':['NA','NA','2007-09-02','2012-07-21', ""] })[['family_id','age_mother','dob_child1','dob_child2','dob_child3']] family_data # #### We need to melt the above dataframe # In[17]: family_melt=pd.melt(family_data, id_vars=['family_id','age_mother'], value_vars=['dob_child1','dob_child2','dob_child3'], var_name=['dob_child'], value_name='dob') family_melt # ## 4. Dcast operation # In[18]: #family_melt.set_index(['dob','family_id']).unstack('dob') family_dcast=family_melt.pivot_table(index=['family_id','age_mother'],columns="dob_child",values="dob",aggfunc='first') # In[19]: family_dcast # In[20]: family_dcast.columns # In[47]: family_dcast.reset_index().rename_axis("",axis="columns")