#!/usr/bin/env python # coding: utf-8 # # Merging DataFrames # # ```pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None, # left_index=False, right_index=False, sort=True)``` # # ```merge``` and ```update``` have a lot in common. # # [There's a whole family of methods.](https://pandas.pydata.org/pandas-docs/stable/merging.html) # # I can understand where a busy IT worker, with the job of helping numerous IT workers, might get frustrated with the redundancy of it all. How many times must we reinvent the same wheels? That's the skeptic's view. # # The good news the same as the bad news: redundancy means generic fluency is possible. Some background in SQL helps with pandas. Some background in pandas helps with SQL. # In[ ]: import pandas as pd import numpy as np # In[ ]: dfA = pd.DataFrame({'A':[1,1,1,1,1,1,1,1], 'B':[2,2,2,2,2,2,2,2]}) # In[ ]: dfA # In[ ]: dfB = pd.DataFrame({'C':[3,3,3,3,3,3,3,3], 'D':[4,4,4,4,4,4,4,4]}) # You get to choose which columns, left and right, serve as "gear teeth" for synchronizing rows (sewing them together). Or choose the index, not a column. # # In the expression below, we go with the one synchronizing element: the index, on both input tables. # In[ ]: pd.merge(dfA, dfB, left_index=True, right_index=True) # In[ ]: import string dfA.index = list(string.ascii_lowercase[:8]) # new index, of letters instead # In[ ]: dfA # In[ ]: dfB.index = list(string.ascii_lowercase[5:8+5]) # overlapping letters # In[ ]: dfB # In[ ]: pd.merge(dfA, dfB, left_index=True, right_index=True) # intersection, not the union # In[ ]: pd.merge(dfA, dfB, left_index=True, right_index=True, how="left") # left side governs # In[ ]: pd.merge(dfA, dfB, left_index=True, right_index=True, how="right") # right side governs # In[ ]: pd.merge(dfA, dfB, left_index=True, right_index=True, how="outer") # the full union # In[ ]: pd.merge(dfA, dfB, left_index=True, right_index=True, how="inner") # same as intersection # In[ ]: dfA = pd.DataFrame({'A':[1,2,3,4,5,6,7,8], 'B':[2,2,2,2,2,2,2,2], 'key':['dog', 'pig', 'rooster', 'monkey', 'hen', 'cat', 'slug', 'human']}) # In[ ]: from numpy.random import shuffle keys = dfA.key.values.copy() # copy or dfA key will also reorder shuffle(keys) # in place dfB = pd.DataFrame({'C':[1,2,3,4,5,6,7,8], 'D':[4,4,4,4,4,4,4,4], 'key': keys}) keys # In[ ]: dfA # In[ ]: dfB # In[ ]: pd.merge(dfA, dfB, on='key') # like "zipping together" on a common column # In[ ]: dfB.rename({"C":"A", "D":"B"}, axis=1, inplace = True) dfB # In[ ]: dfA # In[ ]: pd.merge(dfA, dfB, on='key', sort=False) # sort on key if sort is True # In[ ]: pd.merge(dfA, dfB, on='key', sort=True) # sort on key if sort is True # In[ ]: pd.merge(dfA, dfB, on='A') # In[ ]: pd.merge(dfA, dfB, left_index=True, right_on="A") # In[ ]: merged = pd.merge(dfA, dfB, left_index=True, right_on="A") merged.to_json("merged.json") # save for later # ### LAB CHALLENGE # # How might one simply swap the axes? That's a hint.