Updated: Using Pandas to Create an Excel Diff

This notebook updates the original Excel diff article to work with more recent versions of pandas, which no longer include the Panel data structure.

The new article can be read here.
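
If you want to confirm which pandas release you are running before following along, a quick check (not part of the original notebook) looks like this:

import pandas as pd

# The approach below uses multi-level columns instead of the removed pd.Panel,
# so any pandas release from 0.25 onwards should work
print(pd.__version__)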

In [1]:
import pandas as pd
In [2]:
# Define the diff function to show the changes in each field
def report_diff(x):
    return x[0] if x[0] == x[1] else '{} ---> {}'.format(*x)
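As a quick illustration with made-up values, the function passes matching entries through unchanged and annotates the ones that differ:

report_diff(['Iowa', 'Iowa'])   # -> 'Iowa'
report_diff(['Iowa', 'Ohio'])   # -> 'Iowa ---> Ohio'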
In [3]:
# Read in the two files, label the data old and new, and add a version column to track where each row came from
old = pd.read_excel('sample-address-1.xlsx', 'Sheet1', na_values=['NA'])
new = pd.read_excel('sample-address-2.xlsx', 'Sheet1', na_values=['NA'])
old['version'] = "old"
new['version'] = "new"
In [4]:
old.head()
Out[4]:
account number name street city state postal code version
0 935480 Bruen Group 5131 Nienow Viaduct Apt. 290 Port Arlie Alabama 14118 old
1 371770 Cruickshank-Boyer 839 Lana Expressway Suite 234 South Viviana Alabama 57838 old
2 548367 Spencer, Grady and Herman 65387 Lang Circle Apt. 516 Greenholtbury Alaska 58394 old
3 296620 Schamberger, Hagenes and Brown 26340 Ferry Neck Apt. 612 McCulloughstad Alaska 74052 old
4 132971 Williamson, Schumm and Hettinger 89403 Casimer Spring Jeremieburgh Arkansas 62785 old
In [5]:
new.head()
Out[5]:
account number name street city state postal code version
0 935480 Bruen Group 5131 Nienow Viaduct Apt. 290 Port Arlie Alabama 14118 new
1 371770 Cruickshank-Boyer 839 Lana Expressway Suite 234 South Viviana Alabama 57838 new
2 548367 Spencer, Grady and Herman 65387 Lang Circle Apt. 516 Greenholtbury Alaska 58394 new
3 132971 Williamson, Schumm and Hettinger 89403 Casimer Spring Jeremieburgh Arkansas 62785 new
4 985603 Bosco-Upton 03369 Moe Way Port Casandra Arkansas 86014 new
In [6]:
# We use the account numbers as the keys to check what is added, dropped and potentially changed
# Using sets makes the deduping easy and we can use set operations to figure out groupings
old_accts_all = set(old['account number'])
new_accts_all = set(new['account number'])

dropped_accts = old_accts_all - new_accts_all
added_accts = new_accts_all - old_accts_all
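
The set arithmetic is doing the real work here, so a toy example with made-up account numbers may make the intent clearer:

old_ids = {100, 200, 300}
new_ids = {200, 300, 400}
print(old_ids - new_ids)   # {100} -> only in the old file, i.e. dropped
print(new_ids - old_ids)   # {400} -> only in the new file, i.e. added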
In [7]:
# Join all the data together, ignoring the indexes so everything gets stacked into one DataFrame
all_data = pd.concat([old,new],ignore_index=True)
In [8]:
all_data.head()
Out[8]:
account number name street city state postal code version
0 935480 Bruen Group 5131 Nienow Viaduct Apt. 290 Port Arlie Alabama 14118 old
1 371770 Cruickshank-Boyer 839 Lana Expressway Suite 234 South Viviana Alabama 57838 old
2 548367 Spencer, Grady and Herman 65387 Lang Circle Apt. 516 Greenholtbury Alaska 58394 old
3 296620 Schamberger, Hagenes and Brown 26340 Ferry Neck Apt. 612 McCulloughstad Alaska 74052 old
4 132971 Williamson, Schumm and Hettinger 89403 Casimer Spring Jeremieburgh Arkansas 62785 old
In [9]:
# Let's see what changed in the main columns we care about
# Note the updated drop_duplicates syntax: keep='last'
changes = all_data.drop_duplicates(subset=["account number", 
                                           "name", "street", 
                                           "city","state", 
                                           "postal code"], keep='last')
In [10]:
changes.head()
Out[10]:
account number name street city state postal code version
3 296620 Schamberger, Hagenes and Brown 26340 Ferry Neck Apt. 612 McCulloughstad Alaska 74052 old
24 595932 Kuhic, Eichmann and West 4059 Tobias Inlet New Rylanfurt Illinois 89271 old
30 558879 Watsica Group 95616 Enos Grove Suite 139 West Atlas Iowa 47419 old
96 880043 Beatty Inc 3641 Schaefer Isle Suite 171 North Gardnertown Wyoming 64318 old
100 935480 Bruen Group 5131 Nienow Viaduct Apt. 290 Port Arlie Alabama 14118 new
In [11]:
# Get all the duplicate rows
dupe_accts = changes[changes['account number'].duplicated()]['account number'].tolist()
dupes = changes[changes["account number"].isin(dupe_accts)]
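If you prefer a one-liner, asking duplicated() to flag every copy with keep=False should produce the same dupes frame as the two-step version above:

dupes = changes[changes['account number'].duplicated(keep=False)]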
In [12]:
dupes
Out[12]:
account number name street city state postal code version
24 595932 Kuhic, Eichmann and West 4059 Tobias Inlet New Rylanfurt Illinois 89271 old
30 558879 Watsica Group 95616 Enos Grove Suite 139 West Atlas Iowa 47419 old
96 880043 Beatty Inc 3641 Schaefer Isle Suite 171 North Gardnertown Wyoming 64318 old
123 595932 Kuhic, Eichmann and West 4059 Tobias St New Rylanfurt Illinois 89271 new
129 558879 Watsica Group 829 Big street Smithtown Ohio 47919 new
195 880043 Beatty Inc 3641 Schaefer Isle Suite 171 North Gardnertown Wyoming 64918 new
In [13]:
# Pull out the old and new data into separate dataframes
change_new = dupes[(dupes["version"] == "new")]
change_old = dupes[(dupes["version"] == "old")]
In [14]:
# Drop the temp version column - we don't need it now
change_new = change_new.drop(['version'], axis=1)
change_old = change_old.drop(['version'], axis=1)
In [15]:
# Index on the account numbers
change_new.set_index('account number', inplace=True)
change_old.set_index('account number', inplace=True)
In [16]:
# Concatenate old and new side by side with a two-level column index
df_all_changes = pd.concat([change_old, change_new],
                           axis='columns',
                           keys=['old', 'new'],
                           join='outer')
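The keys argument is what builds the two-level column index that replaces the old Panel approach. A minimal sketch with throwaway data shows the shape of the result:

left = pd.DataFrame({'city': ['Austin']}, index=[1])
right = pd.DataFrame({'city': ['Dallas']}, index=[1])
pd.concat([left, right], axis='columns', keys=['old', 'new'])
#       old     new
#      city    city
# 1  Austin  Dallas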
In [17]:
df_all_changes
Out[17]:
old new
name street city state postal code name street city state postal code
account number
595932 Kuhic, Eichmann and West 4059 Tobias Inlet New Rylanfurt Illinois 89271 Kuhic, Eichmann and West 4059 Tobias St New Rylanfurt Illinois 89271
558879 Watsica Group 95616 Enos Grove Suite 139 West Atlas Iowa 47419 Watsica Group 829 Big street Smithtown Ohio 47919
880043 Beatty Inc 3641 Schaefer Isle Suite 171 North Gardnertown Wyoming 64318 Beatty Inc 3641 Schaefer Isle Suite 171 North Gardnertown Wyoming 64918
In [18]:
# Swap the column levels so each field's old and new values sit side by side,
# then reorder the columns to match the original field order
df_all_changes = df_all_changes.swaplevel(axis='columns')[change_new.columns[0:]]
In [19]:
df_all_changes
Out[19]:
name street city state postal code
old new old new old new old new old new
account number
595932 Kuhic, Eichmann and West Kuhic, Eichmann and West 4059 Tobias Inlet 4059 Tobias St New Rylanfurt New Rylanfurt Illinois Illinois 89271 89271
558879 Watsica Group Watsica Group 95616 Enos Grove Suite 139 829 Big street West Atlas Smithtown Iowa Ohio 47419 47919
880043 Beatty Inc Beatty Inc 3641 Schaefer Isle Suite 171 3641 Schaefer Isle Suite 171 North Gardnertown North Gardnertown Wyoming Wyoming 64318 64918
In [20]:
# Apply report_diff to each field's old/new pair to flag what changed
df_changed = df_all_changes.groupby(level=0, axis=1).apply(lambda frame: frame.apply(report_diff, axis=1))
df_changed = df_changed.reset_index()
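Recent pandas releases deprecate the axis argument of DataFrame.groupby. If you hit that warning, one way to express the same comparison (a sketch, not the original article's code) is to loop over the field names on the top column level yourself:

field_names = df_all_changes.columns.get_level_values(0).unique()
df_changed = pd.DataFrame({field: df_all_changes[field].apply(report_diff, axis=1)
                           for field in field_names})
df_changed = df_changed.reset_index()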
In [21]:
df_changed
Out[21]:
account number city name postal code state street
0 595932 New Rylanfurt Kuhic, Eichmann and West 89271 Illinois 4059 Tobias Inlet ---> 4059 Tobias St
1 558879 West Atlas ---> Smithtown Watsica Group 47419 ---> 47919 Iowa ---> Ohio 95616 Enos Grove Suite 139 ---> 829 Big street
2 880043 North Gardnertown Beatty Inc 64318 ---> 64918 Wyoming 3641 Schaefer Isle Suite 171
In [22]:
# Diffing is done; now we need lists of the removed and added items
In [23]:
df_removed = changes[changes["account number"].isin(dropped_accts)]
df_removed
Out[23]:
account number name street city state postal code version
3 296620 Schamberger, Hagenes and Brown 26340 Ferry Neck Apt. 612 McCulloughstad Alaska 74052 old
In [24]:
df_added = changes[changes["account number"].isin(added_accts)]
df_added
Out[24]:
account number name street city state postal code version
199 34777 MyCo 7833 Old Pine Drive Orlando Florida 32789 new
In [25]:
# Save the changes to Excel but only include the columns we care about
output_columns = ["account number", "name", "street", "city", "state", "postal code"]
writer = pd.ExcelWriter("my-diff.xlsx")
df_changed.to_excel(writer,"changed", index=False, columns=output_columns)
df_removed.to_excel(writer,"removed",index=False, columns=output_columns)
df_added.to_excel(writer,"added",index=False, columns=output_columns)
writer.save()
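Newer pandas releases deprecate (and eventually remove) ExcelWriter.save() in favor of close(). A version that sidesteps it, assuming an Excel engine such as openpyxl is installed, lets a context manager handle the saving:

with pd.ExcelWriter('my-diff.xlsx') as writer:
    df_changed.to_excel(writer, sheet_name='changed', index=False, columns=output_columns)
    df_removed.to_excel(writer, sheet_name='removed', index=False, columns=output_columns)
    df_added.to_excel(writer, sheet_name='added', index=False, columns=output_columns)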