pandas.merge connects rows in DataFrames based on one or more keys.
pandas.concat glues or stacks objects together along an axis.
The combine_first instance method splices together overlapping data, filling
missing values in one object with values from another.
import pandas as pd
import numpy as np
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
'data2': range(3)})
print(df1, "\n\n", df2)
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   a
6      6   b

   data2 key
0      0   a
1      1   b
2      2   d
pd.merge(df1, df2)
| | data1 | key | data2 |
---|---|---|---|
0 | 0 | b | 1 |
1 | 1 | b | 1 |
2 | 6 | b | 1 |
3 | 2 | a | 0 |
4 | 4 | a | 0 |
5 | 5 | a | 0 |
pd.merge(df1, df2, on='key')
| | data1 | key | data2 |
---|---|---|---|
0 | 0 | b | 1 |
1 | 1 | b | 1 |
2 | 6 | b | 1 |
3 | 2 | a | 0 |
4 | 4 | a | 0 |
5 | 5 | a | 0 |
# if the key column names differ between the two DataFrames, specify each explicitly:
pd.merge(df1, df2, left_on='data1', right_on='data2')
| | data1 | key_x | data2 | key_y |
---|---|---|---|---|
0 | 0 | b | 0 | a |
1 | 1 | b | 1 | b |
2 | 2 | a | 2 | d |
# by default merge does an 'inner' join; how='outer' takes the union of the keys
pd.merge(df1, df2, how='outer')
| | data1 | key | data2 |
---|---|---|---|
0 | 0.0 | b | 1.0 |
1 | 1.0 | b | 1.0 |
2 | 6.0 | b | 1.0 |
3 | 2.0 | a | 0.0 |
4 | 4.0 | a | 0.0 |
5 | 5.0 | a | 0.0 |
6 | 3.0 | c | NaN |
7 | NaN | d | 2.0 |
pd.merge(df1, df2, how='left')
| | data1 | key | data2 |
---|---|---|---|
0 | 0 | b | 1.0 |
1 | 1 | b | 1.0 |
2 | 2 | a | 0.0 |
3 | 3 | c | NaN |
4 | 4 | a | 0.0 |
5 | 5 | a | 0.0 |
6 | 6 | b | 1.0 |
# To merge with multiple keys, pass a list of column names
pd.merge(df1, df2, left_on=['key', 'data1'], right_on=['key', 'data2'], how='outer')
| | data1 | key | data2 |
---|---|---|---|
0 | 0.0 | b | NaN |
1 | 1.0 | b | 1.0 |
2 | 2.0 | a | NaN |
3 | 3.0 | c | NaN |
4 | 4.0 | a | NaN |
5 | 5.0 | a | NaN |
6 | 6.0 | b | NaN |
7 | NaN | a | 0.0 |
8 | NaN | d | 2.0 |
# note how the overlapping 'key' column is disambiguated with default suffixes
pd.merge(df1, df2, left_on='data1', right_on='data2')
| | data1 | key_x | data2 | key_y |
---|---|---|---|---|
0 | 0 | b | 0 | a |
1 | 1 | b | 1 | b |
2 | 2 | a | 2 | d |
# to control the renaming of overlapping columns, use the suffixes option
pd.merge(df1, df2, left_on='data1', right_on='data2', suffixes=["_left", "_right"])
| | data1 | key_left | data2 | key_right |
---|---|---|---|---|
0 | 0 | b | 0 | a |
1 | 1 | b | 1 | b |
2 | 2 | a | 2 | d |
left DataFrame to be merged on the left side
right DataFrame to be merged on the right side
how One of 'inner', 'outer', 'left' or 'right'. 'inner' by default
on Column names to join on. Must be found in both DataFrame objects. If not specified and no other join keys given, will use the intersection of the column names in left and right as the join keys
left_on Columns in left DataFrame to use as join keys
right_on Analogous to left_on for the right DataFrame
left_index Use row index in left as its join key (or keys, if a MultiIndex)
right_index Analogous to left_index
sort Sort merged data lexicographically by join keys; False by default in current pandas (older versions defaulted to True). Sorting can hurt performance on large datasets
suffixes Tuple of string values to append to column names in case of overlap; defaults to ('_x', '_y'). For example, if 'data' in both DataFrame objects, would appear as 'data_x' and 'data_y' in result
copy If False, avoid copying data into resulting data structure in some exceptional cases. By default always copies
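To round out the how options above, a right join (not demonstrated earlier) keeps every key from the right table. A quick sketch reusing the df1/df2 frames defined at the top of this section:

```python
import pandas as pd

df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                    'data2': range(3)})

# how='right' keeps every key from df2; the unmatched 'd' row
# gets NaN for data1, and the left-only key 'c' is dropped
right_joined = pd.merge(df1, df2, how='right')
print(right_joined)
```

Symmetrically, how='left' would keep 'c' and drop 'd'.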
left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
pd.merge(left1, right1, left_on='key', right_index=True)
| | key | value | group_val |
---|---|---|---|
0 | a | 0 | 3.5 |
2 | a | 2 | 3.5 |
3 | a | 3 | 3.5 |
1 | b | 1 | 7.0 |
4 | b | 4 | 7.0 |
# With hierarchically indexed data, things are a bit more complicated:
# you must pass a list of columns to merge on
lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
'key2': [2000, 2001, 2002, 2001, 2002],
'data': np.arange(5.)})
righth = pd.DataFrame(np.arange(12).reshape((6, 2)),
index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
[2001, 2000, 2000, 2000, 2001, 2002]],
columns=['event1', 'event2'])
print(lefth, "\n\n", righth)
   data    key1  key2
0   0.0    Ohio  2000
1   1.0    Ohio  2001
2   2.0    Ohio  2002
3   3.0  Nevada  2001
4   4.0  Nevada  2002

              event1  event2
Nevada 2001        0       1
       2000        2       3
Ohio   2000        4       5
       2000        6       7
       2001        8       9
       2002       10      11
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)
| | data | key1 | key2 | event1 | event2 |
---|---|---|---|---|---|
0 | 0.0 | Ohio | 2000 | 4 | 5 |
0 | 0.0 | Ohio | 2000 | 6 | 7 |
1 | 1.0 | Ohio | 2001 | 8 | 9 |
2 | 2.0 | Ohio | 2002 | 10 | 11 |
3 | 3.0 | Nevada | 2001 | 0 | 1 |
# Using the indexes of both sides of the merge also works
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'],
columns=['Ohio', 'Nevada'])
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])
print(left2, "\n\n", right2)
   Ohio  Nevada
a   1.0     2.0
c   3.0     4.0
e   5.0     6.0

   Missouri  Alabama
b       7.0      8.0
c       9.0     10.0
d      11.0     12.0
e      13.0     14.0
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
| | Ohio | Nevada | Missouri | Alabama |
---|---|---|---|---|
a | 1.0 | 2.0 | NaN | NaN |
b | NaN | NaN | 7.0 | 8.0 |
c | 3.0 | 4.0 | 9.0 | 10.0 |
d | NaN | NaN | 11.0 | 12.0 |
e | 5.0 | 6.0 | 13.0 | 14.0 |
# DataFrame has a more convenient join instance method for merging by index. It can
# also be used to combine many DataFrame objects that have the same or similar
# indexes but non-overlapping columns.
left2.join(right2, how='outer')
| | Ohio | Nevada | Missouri | Alabama |
---|---|---|---|---|
a | 1.0 | 2.0 | NaN | NaN |
b | NaN | NaN | 7.0 | 8.0 |
c | 3.0 | 4.0 | 9.0 | 10.0 |
d | NaN | NaN | 11.0 | 12.0 |
e | 5.0 | 6.0 | 13.0 | 14.0 |
left1.join(right1, on='key', how='inner')
| | key | value | group_val |
---|---|---|---|
0 | a | 0 | 3.5 |
2 | a | 2 | 3.5 |
3 | a | 3 | 3.5 |
1 | b | 1 | 7.0 |
4 | b | 4 | 7.0 |
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])
print(another)
   New York  Oregon
a       7.0     8.0
c       9.0    10.0
e      11.0    12.0
f      16.0    17.0
left2.join([right2, another])
| | Ohio | Nevada | Missouri | Alabama | New York | Oregon |
---|---|---|---|---|---|---|
a | 1.0 | 2.0 | NaN | NaN | 7.0 | 8.0 |
c | 3.0 | 4.0 | 9.0 | 10.0 | 9.0 | 10.0 |
e | 5.0 | 6.0 | 13.0 | 14.0 | 11.0 | 12.0 |
left2.join([right2, another], how='outer')
| | Ohio | Nevada | Missouri | Alabama | New York | Oregon |
---|---|---|---|---|---|---|
a | 1.0 | 2.0 | NaN | NaN | 7.0 | 8.0 |
b | NaN | NaN | 7.0 | 8.0 | NaN | NaN |
c | 3.0 | 4.0 | 9.0 | 10.0 | 9.0 | 10.0 |
d | NaN | NaN | 11.0 | 12.0 | NaN | NaN |
e | 5.0 | 6.0 | 13.0 | 14.0 | 11.0 | 12.0 |
f | NaN | NaN | NaN | NaN | 16.0 | 17.0 |
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
print(s1, "\n\n",s2, "\n\n",s3)
a    0
b    1
dtype: int64

c    2
d    3
e    4
dtype: int64

f    5
g    6
dtype: int64
pd.concat([s1, s2, s3])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
# By default concat works along axis=0, producing another Series. If you pass axis=1, the
# result will instead be a DataFrame (axis=1 is the columns):
pd.concat([s1, s2, s3], axis=1)
| | 0 | 1 | 2 |
---|---|---|---|
a | 0.0 | NaN | NaN |
b | 1.0 | NaN | NaN |
c | NaN | 2.0 | NaN |
d | NaN | 3.0 | NaN |
e | NaN | 4.0 | NaN |
f | NaN | NaN | 5.0 |
g | NaN | NaN | 6.0 |
# In this case there is no overlap on the other axis; the result's index is the sorted
# union (the 'outer' join) of the input indexes. You can instead intersect them by
# passing join='inner':
s4 = pd.concat([s1 * 5, s3])
s4
a    0
b    5
f    5
g    6
dtype: int64
pd.concat([s1, s4], axis=1)
| | 0 | 1 |
---|---|---|
a | 0.0 | 0 |
b | 1.0 | 5 |
f | NaN | 5 |
g | NaN | 6 |
pd.concat([s1, s4], axis=1, join='inner')
| | 0 | 1 |
---|---|---|
a | 0 | 0 |
b | 1 | 5 |
# join_axes let you specify the labels for the other axes; it was removed in pandas 1.0
pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']])
| | 0 | 1 |
---|---|---|
a | 0.0 | 0.0 |
c | NaN | NaN |
b | 1.0 | 5.0 |
e | NaN | NaN |
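Since join_axes is gone from current pandas, calling reindex on the concatenated result is the modern equivalent. A sketch using the s1/s4 Series from above:

```python
import pandas as pd

s1 = pd.Series([0, 1], index=['a', 'b'])
s3 = pd.Series([5, 6], index=['f', 'g'])
s4 = pd.concat([s1 * 5, s3])

# concat with the default outer join, then impose the desired row labels;
# labels absent from both inputs ('c', 'e') come back as NaN rows
result = pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e'])
print(result)
```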
# To create a hierarchical index on the concatenation axis, use the keys argument:
result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
result
one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64
result.unstack()
| | a | b | f | g |
---|---|---|---|---|
one | 0.0 | 1.0 | NaN | NaN |
two | 0.0 | 1.0 | NaN | NaN |
three | NaN | NaN | 5.0 | 6.0 |
# In the case of combining Series along axis=1, the keys become the DataFrame column headers:
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])
| | one | two | three |
---|---|---|---|
a | 0.0 | NaN | NaN |
b | 1.0 | NaN | NaN |
c | NaN | 2.0 | NaN |
d | NaN | 3.0 | NaN |
e | NaN | 4.0 | NaN |
f | NaN | NaN | 5.0 |
g | NaN | NaN | 6.0 |
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
columns=['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
columns=['three', 'four'])
print(df1, "\n\n", df2)
   one  two
a    0    1
b    2    3
c    4    5

   three  four
a      5     6
c      7     8
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])
| | level1, one | level1, two | level2, three | level2, four |
---|---|---|---|---|
a | 0 | 1 | 5.0 | 6.0 |
b | 2 | 3 | NaN | NaN |
c | 4 | 5 | 7.0 | 8.0 |
# If you pass a dict of objects instead of a list, the dict’s keys will be used for the keys option:
pd.concat({'level1': df1, 'level2': df2}, axis=1)
| | level1, one | level1, two | level2, three | level2, four |
---|---|---|---|---|
a | 0 | 1 | 5.0 | 6.0 |
b | 2 | 3 | NaN | NaN |
c | 4 | 5 | 7.0 | 8.0 |
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'], names=['upper', 'lower'])
| upper, lower | level1, one | level1, two | level2, three | level2, four |
---|---|---|---|---|
a | 0 | 1 | 5.0 | 6.0 |
b | 2 | 3 | NaN | NaN |
c | 4 | 5 | 7.0 | 8.0 |
# Finally, consider DataFrames whose row index is not meaningful in the context of
# the analysis:
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
print(df1, "\n\n", df2)
          a         b         c         d
0 -0.743842  0.366076  0.043001  0.315239
1 -0.126674  1.312018  0.689079 -0.647427
2 -0.929330  1.053159  0.470513 -0.418311

          b         d         a
0 -1.432140 -1.802894  1.099453
1 -1.137721 -0.052989  0.728343
pd.concat([df1, df2])
| | a | b | c | d |
---|---|---|---|---|
0 | -0.743842 | 0.366076 | 0.043001 | 0.315239 |
1 | -0.126674 | 1.312018 | 0.689079 | -0.647427 |
2 | -0.929330 | 1.053159 | 0.470513 | -0.418311 |
0 | 1.099453 | -1.432140 | NaN | -1.802894 |
1 | 0.728343 | -1.137721 | NaN | -0.052989 |
pd.concat([df1, df2], ignore_index=True)
| | a | b | c | d |
---|---|---|---|---|
0 | -0.743842 | 0.366076 | 0.043001 | 0.315239 |
1 | -0.126674 | 1.312018 | 0.689079 | -0.647427 |
2 | -0.929330 | 1.053159 | 0.470513 | -0.418311 |
3 | 1.099453 | -1.432140 | NaN | -1.802894 |
4 | 0.728343 | -1.137721 | NaN | -0.052989 |
objs List or dict of pandas objects to be concatenated. The only required argument
axis Axis to concatenate along; defaults to 0
join One of 'inner', 'outer', defaulting to 'outer'; whether to intersect (inner) or union (outer) the indexes along the other axes
join_axes Specific indexes to use for the other n-1 axes instead of performing union/intersection logic (removed in pandas 1.0; use reindex on the result instead)
keys Values to associate with objects being concatenated, forming a hierarchical index along the concatenation axis. Can either be a list or array of arbitrary values, an array of tuples, or a list of arrays (if multiple level arrays passed in levels)
levels Specific indexes to use as hierarchical index level or levels if keys passed
names Names for created hierarchical levels if keys and / or levels passed
verify_integrity Check new axis in concatenated object for duplicates and raise exception if so. By default (False) allows duplicates
ignore_index Do not preserve indexes along concatenation axis, instead producing a new range(total_length) index
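Two of the options above, verify_integrity and ignore_index, were not exercised in the examples. A small sketch of both, using two throwaway frames that share the default 0..1 row labels:

```python
import pandas as pd
import numpy as np

df_a = pd.DataFrame(np.arange(4).reshape(2, 2), columns=['a', 'b'])
df_b = pd.DataFrame(10 + np.arange(4).reshape(2, 2), columns=['a', 'b'])

# both frames carry the default 0..1 index, so the labels collide
try:
    pd.concat([df_a, df_b], verify_integrity=True)
except ValueError as exc:
    print('duplicates rejected:', exc)

# ignore_index discards the old labels and renumbers the rows
combined = pd.concat([df_a, df_b], ignore_index=True)
print(combined.index.tolist())  # [0, 1, 2, 3]
```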
Another data-combination situation can't be expressed as either a merge or a concatenation: you may have two datasets whose indexes overlap in full or in part. As a motivating example, consider NumPy's where function, which expresses a vectorized if-else:
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series(np.arange(len(a), dtype=np.float64),
index=['f', 'e', 'd', 'c', 'b', 'a'])
print(a, "\n\n", b)
f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64
b.iloc[-1] = np.nan  # positional assignment; plain b[-1] is ambiguous with a label index
np.where(pd.isnull(a), b, a)
array([ 0. , 2.5, 2. , 3.5, 4.5, nan])
# Series has a combine_first method, which performs the equivalent of this operation plus data alignment
b[:-2].combine_first(a[2:])
a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64
# With DataFrames, combine_first naturally does the same thing column by column, so
# you can think of it as “patching” missing data in the calling object with data from the
# object you pass:
df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
'b': [np.nan, 2., np.nan, 6.],
'c': range(2, 18, 4)})
df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],
'b': [np.nan, 3., 4., 6., 8.]})
df1.combine_first(df2)
| | a | b | c |
---|---|---|---|
0 | 1.0 | NaN | 2.0 |
1 | 4.0 | 2.0 | 6.0 |
2 | 5.0 | 4.0 | 10.0 |
3 | 3.0 | 6.0 | 14.0 |
4 | 7.0 | 8.0 | NaN |