In [1]:
from platform import python_version

# Display the version string of the Python interpreter running this notebook.
python_version()
Out[1]:
'3.6.7'
In [15]:
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
In [27]:
# Display the installed pandas and matplotlib versions as a (pandas, matplotlib) pair.
(pd.__version__, matplotlib.__version__)
Out[27]:
('0.25.1', '3.0.3')
In [28]:
# Sample frame, one tuple per row: rows 0/1 and 4/5 are exact duplicates,
# while rows 2/3 share only 'title' and 'year'.
df = pd.DataFrame(
    [
        ('bar', 'Sed mollis tempor accumsan.', 2010),
        ('bar', 'Sed mollis tempor accumsan.', 2010),
        ('baz', 'Nullam et feugiat turpis, non condimentum dolor.', 2005),
        ('baz', 'Aenean eu aliquam nunc.', 2005),
        ('foo', 'Lorem ipsum dolor sit amet.', 2011),
        ('foo', 'Lorem ipsum dolor sit amet.', 2011),
    ],
    columns=['title', 'contents', 'year'],
)

df
Out[28]:
title contents year
0 bar Sed mollis tempor accumsan. 2010
1 bar Sed mollis tempor accumsan. 2010
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
3 baz Aenean eu aliquam nunc. 2005
4 foo Lorem ipsum dolor sit amet. 2011
5 foo Lorem ipsum dolor sit amet. 2011

show duplicate rows (every occurrence after the first)

In [29]:
# duplicated() flags every repeat of an earlier row; use the flags as a row mask.
df.loc[df.duplicated()]
Out[29]:
title contents year
1 bar Sed mollis tempor accumsan. 2010
5 foo Lorem ipsum dolor sit amet. 2011

show duplicate rows, including the original (first) occurrence

In [31]:
# keep=False flags every member of a duplicate group, the first occurrence included.
df.loc[df.duplicated(keep=False)]
Out[31]:
title contents year
0 bar Sed mollis tempor accumsan. 2010
1 bar Sed mollis tempor accumsan. 2010
4 foo Lorem ipsum dolor sit amet. 2011
5 foo Lorem ipsum dolor sit amet. 2011

count duplicate rows

In [30]:
# Each True in the duplicated() mask is one duplicate row, so summing the mask
# gives the count without building the filtered frame.
int(df.duplicated().sum())
Out[30]:
2

show duplicate rows, comparing only some columns

In [20]:
# Only 'title' and 'year' are compared, so rows that differ in 'contents'
# can still be flagged as duplicates.
df.loc[df.duplicated(subset=['title', 'year'])]
Out[20]:
title contents year
1 bar Sed mollis tempor accumsan. 2010
3 baz Aenean eu aliquam nunc. 2005
5 foo Lorem ipsum dolor sit amet. 2011

drop duplicates, keep original

In [21]:
# keep='first' is the default: the first row of each duplicate group survives.
df.drop_duplicates(keep='first')
Out[21]:
title contents year
0 bar Sed mollis tempor accumsan. 2010
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
3 baz Aenean eu aliquam nunc. 2005
4 foo Lorem ipsum dolor sit amet. 2011

drop duplicates based on some columns

In [22]:
# Rows count as duplicates when 'title' and 'year' match; the first row of
# each such group is kept (keep='first' is the default).
df.drop_duplicates(subset=['title', 'year'], keep='first')
Out[22]:
title contents year
0 bar Sed mollis tempor accumsan. 2010
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
4 foo Lorem ipsum dolor sit amet. 2011

drop all rows that are or have duplicates (keep none of them)

In [23]:
# Keep only rows that occur exactly once: invert the mask that flags every
# member of a duplicate group.
df.loc[~df.duplicated(keep=False)]
Out[23]:
title contents year
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
3 baz Aenean eu aliquam nunc. 2005

mark duplicates

In [24]:
# Add a boolean 'is_duplicate' column; assign() returns a new frame and
# leaves df itself unchanged.
df.assign(is_duplicate=df.duplicated())
Out[24]:
title contents year is_duplicate
0 bar Sed mollis tempor accumsan. 2010 False
1 bar Sed mollis tempor accumsan. 2010 True
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005 False
3 baz Aenean eu aliquam nunc. 2005 False
4 foo Lorem ipsum dolor sit amet. 2011 False
5 foo Lorem ipsum dolor sit amet. 2011 True

custom keep logic: for each title/contents pair, keep the row with the latest year

In [25]:
# Rebuild df, one tuple per row: the 'bar' and 'foo' pairs now differ in
# 'year', so they are duplicates only with respect to ('title', 'contents').
df = pd.DataFrame(
    [
        ('bar', 'Sed mollis tempor accumsan.', 2009),
        ('bar', 'Sed mollis tempor accumsan.', 2019),
        ('baz', 'Nullam et feugiat turpis, non condimentum dolor.', 2005),
        ('baz', 'Aenean eu aliquam nunc.', 2005),
        ('foo', 'Lorem ipsum dolor sit amet.', 2015),
        ('foo', 'Lorem ipsum dolor sit amet.', 1995),
    ],
    columns=['title', 'contents', 'year'],
)

df
Out[25]:
title contents year
0 bar Sed mollis tempor accumsan. 2009
1 bar Sed mollis tempor accumsan. 2019
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
3 baz Aenean eu aliquam nunc. 2005
4 foo Lorem ipsum dolor sit amet. 2015
5 foo Lorem ipsum dolor sit amet. 1995
In [26]:
# Sort so the largest year comes last within each (title, contents) group,
# then keep only that last row of every group.
df.sort_values(by=['title', 'contents', 'year']).drop_duplicates(
    subset=['title', 'contents'],
    keep='last',
)
Out[26]:
title contents year
1 bar Sed mollis tempor accumsan. 2019
3 baz Aenean eu aliquam nunc. 2005
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
4 foo Lorem ipsum dolor sit amet. 2015
In [ ]:
 
In [ ]:
 
In [ ]: