from platform import python_version
# Show the version string of the Python interpreter executing this code.
python_version()
'3.6.7'
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
# Report the pandas and matplotlib versions as a (pandas, matplotlib) pair.
(pd.__version__, matplotlib.__version__)
('0.25.1', '3.0.3')
# Sample data, one record per row: rows 0/1 and rows 4/5 are exact copies of
# each other, while rows 2/3 share title and year but differ in contents.
df = pd.DataFrame(
    [
        ('bar', 'Sed mollis tempor accumsan.', 2010),
        ('bar', 'Sed mollis tempor accumsan.', 2010),
        ('baz', 'Nullam et feugiat turpis, non condimentum dolor.', 2005),
        ('baz', 'Aenean eu aliquam nunc.', 2005),
        ('foo', 'Lorem ipsum dolor sit amet.', 2011),
        ('foo', 'Lorem ipsum dolor sit amet.', 2011),
    ],
    columns=['title', 'contents', 'year'],
)
df
| | title | contents | year |
---|---|---|---|
0 | bar | Sed mollis tempor accumsan. | 2010 |
1 | bar | Sed mollis tempor accumsan. | 2010 |
2 | baz | Nullam et feugiat turpis, non condimentum dolor. | 2005 |
3 | baz | Aenean eu aliquam nunc. | 2005 |
4 | foo | Lorem ipsum dolor sit amet. | 2011 |
5 | foo | Lorem ipsum dolor sit amet. | 2011 |
# Select the rows that repeat an earlier row in every column; with the
# default keep='first', the first occurrence itself is not flagged.
df.loc[df.duplicated()]
| | title | contents | year |
---|---|---|---|
1 | bar | Sed mollis tempor accumsan. | 2010 |
5 | foo | Lorem ipsum dolor sit amet. | 2011 |
# keep=False flags every member of a duplicate group, so the first
# occurrences are listed alongside their repeats.
df.loc[df.duplicated(keep=False)]
| | title | contents | year |
---|---|---|---|
0 | bar | Sed mollis tempor accumsan. | 2010 |
1 | bar | Sed mollis tempor accumsan. | 2010 |
4 | foo | Lorem ipsum dolor sit amet. | 2011 |
5 | foo | Lorem ipsum dolor sit amet. | 2011 |
# Count the repeated rows by summing the boolean mask (True counts as 1);
# int() turns the NumPy integer into a plain Python int.
int(df.duplicated().sum())
2
# Compare only title and year: row 3 now counts as a repeat of row 2 even
# though their contents differ.
df.loc[df.duplicated(subset=['title', 'year'])]
| | title | contents | year |
---|---|---|---|
1 | bar | Sed mollis tempor accumsan. | 2010 |
3 | baz | Aenean eu aliquam nunc. | 2005 |
5 | foo | Lorem ipsum dolor sit amet. | 2011 |
# Remove rows that repeat an earlier row in every column, keeping the first
# occurrence of each (keep='first' is the default, spelled out here).
df.drop_duplicates(keep='first')
| | title | contents | year |
---|---|---|---|
0 | bar | Sed mollis tempor accumsan. | 2010 |
2 | baz | Nullam et feugiat turpis, non condimentum dolor. | 2005 |
3 | baz | Aenean eu aliquam nunc. | 2005 |
4 | foo | Lorem ipsum dolor sit amet. | 2011 |
# Keep only the first row for each (title, year) pair; the contents column
# is ignored when deciding what counts as a duplicate.
df.drop_duplicates(subset=['title', 'year'], keep='first')
| | title | contents | year |
---|---|---|---|
0 | bar | Sed mollis tempor accumsan. | 2010 |
2 | baz | Nullam et feugiat turpis, non condimentum dolor. | 2005 |
4 | foo | Lorem ipsum dolor sit amet. | 2011 |
# keep=False discards every member of a duplicate group, leaving only the
# rows that are unique across all columns (subset=None compares them all).
df.drop_duplicates(subset=None, keep=False)
| | title | contents | year |
---|---|---|---|
2 | baz | Nullam et feugiat turpis, non condimentum dolor. | 2005 |
3 | baz | Aenean eu aliquam nunc. | 2005 |
# Return a copy of df with a boolean is_duplicate column marking the rows
# that repeat an earlier row; df itself is left unchanged.
df.assign(is_duplicate=df.duplicated())
| | title | contents | year | is_duplicate |
---|---|---|---|---|
0 | bar | Sed mollis tempor accumsan. | 2010 | False |
1 | bar | Sed mollis tempor accumsan. | 2010 | True |
2 | baz | Nullam et feugiat turpis, non condimentum dolor. | 2005 | False |
3 | baz | Aenean eu aliquam nunc. | 2005 | False |
4 | foo | Lorem ipsum dolor sit amet. | 2011 | False |
5 | foo | Lorem ipsum dolor sit amet. | 2011 | True |
# Same titles and contents as before, but the exact-copy pairs (rows 0/1 and
# rows 4/5) now carry different years, so no row is a full duplicate.
df = pd.DataFrame(
    [
        ('bar', 'Sed mollis tempor accumsan.', 2009),
        ('bar', 'Sed mollis tempor accumsan.', 2019),
        ('baz', 'Nullam et feugiat turpis, non condimentum dolor.', 2005),
        ('baz', 'Aenean eu aliquam nunc.', 2005),
        ('foo', 'Lorem ipsum dolor sit amet.', 2015),
        ('foo', 'Lorem ipsum dolor sit amet.', 1995),
    ],
    columns=['title', 'contents', 'year'],
)
df
| | title | contents | year |
---|---|---|---|
0 | bar | Sed mollis tempor accumsan. | 2009 |
1 | bar | Sed mollis tempor accumsan. | 2019 |
2 | baz | Nullam et feugiat turpis, non condimentum dolor. | 2005 |
3 | baz | Aenean eu aliquam nunc. | 2005 |
4 | foo | Lorem ipsum dolor sit amet. | 2015 |
5 | foo | Lorem ipsum dolor sit amet. | 1995 |
# Sort so that, within each (title, contents) pair, the highest year comes
# last; keep='last' then retains exactly that most recent row per pair.
(
    df
    .sort_values(by=['title', 'contents', 'year'])
    .drop_duplicates(subset=['title', 'contents'], keep='last')
)
| | title | contents | year |
---|---|---|---|
1 | bar | Sed mollis tempor accumsan. | 2019 |
3 | baz | Aenean eu aliquam nunc. | 2005 |
2 | baz | Nullam et feugiat turpis, non condimentum dolor. | 2005 |
4 | foo | Lorem ipsum dolor sit amet. | 2015 |