import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Netflix titles catalogue and preview its first three rows.
netflix = pd.read_csv(filepath_or_buffer='netflix_titles.csv')
netflix.head(n=3)
show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | s1 | TV Show | 3% | NaN | João Miguel, Bianca Comparato, Michel Gomes, R... | Brazil | August 14, 2020 | 2020 | TV-MA | 4 Seasons | International TV Shows, TV Dramas, TV Sci-Fi &... | In a future where the elite inhabit an island ... |
1 | s2 | Movie | 7:19 | Jorge Michel Grau | Demián Bichir, Héctor Bonilla, Oscar Serrano, ... | Mexico | December 23, 2016 | 2016 | TV-MA | 93 min | Dramas, International Movies | After a devastating earthquake hits Mexico Cit... |
2 | s3 | Movie | 23:59 | Gilbert Chan | Tedd Chan, Stella Chung, Henley Hii, Lawrence ... | Singapore | December 20, 2018 | 2011 | R | 78 min | Horror Movies, International Movies | When an army recruit is found dead, his fellow... |
# Print each column's name, non-null count, and dtype, plus memory usage.
netflix.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7787 entries, 0 to 7786 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 show_id 7787 non-null object 1 type 7787 non-null object 2 title 7787 non-null object 3 director 5398 non-null object 4 cast 7069 non-null object 5 country 7280 non-null object 6 date_added 7777 non-null object 7 release_year 7787 non-null int64 8 rating 7780 non-null object 9 duration 7787 non-null object 10 listed_in 7787 non-null object 11 description 7787 non-null object dtypes: int64(1), object(11) memory usage: 730.2+ KB
This dataset has 7787 rows and 12 columns. Column types: one integer column (`release_year`) and eleven object (string) columns.
# Count the missing values in every column.
missing_values = netflix.isnull().sum()
missing_values.sort_values(ascending = True)
# Express the counts as a percentage of all rows. Two decimals are kept so
# that columns with only a handful of missing entries are not rounded down
# to 0 and mistaken for complete columns.
missing_values_per = ((missing_values/len(netflix))*100).round(2)
missing_values_per.sort_values(ascending=True)
show_id 0.0 type 0.0 title 0.0 date_added 0.0 release_year 0.0 rating 0.0 duration 0.0 listed_in 0.0 description 0.0 country 7.0 cast 9.0 director 31.0 dtype: float64
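The columns `country`, `cast`, and `director` have a substantial share of missing values (about 7%, 9%, and 31%). `date_added` and `rating` are also missing a few entries (10 and 7 rows), which round to 0% in the table above.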
# Share of each content type, as a percentage of all titles.
content_type = netflix['type']
percentage_content_type = (content_type.value_counts()/len(content_type))*100
percentage_content_type
# Build the wedge labels from the index of the percentages so that every
# label is guaranteed to sit next to its own value, whatever order
# value_counts() returns the categories in.
display_names = {'Movie': 'Movies', 'TV Show': 'TV Shows'}
labels = [display_names.get(kind, kind) for kind in percentage_content_type.index]
plt.pie(percentage_content_type, labels = labels, autopct='%1.1f%%', startangle = 90)
([<matplotlib.patches.Wedge at 0x7fe98e8f7ca0>, <matplotlib.patches.Wedge at 0x7fe98c867d30>], [Text(-0.9087972267209458, -0.6197480138768643, 'Movies'), Text(0.9087972847459387, 0.6197479287891243, 'TV Shows')], [Text(-0.4957075782114249, -0.3380443712055623, '69.1%'), Text(0.49570760986142104, 0.3380443247940678, '30.9%')])
About two-thirds of the Netflix content is movies (69%) and about one-third is TV shows (31%).
def extract_last_word(year):
    """Return the last whitespace-separated word of *year* as a string.

    The argument is converted with ``str()`` first, so non-string values
    are accepted; a float NaN therefore yields ``'nan'``. A blank or
    whitespace-only value yields ``''`` instead of raising ``IndexError``.
    """
    words = str(year).split()
    return words[-1] if words else ''
# Take the last word of every 'date_added' entry (the year part of dates such
# as "August 14, 2020") and tally how many titles fall under each value.
year = netflix['date_added'].map(extract_last_word)
year.value_counts()
2019 2153 2020 2009 2018 1685 2017 1225 2016 443 2021 117 2015 88 2014 25 2011 13 2013 11 nan 10 2012 3 2008 2 2009 2 2010 1 Name: date_added, dtype: int64
# Split the catalogue into one table per content type.
# The 'type' column holds the singular labels 'TV Show' and 'Movie'; comparing
# against 'TV Shows' matches no rows and leaves the TV table empty.
tv = netflix[netflix['type'] == 'TV Show']
movie = netflix[netflix['type'] == 'Movie']