#!/usr/bin/env python
# coding: utf-8
# # Analyzing Netflix Dataset With Python
#
# (Part-1)
# ### Import libraries
# In[1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.style as style
import os
import re
get_ipython().run_line_magic('matplotlib', 'inline')
# ### Set default options
# In[2]:
# Widen column display and set project-wide matplotlib defaults.
pd.set_option("display.max_colwidth", 100)
plt.rcParams.update({
    "figure.figsize": [15, 5],
    "axes.titlesize": 20,
    "axes.labelsize": 12,
    "font.size": 14,
})
# colour-blind-friendly palette for all plots
style.use('tableau-colorblind10')
# ### Read and explore Netflix dataset
# In[3]:
# Directory that contains the Netflix dataset.
# FIX: os.path.join keeps the path portable — the original hard-coded "\\"
# separators and only worked on Windows.
net_dir = os.path.join(os.getcwd(), "Netflix_Dataset")
# zip file containing netflix dataset
net_zipfile = "Netflix_Titles.zip"
# Read the compressed CSV straight out of the zip archive into a raw dataframe
# (pandas infers the compression from the .zip extension).
netflix_raw_data = pd.read_csv(os.path.join(net_dir, net_zipfile))
# #### dataframe.info() and dataframe.describe()
# In[4]:
# Quick structural overview: dtypes, non-null counts, and summary statistics
# for the numeric columns.
netflix_raw_data.info()
# show whole numbers while describing (release_year is integral)
pd.options.display.float_format = "{:.0f}".format
netflix_raw_data.describe()
# In[5]:
# reset float format
pd.options.display.float_format = "{:.3f}".format
# #### Duplicate rows?
# In[6]:
# check for any duplicate records
print("No. of duplicated records (if any):", netflix_raw_data.duplicated().sum())
# #### Content type
# In[7]:
# how many titles of each content category (Movie / TV Show)
netflix_raw_data["type"].value_counts()
# ### Sample rows
# In[8]:
# set some display options
pd.options.display.max_colwidth = 50
pd.options.display.max_columns = 20
# sample rows from netflix dataset
# NOTE(review): positions 7748-7753 look hand-picked to show a representative
# mix of values — confirm against the dataset.
netflix_raw_data.iloc[7748: 7754, :]
# ### Table for columns with Missing Values
# In[9]:
# Build a summary table: one row per column that has missing data, with the
# raw null count and the percentage of affected rows.
null_column_list = []
# total no. of rows in dataset
total_rows = netflix_raw_data.shape[0]
for each_col in netflix_raw_data.columns:
    # FIX: compute the null count once per column (the original computed the
    # same isnull().sum() twice — once for the test, once for the value)
    null_sum = netflix_raw_data[each_col].isnull().sum()
    if null_sum > 0:
        null_perc = "{:.4f}".format(null_sum / total_rows * 100)
        null_column_list.append([each_col, null_sum, null_perc])
# display missing row count and % as a dataframe
pd.DataFrame(null_column_list, columns = ["Column Name", "No. of rows with missing values",
                                          "Missing value as % (Netflix Dataset)"])
# In[10]:
# delete the dummy variables
del total_rows, null_column_list
# ## Explore Netflix Dataset
#
# ### type Column
# In[11]:
# create figure
fig, ax = plt.subplots(figsize = (12,3))
# percentage share of each content type (Movie / TV Show)
type_share = netflix_raw_data["type"].value_counts(normalize = True)
# draw one horizontal bar per content type
ax.barh(y = type_share.index, width = type_share.mul(100).values)
# set title for the plot
ax.set_title("Types of content available on Netflix")
# hide the x-axis ticks and labels
ax.set_xticklabels(labels = "")
ax.tick_params(bottom = False)
# write the percentage inside each bar, roughly centred
for bar in ax.patches:
    ax.annotate(text = "{:.2f}%".format(bar.get_width()),
                xy = [bar.get_width() / 2, bar.get_y() + 0.35],
                color = "white", fontsize = "large")
# hide every spine except the left one
for spine_name in ax.spines:
    if spine_name != "left":
        ax.spines[spine_name].set_visible(False)
# show the plot
plt.show()
# ### rating Column
# In[12]:
# identify the titles whose rating is missing
netflix_raw_data.loc[netflix_raw_data["rating"].isnull(), ["title", "director", "release_year", "rating"]]
# #### Mapper for titles with missing values
# In[13]:
# Ratings researched externally (copied from Mike's work), paired with the
# titles that currently have a null rating, in order of appearance.
researched_ratings = ["TV-PG", "TV-14", "TV-Y7", "TV-MA", "TV-MA", "TV-MA", "PG-13"]
titles_missing_rating = netflix_raw_data.loc[netflix_raw_data["rating"].isnull(), "title"].values
ratings_map = dict(zip(titles_missing_rating, researched_ratings))
# display ratings dictionary
ratings_map
# In[14]:
# fill the missing ratings from the mapper, leaving existing ratings untouched
netflix_raw_data["rating"] = netflix_raw_data["title"].map(ratings_map).fillna(netflix_raw_data["rating"])
# In[15]:
# import pywaffle
from pywaffle import Waffle
# In[16]:
# rating counts for Netflix Movies
data1 = dict(netflix_raw_data[netflix_raw_data["type"] == "Movie"]
             ["rating"].value_counts())
# rating counts for Netflix TV Shows
data2 = dict(netflix_raw_data[netflix_raw_data["type"] != "Movie"]
             ["rating"].value_counts())
# In[17]:
# Waffle chart with two subplots, one per type of Netflix content.
fig = plt.figure(
    FigureClass = Waffle,
    plots = {
        '211':
        {'values': data1,
         'labels' : [f"{k} ({int(v / sum(data1.values()) * 100)}%)" for k, v in data1.items()],
         'legend': {
             'loc': 'lower left',
             'bbox_to_anchor': (0, -0.6),
             'fontsize': 12,
             'ncol': 5},
         'title' : {"label" : "Ratings Wise Distribution of Netflix Movies"}
         },
        '212':
        {'values': data2,
         # BUG FIX: TV-show percentages must be computed against the TV-show
         # total (data2) — the original divided by the movie total (data1).
         'labels' : [f"{k} ({int(v / sum(data2.values()) * 100)}%)" for k, v in data2.items()],
         'legend': {
             'loc': 'lower left',
             'bbox_to_anchor': (0, -0.45),
             'fontsize': 12,
             'ncol': 5},
         'title' : {"label" : "Ratings Wise Distribution of Netflix Shows"}
         }
        },
    rows = 3,
    columns = 15,
    icons = "user",
    cmap_name = "tab20",
    figsize = (14, 6.5)
    )
plt.tight_layout(h_pad = 0)
plt.show()
# ### date_added Column & release Column
# In[18]:
# convert date_added column to a datetime format
netflix_raw_data["date_added"] = pd.to_datetime(netflix_raw_data["date_added"])
# keep only records where date_added is known (drops the null-date rows)
netflix_raw_data = netflix_raw_data[netflix_raw_data["date_added"].notna()]
# shape of resultant netflix_raw_data
netflix_raw_data.shape
# #### Additional Analysis
#
# This is a small analysis of:
# - distribution of titles based on the decade they were released
# - distribution of titles hosted by Netflix since its inception
#
# For ease of plotting:
# - the titles that were released before the year 1980 have been grouped under `Before 1980`. Why this year? Because I was born in this decade!
# - since Netflix established itself in 190 countries by 2016, all the titles added (on the Netflix platform) before 2016 have been grouped under `Added before 2016`.
# In[19]:
# floor each release year to its decade and count titles per decade
decade_df = (netflix_raw_data["release_year"] // 10 * 10).value_counts().reset_index()
# rename the dataframe columns
# NOTE(review): the "index"/"release_year" column names produced here by
# value_counts().reset_index() match pandas < 2.0 — confirm the pandas version.
decade_df.rename(columns = {"index": "Decade", "release_year" : "No. of Titles"}, inplace = True)
# relabel every decade of 1970 and earlier under one "Before 1980" bucket
decade_df["Decade"] = decade_df["Decade"].apply(lambda d: d if d > 1970 else "Before 1980")
# label each decade as YYYY's
decade_df["Decade"] = decade_df["Decade"].astype(str).apply(lambda s: s + "'s")
# In[20]:
# total titles per release decade (merges the rows relabelled "Before 1980")
data1 = decade_df.groupby("Decade")["No. of Titles"].sum()
# titles per year of addition, with everything pre-2016 in one bucket
data2 = netflix_raw_data["date_added"].dt.year.apply(lambda d: "In " + str(d) if d > 2015 else "Before 2016").\
value_counts().sort_index()
# In[21]:
# create figure object
fig, ax = plt.subplots(ncols = 2, figsize = (13.5, 6))
# plot the years when title was released
ax[0].pie(x = data1.values,
labels = data1.index,
labeldistance = 1.03,
autopct = "%.1f%%",
explode = [0.1, 0.05, 0, 0, 0, 0.15],
pctdistance = 0.85,
startangle = 0,
wedgeprops = {"width": 0.4})
# annotation in the center
ax[0].annotate(text = "Decade of Release", xy = (-0.5,0), fontsize = 15)
# plot the years when title was added
ax[1].pie(x = data2.values,
labels = data2.index,
labeldistance = 1.03,
autopct = "%.1f%%",
explode = [0.1, 0.05, 0, 0, 0, 0, 0.15],
pctdistance = 0.85,
startangle = 0,
rotatelabels = False,
wedgeprops = {"width": 0.4})
# annotation in the center
ax[1].annotate(text = "Year of Addition", xy = (-0.3,0), fontsize = 15)
# extras
# plt.subplots_adjust(wspace = 0.2, hspace = 0)
plt.tight_layout(pad = 1)
plt.suptitle(t = "Distribution of Titles on Netflix by the", fontsize = 20)
# show plot
plt.show()
# In[22]:
# delete non-essential variables
del decade_df, data1, data2
# ### duration Column
#
# This column specifies the duration of a title in No. of seasons or runtime in minutes. However, if we analyze the values, the TV Shows are specified with seasons and Movies have runtime minutes. No series has runtime in minutes.
# In[23]:
# title type with duration in seasons
print(netflix_raw_data.loc[netflix_raw_data["duration"].str.contains("Season"), "type"].value_counts(), "\n")
# title type with duration in minutes
print(netflix_raw_data.loc[netflix_raw_data["duration"].str.contains("min"), "type"].value_counts())
# In[24]:
# duration for title type TV Show
netflix_raw_data.loc[netflix_raw_data["duration"].str.contains("Season"), "duration"].value_counts()
# ### Visualize distribution of seasons for titles on netflix
# In[25]:
# counts of each "N Season(s)" duration value for TV shows
data = netflix_raw_data.loc[netflix_raw_data["duration"].str.contains("Season")]["duration"].value_counts()
# Bucket long-running shows so the pie-chart stays readable:
# 1-5 seasons keep their own label, 6-8 / 9-10 / 11+ get grouped labels.
for label in data.index:
    season_count = int(label.split()[0])
    if season_count < 6:
        continue
    if season_count <= 8:
        data.rename({label : "As long as GOT"}, inplace = True)
    elif season_count <= 10:
        data.rename({label : "As long as F.R.I.E.N.D.S"}, inplace = True)
    else:
        data.rename({label : "Too long"}, inplace = True)
# collapse the duplicate labels created by the renames and total their counts
data = data.groupby(data.index).sum()
# In[26]:
# create a figure object
fig, ax = plt.subplots(figsize = (12, 8))
# Donut chart of the season-count distribution (8 buckets after grouping).
ax.pie(x = data,
labels = data.index,
autopct = "%.1f%%",
pctdistance = 0.89,
# progressively pull out the long-running-show buckets
explode = [0, 0, 0, 0, 0.1, 0.2, 0.3, 0.4],
labeldistance = 1.03,
startangle = 30,
radius = 1.2,
wedgeprops = {"width": 0.5})
# set title for the plot
ax.set_title("Netflix and Chill...", pad = 20)
ax.annotate(text = "But for How long?", xy = (-0.5, 0), fontsize = "18")
## Custom legend for grouped items
# Display text for the grouped buckets.
# NOTE(review): these keys do not exactly match the bucket labels used when
# grouping above ("As long as GOT", "Too long") — this only affects the
# legend text, but confirm the intended wording and ordering.
handler_map = {"As long as F.R.I.E.N.D.S" : "Upto 10 Seasons",
"As long as GOT (Game Of Thrones)" : "Upto 8 Seasons",
"Too Long" : "More than 10 Seasons"}
# get the default handles and labels
handles, labels = ax.get_legend_handles_labels()
# keep only the last three wedge handles, which are the grouped buckets
ax.legend(handles = handles[-3:],
labels = (" : ".join([k,v]) for k,v in handler_map.items()),
bbox_to_anchor = (1, 1.05))
# show the plot
plt.show()
# In[27]:
# delete temporary/intermediate variables
del data
# In[28]:
# create figure object
fig, ax = plt.subplots()
# Runtime (in minutes) of every Movie title, parsed ONCE instead of three
# times as in the original: strip the " min" suffix and cast to int.
movie_runtime = netflix_raw_data.loc[netflix_raw_data["duration"].str.contains("min"),
                                     "duration"].str.strip("min").astype(int)
mean_runtime = movie_runtime.mean()
# distribution of runtime in minutes
ax.hist(x = movie_runtime, bins = 50)
# draw vertical dashed line at the mean runtime for movies
ax.axvline(x = mean_runtime,
           lw = 3,
           dashes = [5, 2, 1, 2],
           color = "orange",
           label = "Mean")
# annotate the mean value next to the line
ax.annotate(text = "{:.2f}".format(mean_runtime),
            xy = [100, 700])
# extras
ax.legend()
ax.set_title("Distribution of Runtime Minutes for Movies on Netflix")
ax.set_ylabel("Count", fontsize = "large")
ax.set_xlabel("Minutes", fontsize = "large")
# show plot
plt.show()
# ## Tackle the Missing Values
#
# The big bad boy: Missing Values in 3 columns - Director, Cast and Country of origin of a title.
# The idea is to try out the best approach possible and fill-in as many missing values as we can.
#
# Why IMDB? [IMDB.com](https://www.imdb.com/) provides separate datasets with different information. The common link between these datasets are alpha-numeric columns like "tconst", "nconst" etc. that provide unique identifiers for a title/ cast/ crew member.
# ### Split the Netflix dataset into 2 separate dataframes
#
# - netflix_data : dataframe with non-null values for all the columns
# - missing_data : dataframe with rows where values for any of the 3 columns is missing
# In[29]:
"""split the dataframe with missing and non-missing values"""
# single mask: rows where any of director / cast / country is missing
has_missing = (netflix_raw_data["director"].isnull() |
               netflix_raw_data["cast"].isnull() |
               netflix_raw_data["country"].isnull())
# rows fully populated in all three columns
netflix_data = netflix_raw_data[~has_missing].reset_index(drop = True)
# rows with at least one of the three columns missing
missing_data = netflix_raw_data[has_missing].reset_index(drop = True)
# In[30]:
# shape of resultant datasets
print("Dataframe Name: (Rows, Columns)",
      "-".rjust(30, "-"),
      "Data with Nulls: " + str(missing_data.shape),
      "Non-null data: " + str(netflix_data.shape), sep = "\n")
# In[31]:
# how many nulls remain per column in the missing-data partition
missing_data[["director", "cast", "country"]].isnull().sum()
# ### Concatenate the title and release_year columns in missing_data dataframes
#
# In order to compare the Netflix dataset and datasets provided by IMDB,
#
# - special or non-alphanumeric characters have been removed from title names
# - lower case has been applied to all the title names
# - underscore has been added between title and year values while concatenation
# In[32]:
# Build the "title_year" join key: strip non-alphanumerics from the title,
# lower-case it, then append the release year after an underscore.
cleaned_title = missing_data["title"].str.replace(r"[^\w\s]+", "", regex = True).str.lower()
missing_data["combo"] = cleaned_title.str.cat(missing_data["release_year"].astype(str), sep = "_")
# sample rows with selected columns only
missing_data[["title", "release_year", "combo", "director", "cast", "country"]].head(10)
# ### Read IMDB datasets
# In[33]:
# Set up directory and file names.
# NOTE(review): Windows-style separators — assumes this notebook runs on
# Windows and that read_imdb_files concatenates directory + file name.
imdb_dir = os.getcwd() + "\\IMDB_Dataset\\"
imdb_file_names = ["title_akas.tsv", "title_basics.tsv", "title_crew.tsv", "title_principals.tsv", "name_basics.tsv"]
# In[34]:
# import the project-local IMDB_work.py helper module aliased as "iw"
import IMDB_work as iw
# In[35]:
print("Docstring for the module and function", "\n")
print(iw.__doc__)
print(iw.read_imdb_files.__doc__)
# In[36]:
""" Read the title_basics.tsv file """
title_basics = iw.read_imdb_files(file_name = imdb_file_names[1], file_directory = imdb_dir)
# In[37]:
""" Read the name_basics.tsv file """
name_basics = iw.read_imdb_files(file_name = imdb_file_names[-1], file_directory = imdb_dir)
# In[38]:
""" Read the title_crew.tsv file """
title_crew = iw.read_imdb_files(file_name = imdb_file_names[2], file_directory = imdb_dir)
# In[39]:
"""Read title_principals.tsv file"""
title_principals = iw.read_imdb_files(file_name = imdb_file_names[3], file_directory = imdb_dir)
# ### Finding Common Titles
# In[40]:
# combination of primaryTitle and startYear columns >> title_basics dataset
title_basics["combo"] = title_basics["primaryTitle"].str.replace(r"[^\w\s]+", "", regex = True).str.lower().\
str.cat(title_basics["startYear"].astype(str), sep = "_")
# In[41]:
# sample rows from title_basics dataframe
title_basics.head()
# In[42]:
"""
- match the combination in title_basics and missing_data dataframes
- collect and store the matched records in a temporary dataframe - match_df
"""
all_matches_df = title_basics.loc[title_basics["combo"].isin(missing_data["combo"]),
["tconst", "combo"]].reset_index(drop = True).\
groupby("combo")["tconst"].unique().reset_index()
# sample rows from resultant match_df dataframe
print("Shape:", all_matches_df.shape)
all_matches_df.head()
# In[43]:
# filter out unique title names i.e. no other title of the same name was released at any other time
all_matches_df = all_matches_df[all_matches_df["tconst"].str.len() == 1].\
set_index("combo").explode("tconst").reset_index()
# shape of resultant match_df
print(all_matches_df.shape)
# keep all_macthes_df as backup and create a dataframe for intermediate steps
match_df = all_matches_df.copy()
# ### Finding Missing Director
# In[44]:
"""
- match the tconst ID between the match_df and title_crew dataset
- extract director ID's associated with a title from title_crew dataframe
"""
# match the title ID and map the director ID in match_df dataframe
match_df["director_id"] = match_df["tconst"].map(dict(title_crew.loc[title_crew["tconst"].isin(match_df["tconst"]),
["tconst", "directors"]].dropna().values))
"""
- the title_crew dataset itself has null values. matches with these records won't help us
- so we drop the records where director ID column still contains null
"""
# drop null values
match_df.dropna( inplace = True) #subset = ["director_id"],
# reset the index for match_df to linear >> 0,...,n-1
match_df.reset_index(drop = True, inplace = True)
# sample rows from resultant match_df
print(match_df.shape)
match_df.tail(10)
# In[45]:
"""Since we collected multiple directors as comma separated values
we will explode these ID's so that we get one director ID per row."""
# split the director ID by ","
match_df["director_id"] = match_df["director_id"].str.split(",")
# explode on director ID column to get dataframe with each in separate row
match_df = match_df.set_index(["combo", "tconst"]).explode("director_id").reset_index()
# sample rows from resultant match_df
match_df.tail(10)
# In[46]:
"""Match the director ID with nconst column in Name Basics dataset and extract the name of the director"""
# match the director ID and nconst columns
matched_names = name_basics.loc[name_basics["nconst"].isin(match_df["director_id"]), ["nconst", "primaryName"]].values
# map the names of directors
match_df["director_name"] = match_df["director_id"].map(dict(matched_names))
# sample rows from resultant match_df
match_df.tail()
# This is a reverse process to short our long dataframe into a wide one by grouping (multiple if present) director(s) based on a title.
# In[47]:
# group the directors by title ID
match_df = match_df.groupby("combo")["director_name"].apply(lambda s: ",".join(s)).reset_index()
# How many missing directors have we been able to identify?
match_df.shape
# In[48]:
# replace the missing_directors; this will be later appended to netflix non null dataset
missing_data.loc[missing_data["director"].isnull(),
"director"] = missing_data.loc[missing_data["director"].isnull(),
"combo"].map(dict(match_df.values)).fillna("Unknown")
# In[49]:
# have we been able to reduce the missing value count for "director" column
missing_data[["director", "cast", "country"]].isna().sum()
# In[50]:
# drop match_df dataframe to be re-used for next step
del match_df
# ### Finding Missing Cast
# In[51]:
# utilize the backup of dataframe with matched title_id's with title_basics dataset
match_df = all_matches_df.copy()
# In[52]:
# Value present in category column
sorted(title_principals["category"].unique())
# In[53]:
"""
- match the tconst ID between the match_df and title_principals dataset
- extract director ID's associated with a title from title_principals dataframe
- select only those principals where category is Actor or Actress
"""
match_df["cast_id"] = (
match_df["tconst"].map(dict(title_principals.loc[(title_principals["tconst"].isin(match_df["tconst"])) &
(title_principals["category"].isin(["actress", "actor"])),
["tconst", "nconst"]].reset_index(drop = True).\
groupby("tconst")["nconst"].unique().reset_index().values))
)
# drop null values
match_df.dropna(inplace = True)
# reset index of match_df dataframe
match_df.reset_index(drop = True, inplace = True)
# In[54]:
# explode match_df on cast_id column to get one principal ID per row
match_df = match_df.set_index(["combo", "tconst"]).explode("cast_id").reset_index()
# In[55]:
# match the cast ID with nconst column
matched_names = name_basics.loc[name_basics["nconst"].isin(match_df["cast_id"]), ["nconst", "primaryName"]].values
# map the Names of cast members to match_df dataframe
match_df["cast_name"] = match_df["cast_id"].map(dict(matched_names))
# resultant match_df shape
match_df.shape
# In[56]:
# remove any null values from match_df
match_df.dropna(inplace = True)
# In[57]:
# concatenate multiple cast members as comma separated values
match_df.groupby("combo")["cast_name"].apply(lambda s: ",".join(s)).reset_index()
# what's the tally?
missing_data.isnull().sum()
# In[58]:
# group the by title ID
match_df = match_df.groupby("combo")["cast_name"].apply(lambda s: ",".join(s)).reset_index()
# replace the missing_directors; this will be later appended to netflix non null dataset
missing_data.loc[missing_data["cast"].isnull(),
"cast"] = missing_data.loc[missing_data["cast"].isnull(),
"combo"].map(dict(match_df.values)).fillna("Unknown")
# In[59]:
# recycling the variables
del match_df
# ### Finding Missing Country - Approach 1
#
# Here we have a change of strategy. Since none of the IMDB datasets contain direct information about the origin country of a title's content, we will have to capture the information indirectly or perhaps employ multiple ways to capture the information.
# In[60]:
# Import BeautifulSoup module (HTML parsing)
from bs4 import BeautifulSoup
# import requests module (HTTP client)
import requests
# In[61]:
# only keep the matched titles whose country is still missing
match_df = all_matches_df[all_matches_df["combo"].isin(missing_data.loc[missing_data["country"].isnull(), "combo"])]
# reset the index of match_df dataframe
match_df = match_df.reset_index(drop = True)
# In[62]:
# Build the IMDB URL for each title ID; each title's page lives at
# "https://www.imdb.com/title/<tconst>/".
match_df["imdb_url"] = match_df["tconst"].apply(lambda str_id: "https://www.imdb.com/title/" + str_id + "/")
# sample rows
match_df.head()
# In[63]:
# function definition
def get_country_of_origin(imdb_url):
    """Scrape the country of origin of a title from its IMDB web-page.

    Parameters
    ----------
    imdb_url : str
        Dedicated web-page URL of a title on IMDB.com.

    Returns
    -------
    str or None
        First country name listed in the page's "title-details-origin"
        section, or None when the section cannot be found.

    Notes
    -----
    A requests.Session is used so repeated calls avoid session time-outs /
    repeated connection setup.
    """
    with requests.Session() as s:
        # fetch the page; FIX: a timeout keeps one stuck request from
        # hanging the whole scraping run
        title_page = s.get(imdb_url, timeout = 30)
        # parse the web-page contents with BeautifulSoup
        soup = BeautifulSoup(title_page.content, 'html.parser')
        # the country-of-origin entry is a <li> in the title-details section
        item = soup.find("li", {"data-testid" : "title-details-origin"},
                         class_ = "ipc-metadata-list__item")
        # if the search is successful, return the first linked country name
        if item:
            return item.a.string
        # explicit fall-through: section not present on this page
        return None
# In[64]:
# apply the "get_country_of_origin" function to the imdb_url column
match_df["country_name"] = match_df["imdb_url"].apply(get_country_of_origin)
# How many lookups succeeded?
# BUG FIX: the original used `!= None`, which pandas evaluates elementwise
# and is True for every row (including NaN), so it always reported the full
# row count. notna() counts only the rows where scraping returned a country.
match_df[match_df["country_name"].notna()].shape
# In[65]:
# write the scraped country names back into missing_data (rows without a
# scraped value stay null and are handled by Approach 2 below)
missing_data.loc[missing_data["country"].isnull(),
                 "country"] = missing_data.loc[missing_data["country"].isnull(),
                                               "combo"].map(dict(match_df[["combo",
                                                                           "country_name"]].values))
# In[66]:
# missing value count
missing_data.isnull().sum()
# ### Finding Missing Country - Approach 2
# In[67]:
pd.options.display.max_colwidth = 200
# pattern work to match Step2B in Mike's Presentation
# pattern: title containing "some words (language)" — these turn out to be
# Indian titles annotated with their language
missing_data.loc[(missing_data["country"].isnull()) &
                 (missing_data["title"].str.contains(r"\(\w+\)")), ["title"]]
# In[68]:
# select the indexes where the titles matched the pattern
select_idx = missing_data.loc[(missing_data["country"].isnull()) &
                              (missing_data["title"].str.contains(r"\(\w+\)")), ["title"]].index
# substitute India for country value for the rows at selected indices
missing_data.loc[select_idx, "country"] = "India"
# In[69]:
# select the indexes where the title or listed_in columns contain "korean"
select_idx = missing_data.loc[(missing_data["country"].isnull()) &
                              ((missing_data["title"].str.contains(r"[Kk]orean", regex = True)) |
                               (missing_data["listed_in"].str.contains(r"[Kk]orean", regex = True))), ["title"]].index
# substitute "South Korea" for country value for the rows at selected indices
# BUG FIX: select_idx holds index *labels*, so .loc is the correct accessor;
# the original .iloc[select_idx, 5] treated labels as positions and only
# worked because the index had been reset to 0..n-1 (and column position 5
# happened to be "country"). Also consistent with the "India" step above.
missing_data.loc[select_idx, "country"] = "South Korea"
# In[70]:
# select the indexes where the listed_in column contains the pattern "british"
select_idx = missing_data.loc[(missing_data["country"].isnull()) &
                              (missing_data["listed_in"].str.contains(r"[Bb]ritish", regex = True)), ["title"]].index
# substitute "United Kingdom" for country value for the rows at selected indices
missing_data.loc[select_idx, "country"] = "United Kingdom"
# In[71]:
# identify the still-missing titles whose listed_in column contains "spanish"
missing_data.loc[(missing_data["listed_in"].str.lower().str.contains("spanish")) &
                 (missing_data["country"].isnull())]["title"]
# In[72]:
# manually researched countries for the Spanish-language titles above
# NOTE(review): "Columbia" is likely a typo for "Colombia" — kept as-is to
# avoid changing the dataset's values silently; confirm and fix upstream.
spanish_titles = {"Camarón Revolution" : "Spain",
                  "Especial 20 años Fútbol de Primera" : "Argentina",
                  "Los 10 años de Peter Capusotto" : "Argentina",
                  "Surviving Escobar - Alias JJ" : "Columbia",
                  "The Least Expected Day: Inside the Movistar Team 2019" : "Spain"}
# In[73]:
# select the rows whose title appears in the manual mapping
selected_idx = missing_data[missing_data["title"].isin(list(spanish_titles.keys()))].index
# replace the None values in country column with dictionary values
missing_data.loc[selected_idx, "country"] = missing_data.loc[selected_idx, "title"].map(spanish_titles)
# In[74]:
# anything still missing after both approaches is labelled "Unknown"
missing_data["country"].fillna("Unknown", inplace = True)
# ### Combining the Found Values
# In[75]:
"""
- merge the non-null dataset with the substituted missing values dataset
- the resultant dataset has been named "netflix_clean_data"
"""
# Concatenate the two partitions back together, dropping the helper "combo"
# column (the last column of missing_data) and renumbering the index.
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat(..., ignore_index=True) is the supported equivalent of
# append(...).reset_index(drop=True).
netflix_clean_data = pd.concat([netflix_data, missing_data.iloc[:, :-1]],
                               ignore_index = True)
# ## Apriori Analysis - Learn Associations
# In[76]:
# import mlxtend modules to find association rules
from mlxtend.frequent_patterns import association_rules, apriori
from mlxtend.preprocessing import TransactionEncoder
# In[77]:
# subset with only International TV Shows
inter_tv = netflix_clean_data.loc[(netflix_clean_data["type"] == "TV Show") &
(netflix_clean_data["listed_in"].str.contains(r"International", regex = True))]
inter_tv.shape
# In[78]:
# subset with only International Movies
inter_mv = netflix_clean_data.loc[(netflix_clean_data["type"] == "Movie") &
(netflix_clean_data["listed_in"].str.contains(r"International", regex = True))]
inter_mv.shape
# In[79]:
# subset with only non-International TV Shows
domestic_tv = netflix_clean_data.loc[(netflix_clean_data["type"] == "TV Show") &
~(netflix_clean_data["listed_in"].str.contains(r"International", regex = True))]
domestic_tv.shape
# In[80]:
# subset with only non-International Movies
domestic_mv = netflix_clean_data.loc[(netflix_clean_data["type"] == "Movie") &
~(netflix_clean_data["listed_in"].str.contains(r"International", regex = True))]
domestic_mv.shape
# In[81]:
# function definition
def calculate_association(df, column_name, min_support, max_length, index_col = "show_id", split_char = ", "):
    """Compute apriori support values for attributes of a catalogue subset.

    Parameters
    ----------
    df : subset of the cleaned Netflix dataframe
    column_name : column holding the delimiter-separated attributes
        (e.g. genres, cast members, directors)
    min_support : minimum support value passed through to apriori
    max_length : maximum itemset length passed through to apriori
        (use 2 when paired associations are required)
    index_col : column used as the row index while encoding (default "show_id")
    split_char : delimiter between attributes (default ", ")

    Returns
    -------
    DataFrame with "support" and "itemsets" columns, sorted by support
    in descending order.

    Notes
    -----
    mlxtend's TransactionEncoder performs the one-hot transformation.
    """
    # split each row's attribute string into a list of attributes
    attribute_lists = list(df[column_name].str.split(split_char))
    # one-hot encode the attribute lists
    encoder = TransactionEncoder()
    encoder.fit(attribute_lists)
    onehot = encoder.transform(attribute_lists)
    # frame the encoded matrix, indexed by the unique show identifier
    onehot_df = pd.DataFrame(onehot, columns = encoder.columns_, index = df[index_col])
    # run apriori on rows with at least one attribute and rank by support
    return apriori(df = onehot_df[onehot_df.sum(axis = 1) >= 1],
                   use_colnames = True,
                   min_support = min_support,
                   max_len = max_length).sort_values(by = "support", ascending = False)
# In[82]:
# calculate support values for international TV content
apriori_results = calculate_association(inter_tv, "listed_in", 0.05, 2)
apriori_results
# In[83]:
# association rules for international TV content, strongest first
rules_df = (association_rules(apriori_results, metric="lift")
            .sort_values(by = ["antecedent support", "lift", "support"], ascending = False)
            .head(10))
rules_df
# In[84]:
# result dataframe: unfreeze the single-item consequent frozensets into plain genre names
final_results = pd.DataFrame({"Genres" : rules_df["consequents"].apply(lambda fs: next(iter(fs))),
                              "support" : rules_df["support"]})
final_results
# In[85]:
# calculate support values for domestic TV content
apriori_results = calculate_association(domestic_tv, "listed_in", 0.05, 1).head(10)
apriori_results.rename({"itemsets" : "Genres"}, axis = 1, inplace = True)
# to unfreeze the frozensets in apriori_results dataframe
apriori_results["Genres"] = apriori_results["Genres"].apply(lambda fs: next(iter(fs)))
# In[86]:
# merge the final results of both the domestic and international TV content
# (outer merge keeps genres that appear on only one side)
final_results = final_results.merge(apriori_results[["Genres", "support"]],
                                    how = "outer",
                                    on = "Genres")
final_results
# In[87]:
# create figure with shared y-axis
fig, (ax_intl, ax_dom) = plt.subplots(ncols = 2, sharey = True)
# Create a lolipop chart because why not!
genres = final_results["Genres"]
# sub-plot 1 stems: drawn leftwards (negative x) so the two panels mirror each other
ax_intl.hlines(genres,
               xmin = 0,
               xmax = -1 * final_results["support_x"],
               lw = 3)
# sub-plot 1 dots
ax_intl.scatter(-1 * final_results["support_x"],
                genres,
                marker = "o",
                s = 100)
# shift the y-axis to right for sub-plot 1 to make the yticklabels appear in center
ax_intl.yaxis.tick_right()
# enable yticks
ax_intl.set_yticks(genres.index)
# enable yticklabels
ax_intl.set_yticklabels(list(genres.values))
# hide ytickmarks
ax_intl.tick_params(axis = 'y', which = 'both', length = 0)
# title for sub-plot 1
ax_intl.set_title("International")
# hide the x-axis
ax_intl.get_xaxis().set_visible(False)
# sub-plot 2 stems: drawn rightwards
ax_dom.hlines(genres,
              xmin = 0,
              xmax = final_results["support_y"],
              lw = 3)
# sub-plot 2 dots
ax_dom.scatter(final_results["support_y"],
               genres,
               marker = "o",
               s = 100)
# title for sub-plot 2
ax_dom.set_title("Domestic")
# hide tick marks for y-axis
ax_dom.tick_params(axis = 'y', which = 'both', length = 0)
# hide the x-axis for sub-plot 2
ax_dom.get_xaxis().set_visible(False)
# despine both sub-plots
for axis in (ax_intl, ax_dom):
    for side in axis.spines:
        axis.spines[side].set_visible(False)
# extras
plt.subplots_adjust(wspace = 0.7)
plt.suptitle("Breakdown of Netflix Content by 10 Most Common Genres (for TV Shows)", y = 1.02)
# show the plot
plt.show()
# In[88]:
# calculate support values for international Movie content
apriori_results = calculate_association(inter_mv, "listed_in", 0.05, 2)
apriori_results
# In[89]:
# association rules for international Movie content, strongest first
rules_df = (association_rules(apriori_results, metric="lift")
            .sort_values(by = ["antecedent support", "lift", "support"], ascending = False)
            .head(9))
rules_df
# In[90]:
# result dataframe: unfreeze the single-item consequent frozensets into plain genre names
final_results = pd.DataFrame({"Genres" : rules_df["consequents"].apply(lambda fs: next(iter(fs))),
                              "support" : rules_df["support"]})
final_results
# In[91]:
# calculate support values for domestic Movie content
apriori_results = calculate_association(domestic_mv, "listed_in", 0.05, 1).head(10)
apriori_results.rename({"itemsets" : "Genres"}, axis = 1, inplace = True)
# to unfreeze the frozensets in apriori_results dataframe
apriori_results["Genres"] = apriori_results["Genres"].apply(lambda fs: next(iter(fs)))
# In[92]:
# merge the final results of both the domestic and international Movie content
# (outer merge keeps genres that appear on only one side)
final_results = final_results.merge(apriori_results[["Genres", "support"]],
                                    how = "outer",
                                    on = "Genres")
final_results
# In[93]:
# create figure with shared y-axis
fig, (ax_intl, ax_dom) = plt.subplots(ncols = 2, sharey = True)
# Create a lolipop chart because why not!
genres = final_results["Genres"]
# sub-plot 1 stems: drawn leftwards (negative x) so the two panels mirror each other
ax_intl.hlines(genres,
               xmin = 0,
               xmax = -1 * final_results["support_x"],
               lw = 3)
# sub-plot 1 dots
ax_intl.scatter(-1 * final_results["support_x"],
                genres,
                marker = "o",
                s = 100)
# shift the y-axis to right for sub-plot 1 to make the yticklabels appear in center
ax_intl.yaxis.tick_right()
# enable yticks
ax_intl.set_yticks(genres.index)
# enable yticklabels
ax_intl.set_yticklabels(list(genres.values))
# hide ytickmarks
ax_intl.tick_params(axis = 'y', which = 'both', length = 0)
# title for sub-plot 1
ax_intl.set_title("International")
# hide the x-axis
ax_intl.get_xaxis().set_visible(False)
# sub-plot 2 stems: drawn rightwards
ax_dom.hlines(genres,
              xmin = 0,
              xmax = final_results["support_y"],
              lw = 3)
# sub-plot 2 dots
ax_dom.scatter(final_results["support_y"],
               genres,
               marker = "o",
               s = 100)
# title for sub-plot 2
ax_dom.set_title("Domestic")
# hide tick marks for y-axis
ax_dom.tick_params(axis = 'y', which = 'both', length = 0)
# hide the x-axis for sub-plot 2
ax_dom.get_xaxis().set_visible(False)
# despine both sub-plots
for axis in (ax_intl, ax_dom):
    for side in axis.spines:
        axis.spines[side].set_visible(False)
# extras
plt.subplots_adjust(wspace = 0.7)
plt.suptitle("Breakdown of Netflix Content by 10 Most Common Genres (for Movies)", y = 1.02)
# show the plot
plt.show()
# ## Compare English Language vs. Non-English Language Content
# In[94]:
# read the source wikipedia page
response = requests.get("https://en.wikipedia.org/wiki/List_of_countries_and_territories_where_English_is_an_official_language")
# parse the response content as html
soup = BeautifulSoup(response.content, "html.parser")
# In[95]:
# read the first table with primarily English Speaking Countries
df = pd.read_html(str(soup.find_all("table")[0]))[0]
# transform the dataframe to a list of country names
# NOTE: raw strings avoid the invalid "\w"/"\[" escape-sequence warnings (W605)
english_countries = list(df["Country"].str.extract(r"(\w+\s*\w+)")[0].values)
# print the extracted countries
# print(english_countries)
# read the second table with Multi-Language Speaking Countries
df = pd.read_html(str(soup.find_all("table")[1]))[0]
# Filter the countries where English is a primary language and add to the list from the
# above step; the replace strips wikipedia footnote markers such as "[1]" from names
english_countries = english_countries + list(df.loc[df["Primary language?"].str.contains("Yes"),
                                                    "Country"].str.replace(r"\[\d+\]", "", regex = True).values)
# print the final list
# english_countries
# In[96]:
# make a copy of cleaned dataframe to avoid warnings
netflix_clean_data = netflix_clean_data.copy()
# Based on the english speaking countries, segregate the titles into English/Non-English categories
def _classify_language(country_field):
    # only the first listed country decides the classification
    primary_country = country_field.split(",")[0]
    if primary_country in english_countries:
        return "E"
    return "U" if primary_country == "Unknown" else "NE"
netflix_clean_data["prim_lang"] = netflix_clean_data["country"].apply(_classify_language)
# In[97]:
# plot the distribution
fig, ax = plt.subplots(figsize = (15, 6))
# counts per language category; labels are derived from the counts' own index so the
# wedges stay correctly labelled even if the category frequency order ever changes
lang_counts = netflix_clean_data["prim_lang"].value_counts()
label_map = {"E": "English", "NE": "Non-English", "U": "Unknown"}
# donut chart for breakdown of content by primary language
ax.pie(x = lang_counts,
       labels = [label_map[code] for code in lang_counts.index],
       wedgeprops={'width':0.35},
       autopct = "%.0f%%",
       pctdistance = 0.85,
       startangle = 180)
# extras
ax.text(-0.3, 0, "Language!")
ax.set_title("Breakdown of Netflix Content: English vs. Non-English Content")
# show the plot
plt.show()
# In[98]:
# rows where the cast is actually known
known_cast = netflix_clean_data["cast"] != "Unknown"
# calculate the most credited cast members for English Content
apriori_results_eng = calculate_association(netflix_clean_data[known_cast &
                                                               (netflix_clean_data["prim_lang"] == "E")],
                                            "cast", 0.001, 1)
# top 10 most credited cast members for English Content
apriori_results_eng[:10]
# In[99]:
# calculate the most credited cast members for Non-English Content
apriori_results_neng = calculate_association(netflix_clean_data[known_cast &
                                                                (netflix_clean_data["prim_lang"] == "NE")],
                                             "cast", 0.001, 1)
# top 10 most credited cast members for Non-English Content
apriori_results_neng[:10]
# In[100]:
# plot the Top 5 Most credited cast members for English & Non-English Content
fig, ax = plt.subplots(figsize = (12, 5))
# hoist the repeatedly-recomputed top-5 slices and unfrozen names out of the
# plotting calls (the original recomputed the same apply() six times)
top5_eng = apriori_results_eng[:5]
top5_neng = apriori_results_neng[:5]
names_eng = top5_eng["itemsets"].apply(lambda s: list(s)[0])
names_neng = top5_neng["itemsets"].apply(lambda s: list(s)[0])
# stems (English)
ax.hlines(y = names_eng,
          xmin = 0,
          xmax = -1 * top5_eng["support"],
          lw = 3,
          color = "#FF800E",
          label = "English")
# dots (English)
ax.scatter(y = names_eng,
           x = -1 * top5_eng["support"],
           color = "#FF800E",
           s = 100)
# stems (Non-English)
ax.hlines(names_neng,
          xmin = 0,
          xmax = -1 * top5_neng["support"],
          lw = 3,
          label = "Non-English")
# dots (Non-English)
ax.scatter(y = names_neng,
           x = -1 * top5_neng["support"],
           s = 100)
# annotations: write each name next to its stem
for i in range(5):
    ax.annotate(text = names_eng.values[i],
                xy = (0.0001, i - 0.2))
for i in range(5):
    ax.annotate(text = names_neng.values[i],
                xy = (0.0001, (i + 5) - 0.2))
# extras
ax.set_title("Top 5 Headlining Cast Member Based on No. of Netflix Titles Credited", fontsize = "large")
ax.axis("off")
ax.legend(bbox_to_anchor = (0.3, 0.4))
# display the plot
plt.show()
# In[101]:
# rows where the director is actually known
known_director = netflix_clean_data["director"] != "Unknown"
# identify the most credited directors for Non-English Content
apriori_results_neng = calculate_association(netflix_clean_data[known_director &
                                                                (netflix_clean_data["prim_lang"] == "NE")],
                                             "director", 0.001, 1)
# top 10 most credited directors for Non-English Content
apriori_results_neng[:10]
# In[102]:
# calculate the most credited directors for English Content
apriori_results_eng = calculate_association(netflix_clean_data[known_director &
                                                               (netflix_clean_data["prim_lang"] == "E")],
                                            "director", 0.001, 1)
# top 10 most credited directors for English Content
apriori_results_eng[:10]
# In[103]:
# plot the Top 5 Most credited Directors for English & Non-English Content
fig, ax = plt.subplots(figsize = (12, 5))
# hoist the repeatedly-recomputed top-5 slices and unfrozen names out of the
# plotting calls (the original recomputed the same apply() six times)
top5_eng = apriori_results_eng[:5]
top5_neng = apriori_results_neng[:5]
names_eng = top5_eng["itemsets"].apply(lambda s: list(s)[0])
names_neng = top5_neng["itemsets"].apply(lambda s: list(s)[0])
# stems (English)
ax.hlines(y = names_eng,
          xmin = 0,
          xmax = -1 * top5_eng["support"],
          lw = 3,
          color = "#FF800E",
          label = "English")
# dots (English)
ax.scatter(y = names_eng,
           x = -1 * top5_eng["support"],
           color = "#FF800E",
           s = 100)
# stems (Non-English)
ax.hlines(names_neng,
          xmin = 0,
          xmax = -1 * top5_neng["support"],
          lw = 3,
          label = "Non-English")
# dots (Non-English)
ax.scatter(y = names_neng,
           x = -1 * top5_neng["support"],
           s = 100)
# annotations: write each name next to its stem
for i in range(5):
    ax.annotate(text = names_eng.values[i],
                xy = (0.0001, i - 0.2))
for i in range(5):
    ax.annotate(text = names_neng.values[i],
                xy = (0.0001, (i + 5) - 0.2))
# extras (typo "Direcors" fixed to "Directors" in the displayed title)
ax.set_title("Top 5 Headlining Directors Based on No. of Netflix Titles Credited", fontsize = "large")
ax.axis("off")
ax.legend(bbox_to_anchor = (0.3, 0.4))
# show the plot
plt.show()
# In[104]:
# import wordcloud, stopwords and Image modules
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
# In[105]:
# materialize the built-in stopword list once; it is reused by both word clouds below
stopwords = {word for word in STOPWORDS}
# In[106]:
# to collect all the words from description column of Non-English Content
#
# - select the descriptions from the rows classified as Non-English Content
# - remove any non-alpha-numeric characters from the description of a title
# - lower-case every word and concatenate them all, each followed by a space
#
# NOTE: built with a single "".join(...) instead of repeated "+=" string
# concatenation, which is quadratic in the total text size; the resulting
# string is identical to the original loop's output
title_words_ne = "".join(
    each_word.lower() + " "
    for row in netflix_clean_data[netflix_clean_data["prim_lang"] == "NE"]["description"].str.replace(pat = r"[^a-zA-Z0-9 ]",
                                                                                                      repl = "",
                                                                                                      regex = True)
    for each_word in row.split()
)
# In[107]:
# to collect all the words from description column of English Content
#
# - select the descriptions from the rows classified as English Content
#   (the original explanatory text wrongly said "Non-English" here)
# - remove any non-alpha-numeric characters from the description of a title
# - lower-case every word and concatenate them all, each followed by a space
#
# NOTE: built with a single "".join(...) instead of repeated "+=" string
# concatenation, which is quadratic in the total text size; the resulting
# string is identical to the original loop's output
title_words_e = "".join(
    each_word.lower() + " "
    for row in netflix_clean_data[netflix_clean_data["prim_lang"] == "E"]["description"].str.replace(pat = r"[^a-zA-Z0-9 ]",
                                                                                                     repl = "",
                                                                                                     regex = True)
    for each_word in row.split()
)
# In[108]:
# plot the wordclouds
fig, (ax_ne, ax_e) = plt.subplots(ncols = 2, figsize = (10, 5))
# to obtain the wordcloud in the shape of N
mask = np.array(Image.open("Netflix_icon2.png"))
# appearance settings shared by both clouds (same seed so layouts are reproducible)
cloud_kwargs = dict(mask = mask,
                    stopwords = stopwords,
                    background_color = "white",
                    max_words = 100,
                    colormap = "tab10_r",
                    random_state = 1)
# generate and draw the word cloud for Non-English Content
ax_ne.imshow(WordCloud(**cloud_kwargs).generate(title_words_ne))
# extras
ax_ne.axis("off")
ax_ne.set_title("For Non-English Content")
# generate and draw the word cloud for English Content
ax_e.imshow(WordCloud(**cloud_kwargs).generate(title_words_e))
# extras
ax_e.axis("off")
ax_e.set_title("For English Content")
plt.tight_layout(pad = 0)
plt.suptitle("#Spoiler! What's The Plot About?", y = 1.1, fontsize = 20)
# show the plot
plt.show()