#!/usr/bin/env python
# coding: utf-8

# # Analyzing Netflix Dataset With Python
# # (Part-1)

# ### Import libraries

# In[1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.style as style
import os
import re

get_ipython().run_line_magic('matplotlib', 'inline')

# ### Set default options

# In[2]:

pd.options.display.max_colwidth = 100
plt.rcParams["figure.figsize"] = [15, 5]
plt.rcParams["axes.titlesize"] = 20
plt.rcParams["axes.labelsize"] = 12
plt.rcParams["font.size"] = 14
style.use('tableau-colorblind10')

# ### Read and explore Netflix dataset

# In[3]:

# directory that contains the Netflix dataset
net_dir = os.getcwd() + "\\Netflix_Dataset\\"

# zip file containing the Netflix dataset
net_zipfile = "Netflix_Titles.zip"

# read the file and save it in a raw dataframe
netflix_raw_data = pd.read_csv(net_dir + net_zipfile)

# #### dataframe.info() and dataframe.describe()

# In[4]:

# info and describe the raw dataframe
netflix_raw_data.info()
pd.options.display.float_format = "{:.0f}".format
netflix_raw_data.describe()

# In[5]:

# reset float format
pd.options.display.float_format = "{:.3f}".format

# #### Duplicate rows?

# In[6]:

# check for any duplicate records
print("No. of duplicated records (if any):", netflix_raw_data.duplicated().sum())

# #### Content type

# In[7]:

# content category
netflix_raw_data["type"].value_counts()

# ### Sample rows

# In[8]:

# set some display options
pd.options.display.max_colwidth = 50
pd.options.display.max_columns = 20

# sample rows from the Netflix dataset
netflix_raw_data.iloc[7748:7754, :]

# ### Table for columns with Missing Values

# In[9]:

# this block of code only displays the columns which have missing data and the % of rows with missing values
null_column_list = []

# total no. of rows in the dataset
total_rows = netflix_raw_data.shape[0]

for each_col in netflix_raw_data.columns:
    if netflix_raw_data[each_col].isnull().sum() > 0:
        null_sum = netflix_raw_data[each_col].isnull().sum()
        null_perc = "{:.4f}".format(null_sum / total_rows * 100)
        null_column_list.append([each_col, null_sum, null_perc])

# display the missing row count and % as a dataframe
pd.DataFrame(null_column_list,
             columns = ["Column Name", "No. of rows with missing values", "Missing value as % (Netflix Dataset)"])
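# As an aside (not part of the original flow), the same missing-value summary can be obtained with
# vectorized pandas calls; a minimal, equivalent sketch:

# In[ ]:

# percentage of missing values per column, restricted to columns that actually have nulls
missing_perc = netflix_raw_data.isnull().mean().mul(100)
missing_perc[missing_perc > 0].round(4).rename("Missing value as %").to_frame()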
# In[10]:

# delete the temporary variables
del total_rows, null_column_list

# ## Explore Netflix Dataset
#
# ### type Column

# In[11]:

# create figure
fig, ax = plt.subplots(figsize = (12, 3))

# plot content type as a percentage
ax.barh(y = netflix_raw_data["type"].value_counts(normalize = True).index,
        width = netflix_raw_data["type"].value_counts(normalize = True).mul(100).values)

# set title for the plot
ax.set_title("Types of content available on Netflix")

# hide the x-axis ticks and labels
ax.set_xticklabels(labels = "")
ax.tick_params(bottom = False)

# annotate each bar
for p in ax.patches:
    ax.annotate(text = "{:.2f}%".format(p.get_width()),
                xy = [p.get_width() / 2, p.get_y() + 0.35],
                color = "white", fontsize = "large")

# despine the plot
[ax.spines[spine].set_visible(False) for spine in ax.spines if spine != "left"]

# show the plot
plt.show()

# ### rating Column

# In[12]:

# identify the titles with missing values
netflix_raw_data.loc[netflix_raw_data["rating"].isnull(),
                     ["title", "director", "release_year", "rating"]]

# #### Mapper for titles with missing values

# In[13]:

# dictionary to map title with ratings (copied from Mike's work)
ratings_map = {idx: val for idx, val in zip(netflix_raw_data.loc[netflix_raw_data["rating"].isnull(), "title"].values,
                                            ["TV-PG", "TV-14", "TV-Y7", "TV-MA", "TV-MA", "TV-MA", "PG-13"])}

# display ratings dictionary
ratings_map

# In[14]:

# replace the missing ratings
netflix_raw_data["rating"] = netflix_raw_data["title"].map(ratings_map).fillna(netflix_raw_data["rating"])

# In[15]:

# import pywaffle
from pywaffle import Waffle

# In[16]:

# temp variable for the value counts of ratings for Netflix Movies
data1 = dict(netflix_raw_data[netflix_raw_data["type"] == "Movie"]["rating"].value_counts())

# temp variable for the value counts of ratings for Netflix TV Shows
data2 = dict(netflix_raw_data[netflix_raw_data["type"] != "Movie"]["rating"].value_counts())

# In[17]:

# make a pywaffle chart with 2 subplots, one for each type of Netflix content
fig = plt.figure(
    FigureClass = Waffle,
    plots = {
        '211': {'values': data1,
                'labels': [f"{k} ({int(v / sum(data1.values()) * 100)}%)" for k, v in data1.items()],
                'legend': {'loc': 'lower left', 'bbox_to_anchor': (0, -0.6), 'fontsize': 12, 'ncol': 5},
                'title': {"label": "Ratings Wise Distribution of Netflix Movies"}},
        '212': {'values': data2,
                # percentages here are relative to the TV Show totals (data2)
                'labels': [f"{k} ({int(v / sum(data2.values()) * 100)}%)" for k, v in data2.items()],
                'legend': {'loc': 'lower left', 'bbox_to_anchor': (0, -0.45), 'fontsize': 12, 'ncol': 5},
                'title': {"label": "Ratings Wise Distribution of Netflix Shows"}}
    },
    rows = 3, columns = 15, icons = "user", cmap_name = "tab20", figsize = (14, 6.5)
)
plt.tight_layout(h_pad = 0)
plt.show()

# ### date_added Column & release Column

# In[18]:

# convert the date_added column to a datetime format
netflix_raw_data["date_added"] = pd.to_datetime(netflix_raw_data["date_added"])

# select only those records where the date_added column has values
netflix_raw_data = netflix_raw_data[netflix_raw_data["date_added"].notna()]

# shape of the resultant netflix_raw_data
netflix_raw_data.shape

# #### Additional Analysis
#
# This is a small analysis of:
# - the distribution of titles based on the decade they were released
# - the distribution of titles hosted by Netflix since its inception
#
# For ease of plotting:
# - titles released before the year 1980 have been grouped under `Before 1980`. Why this year? Because I was born in this decade!
# - since Netflix had established itself in 190 countries by 2016, all the titles added (on the Netflix platform) before 2016 have been grouped under `Added before 2016`.

# In[19]:

# calculate the decade for each year of release
decade_df = (netflix_raw_data["release_year"] // 10 * 10).value_counts().reset_index()

# rename the dataframe columns
decade_df.rename(columns = {"index": "Decade", "release_year": "No. of Titles"}, inplace = True)

# group the titles released before the year 1980
decade_df["Decade"] = decade_df["Decade"].apply(lambda d: d if d > 1970 else "Before 1980")

# label each decade as YYYY's
decade_df["Decade"] = decade_df["Decade"].astype(str).apply(lambda s: s + "'s")

# In[20]:

# group the no. of titles by the decade in which they were released
data1 = decade_df.groupby("Decade")["No. of Titles"].sum()

# group the no. of titles by the year in which they were added to Netflix
data2 = netflix_raw_data["date_added"].dt.year.apply(lambda d: "In " + str(d) if d > 2015 else "Before 2016").\
            value_counts().sort_index()

# In[21]:

# create figure object
fig, ax = plt.subplots(ncols = 2, figsize = (13.5, 6))

# plot the decades in which the titles were released
ax[0].pie(x = data1.values, labels = data1.index, labeldistance = 1.03, autopct = "%.1f%%",
          explode = [0.1, 0.05, 0, 0, 0, 0.15], pctdistance = 0.85, startangle = 0,
          wedgeprops = {"width": 0.4})

# annotation in the center
ax[0].annotate(text = "Decade of Release", xy = (-0.5, 0), fontsize = 15)

# plot the years in which the titles were added
ax[1].pie(x = data2.values, labels = data2.index, labeldistance = 1.03, autopct = "%.1f%%",
          explode = [0.1, 0.05, 0, 0, 0, 0, 0.15], pctdistance = 0.85, startangle = 0,
          rotatelabels = False, wedgeprops = {"width": 0.4})

# annotation in the center
ax[1].annotate(text = "Year of Addition", xy = (-0.3, 0), fontsize = 15)

# extras
# plt.subplots_adjust(wspace = 0.2, hspace = 0)
plt.tight_layout(pad = 1)
plt.suptitle(t = "Distribution of Titles on Netflix by the", fontsize = 20)

# show plot
plt.show()

# In[22]:

# delete non-essential variables
del decade_df, data1, data2

# ### duration Column
#
# This column specifies the duration of a title as a number of seasons or as a runtime in minutes. If we analyze the values, the TV Shows are specified in seasons and the Movies in runtime minutes; no series has a runtime in minutes.
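# Before splitting on the text patterns below, note that the numeric part of `duration` is easy to pull
# out on its own; a minimal sketch (an aside, not used in the rest of the notebook):

# In[ ]:

# extract the leading number from strings like "2 Seasons" or "90 min" into a numeric series
duration_num = netflix_raw_data["duration"].str.extract(r"(\d+)", expand = False).astype(float)
duration_num.describe()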
# In[23]:

# title types with duration in seasons
print(netflix_raw_data.loc[netflix_raw_data["duration"].str.contains("Season"), "type"].value_counts(), "\n")

# title types with duration in minutes
print(netflix_raw_data.loc[netflix_raw_data["duration"].str.contains("min"), "type"].value_counts())

# In[24]:

# duration for title type TV Show
netflix_raw_data.loc[netflix_raw_data["duration"].str.contains("Season"), "duration"].value_counts()

# ### Visualize distribution of seasons for titles on Netflix

# In[25]:

# temporary variable
data = netflix_raw_data.loc[netflix_raw_data["duration"].str.contains("Season")]["duration"].value_counts()

# group the longer shows so they fit in a pie-chart
for each in data.index:
    if int(each.split()[0]) < 6:
        pass
    elif int(each.split()[0]) <= 8:
        data.rename({each: "As long as GOT"}, inplace = True)
    elif int(each.split()[0]) <= 10:
        data.rename({each: "As long as F.R.I.E.N.D.S"}, inplace = True)
    else:
        data.rename({each: "Too long"}, inplace = True)

# re-group and total
data = data.groupby(data.index).sum()

# In[26]:

# create a figure object
fig, ax = plt.subplots(figsize = (12, 8))

# draw a pie-chart
ax.pie(x = data, labels = data.index, autopct = "%.1f%%", pctdistance = 0.89,
       explode = [0, 0, 0, 0, 0.1, 0.2, 0.3, 0.4], labeldistance = 1.03, startangle = 30,
       radius = 1.2, wedgeprops = {"width": 0.5})

# set title for the plot
ax.set_title("Netflix and Chill...", pad = 20)
ax.annotate(text = "But for How long?", xy = (-0.5, 0), fontsize = "18")

## Custom legend for the grouped items

# handler map
handler_map = {"As long as F.R.I.E.N.D.S": "Up to 10 Seasons",
               "As long as GOT (Game of Thrones)": "Up to 8 Seasons",
               "Too long": "More than 10 Seasons"}

# get the default handles and labels
handles, labels = ax.get_legend_handles_labels()

# place only the last three handles, which hold the grouped seasons
ax.legend(handles = handles[-3:],
          labels = (" : ".join([k, v]) for k, v in handler_map.items()),
          bbox_to_anchor = (1, 1.05))

# show the plot
plt.show()

# In[27]:

# delete temporary/intermediate variables
del data

# In[28]:

# create figure object
fig, ax = plt.subplots()

# distribution of runtime in minutes
ax.hist(x = netflix_raw_data.loc[netflix_raw_data["duration"].str.contains("min"), "duration"].\
            str.strip("min").astype(int), bins = 50)

# draw a vertical line at the mean runtime for movies
ax.axvline(x = netflix_raw_data.loc[netflix_raw_data["duration"].str.contains("min"), "duration"].\
               str.strip("min").astype(int).mean(),
           lw = 3, dashes = [5, 2, 1, 2], color = "orange", label = "Mean")

# annotation
ax.annotate(text = "{:.2f}".format(netflix_raw_data.loc[netflix_raw_data["duration"].str.contains("min"), "duration"].\
                                   str.strip("min").astype(int).mean()),
            xy = [100, 700])

# extras
ax.legend()
ax.set_title("Distribution of Runtime Minutes for Movies on Netflix")
ax.set_ylabel("Count", fontsize = "large")
ax.set_xlabel("Minutes", fontsize = "large")

# show plot
plt.show()

# ## Tackle the Missing Values
#
# The big bad boy: missing values in 3 columns - director, cast and country of origin of a title.
# The idea is to try the best approach possible and fill in as many missing values as we can.
#
# Why IMDB? [IMDB.com](https://www.imdb.com/) provides separate datasets with different information. The common link between these datasets is a set of alpha-numeric columns like "tconst", "nconst" etc. that provide unique identifiers for a title/cast/crew member.
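# As a quick illustration of how those identifier columns tie the IMDB files together, here is a hedged
# sketch on tiny made-up rows (hypothetical values only, mirroring the IMDB file structure); the notebook
# itself uses .map() lookups in the cells that follow:

# In[ ]:

# toy rows with the same columns as the real IMDB files
toy_title_basics = pd.DataFrame({"tconst": ["tt0000001"], "primaryTitle": ["Some Title"], "startYear": [2019]})
toy_title_crew = pd.DataFrame({"tconst": ["tt0000001"], "directors": ["nm0000001,nm0000002"]})
toy_name_basics = pd.DataFrame({"nconst": ["nm0000001", "nm0000002"], "primaryName": ["Director A", "Director B"]})

# tconst links a title to its crew; nconst links each crew member to a name
(toy_title_basics
     .merge(toy_title_crew, on = "tconst", how = "left")
     .assign(directors = lambda d: d["directors"].str.split(","))
     .explode("directors")
     .merge(toy_name_basics, left_on = "directors", right_on = "nconst", how = "left"))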
# ### Split the Netflix dataset into 2 separate dataframes
#
# - netflix_data : dataframe with non-null values for all the columns
# - missing_data : dataframe with rows where the value for any of the 3 columns is missing

# In[29]:

"""split the dataframe into missing and non-missing values"""

# non-null data
netflix_data = netflix_raw_data[(netflix_raw_data["director"].notna()) &
                                (netflix_raw_data["cast"].notna()) &
                                (netflix_raw_data["country"].notna())].reset_index(drop = True)

# rows where data for any of the 3 columns - director, cast and country - is missing
missing_data = netflix_raw_data[netflix_raw_data["director"].isnull() |
                                netflix_raw_data["cast"].isnull() |
                                netflix_raw_data["country"].isnull()].reset_index(drop = True)

# In[30]:

# shape of the resultant datasets
print("Dataframe Name: (Rows, Columns)", "-".rjust(30, "-"),
      "Data with Nulls: " + str(missing_data.shape),
      "Non-null data: " + str(netflix_data.shape), sep = "\n")

# In[31]:

missing_data[["director", "cast", "country"]].isnull().sum()

# ### Concatenate the title and release_year columns in the missing_data dataframe
#
# In order to compare the Netflix dataset with the datasets provided by IMDB,
#
# - special or non-alphanumeric characters have been removed from the title names
# - lower case has been applied to all the title names
# - an underscore has been added between the title and year values during concatenation

# In[32]:

# combination of the title and release_year columns
missing_data["combo"] = missing_data["title"].str.replace(r"[^\w\s]+", "", regex = True).str.lower().\
                            str.cat(missing_data["release_year"].astype(str), sep = "_")

# sample rows with selected columns only
missing_data[["title", "release_year", "combo", "director", "cast", "country"]].head(10)

# ### Read IMDB datasets

# In[33]:

# set up directory and files
imdb_dir = os.getcwd() + "\\IMDB_Dataset\\"
imdb_file_names = ["title_akas.tsv", "title_basics.tsv", "title_crew.tsv",
                   "title_principals.tsv", "name_basics.tsv"]

# In[34]:

# import the IMDB_work.py module aliased as "iw" for now
import IMDB_work as iw

# In[35]:

print("Docstring for the module and function", "\n")
print(iw.__doc__)
print(iw.read_imdb_files.__doc__)

# In[36]:

""" Read the title_basics.tsv file """
title_basics = iw.read_imdb_files(file_name = imdb_file_names[1], file_directory = imdb_dir)

# In[37]:

""" Read the name_basics.tsv file """
name_basics = iw.read_imdb_files(file_name = imdb_file_names[-1], file_directory = imdb_dir)

# In[38]:

""" Read the title_crew.tsv file """
title_crew = iw.read_imdb_files(file_name = imdb_file_names[2], file_directory = imdb_dir)

# In[39]:

"""Read the title_principals.tsv file"""
title_principals = iw.read_imdb_files(file_name = imdb_file_names[3], file_directory = imdb_dir)

# ### Finding Common Titles

# In[40]:

# combination of the primaryTitle and startYear columns >> title_basics dataset
title_basics["combo"] = title_basics["primaryTitle"].str.replace(r"[^\w\s]+", "", regex = True).str.lower().\
                            str.cat(title_basics["startYear"].astype(str), sep = "_")

# In[41]:

# sample rows from the title_basics dataframe
title_basics.head()

# In[42]:

"""
- match the combo column between the title_basics and missing_data dataframes
- collect and store the matched records in a temporary dataframe - all_matches_df
"""
all_matches_df = title_basics.loc[title_basics["combo"].isin(missing_data["combo"]),
                                  ["tconst", "combo"]].reset_index(drop = True).\
                     groupby("combo")["tconst"].unique().reset_index()

# sample rows from the resultant all_matches_df dataframe
print("Shape:", all_matches_df.shape)
all_matches_df.head()
# In[43]:

# filter out unique title names, i.e. titles where no other title of the same name was released in any other year
all_matches_df = all_matches_df[all_matches_df["tconst"].str.len() == 1].\
                     set_index("combo").explode("tconst").reset_index()

# shape of the resultant all_matches_df
print(all_matches_df.shape)

# keep all_matches_df as a backup and create a dataframe for the intermediate steps
match_df = all_matches_df.copy()

# ### Finding Missing Director

# In[44]:

"""
- match the tconst ID between the match_df and title_crew datasets
- extract the director ID's associated with a title from the title_crew dataframe
"""

# match the title ID and map the director ID into the match_df dataframe
match_df["director_id"] = match_df["tconst"].map(dict(title_crew.loc[title_crew["tconst"].isin(match_df["tconst"]),
                                                                     ["tconst", "directors"]].dropna().values))

"""
- the title_crew dataset itself has null values; matches with these records won't help us
- so we drop the records where the director ID column still contains null
"""

# drop null values (could also use subset = ["director_id"])
match_df.dropna(inplace = True)

# reset the index of match_df to linear >> 0,...,n-1
match_df.reset_index(drop = True, inplace = True)

# sample rows from the resultant match_df
print(match_df.shape)
match_df.tail(10)

# In[45]:

"""Since we collected multiple directors as comma-separated values,
we explode these ID's so that we get one director ID per row."""

# split the director ID by ","
match_df["director_id"] = match_df["director_id"].str.split(",")

# explode on the director ID column to get a dataframe with each ID in a separate row
match_df = match_df.set_index(["combo", "tconst"]).explode("director_id").reset_index()

# sample rows from the resultant match_df
match_df.tail(10)

# In[46]:

"""Match the director ID with the nconst column in the Name Basics dataset
and extract the name of the director"""

# match the director ID and nconst columns
matched_names = name_basics.loc[name_basics["nconst"].isin(match_df["director_id"]),
                                ["nconst", "primaryName"]].values

# map the names of the directors
match_df["director_name"] = match_df["director_id"].map(dict(matched_names))

# sample rows from the resultant match_df
match_df.tail()

# This is the reverse process: reshape our long dataframe back into a wide one by grouping the (possibly multiple) director(s) of a title.

# In[47]:

# group the directors by title key
match_df = match_df.groupby("combo")["director_name"].apply(lambda s: ",".join(s)).reset_index()

# How many missing directors have we been able to identify?
match_df.shape

# In[48]:

# replace the missing directors; this will later be appended to the non-null Netflix dataset
missing_data.loc[missing_data["director"].isnull(), "director"] = missing_data.loc[missing_data["director"].isnull(), "combo"].map(dict(match_df.values)).fillna("Unknown")

# In[49]:

# have we been able to reduce the missing value count for the "director" column?
missing_data[["director", "cast", "country"]].isna().sum()

# In[50]:

# drop the match_df dataframe so it can be re-used in the next step
del match_df

# ### Finding Missing Cast

# In[51]:

# re-use the backup dataframe with the title ID's matched against the title_basics dataset
match_df = all_matches_df.copy()

# In[52]:

# values present in the category column
sorted(title_principals["category"].unique())

# In[53]:

"""
- match the tconst ID between the match_df and title_principals datasets
- extract the cast ID's associated with a title from the title_principals dataframe
- select only those principals whose category is actor or actress
"""
match_df["cast_id"] = (
    match_df["tconst"].map(dict(title_principals.loc[(title_principals["tconst"].isin(match_df["tconst"])) &
                                                     (title_principals["category"].isin(["actress", "actor"])),
                                                     ["tconst", "nconst"]].reset_index(drop = True).\
                                    groupby("tconst")["nconst"].unique().reset_index().values))
)

# drop null values
match_df.dropna(inplace = True)

# reset the index of the match_df dataframe
match_df.reset_index(drop = True, inplace = True)

# In[54]:

# explode match_df on the cast_id column to get one principal ID per row
match_df = match_df.set_index(["combo", "tconst"]).explode("cast_id").reset_index()

# In[55]:

# match the cast ID with the nconst column
matched_names = name_basics.loc[name_basics["nconst"].isin(match_df["cast_id"]),
                                ["nconst", "primaryName"]].values

# map the names of the cast members into the match_df dataframe
match_df["cast_name"] = match_df["cast_id"].map(dict(matched_names))

# resultant match_df shape
match_df.shape

# In[56]:

# remove any null values from match_df
match_df.dropna(inplace = True)

# In[57]:

# concatenate multiple cast members as comma-separated values
match_df.groupby("combo")["cast_name"].apply(lambda s: ",".join(s)).reset_index()

# what's the tally?
missing_data.isnull().sum()

# In[58]:

# group the cast names by title key
match_df = match_df.groupby("combo")["cast_name"].apply(lambda s: ",".join(s)).reset_index()

# replace the missing cast; this will later be appended to the non-null Netflix dataset
missing_data.loc[missing_data["cast"].isnull(), "cast"] = missing_data.loc[missing_data["cast"].isnull(), "combo"].map(dict(match_df.values)).fillna("Unknown")

# In[59]:

# recycling the variables
del match_df

# ### Finding Missing Country - Approach 1
#
# Here we have a change of strategy. Since none of the IMDB datasets contains direct information about a title's country of origin, we will have to capture this information indirectly, and perhaps employ multiple approaches to do so.
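# A practical note before the scraping cells (an assumption from general scraping experience, not part of
# the original workflow): IMDB may reject bare requests, so sending a browser-like User-Agent and an
# explicit timeout can make the calls more reliable. A minimal sketch of such a session:

# In[ ]:

import requests

# a throwaway session with a browser-like User-Agent and a timeout (placeholder title ID in the URL)
scraping_session = requests.Session()
scraping_session.headers.update({"User-Agent": "Mozilla/5.0"})
scraping_session.get("https://www.imdb.com/title/tt0000001/", timeout = 10).status_code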
# In[60]:

# import BeautifulSoup
from bs4 import BeautifulSoup

# import requests
import requests

# In[61]:

# only select the rows where the country column is missing values
match_df = all_matches_df[all_matches_df["combo"].isin(missing_data.loc[missing_data["country"].isnull(), "combo"])]

# reset the index of the match_df dataframe
match_df = match_df.reset_index(drop = True)

# In[62]:

# create a column and generate url values from each title ID
# each title on IMDB can be accessed using "https://www.imdb.com/title/<tconst>/"
match_df["imdb_url"] = match_df["tconst"].apply(lambda str_id: "https://www.imdb.com/title/" + str_id + "/")

# sample rows
match_df.head()

# In[63]:

"""
Function to read the web-page for a title on IMDB.com
-----------------------------------------------------
Name: get_country_of_origin

Parameters:
- imdb_url: dedicated web-page of a title on IMDB.com

Returns: a string containing the name of the country/countries of origin of a title

Additional Info:
- to avoid session time-outs / repeated requests / NoneType errors, a WITH requests.Session() statement has been used
"""

# function definition
def get_country_of_origin(imdb_url):

    # request a session
    with requests.Session() as s:

        # access the web-page using the passed url string
        title_page = s.get(imdb_url)

        # parse the web-page contents with BeautifulSoup
        soup = BeautifulSoup(title_page.content, 'html.parser')

        # search for a list item with specific attributes
        item = soup.find("li", {"data-testid": "title-details-origin"}, class_ = "ipc-metadata-list__item")

        # if the search is successful, return the content string
        if item:
            return item.a.string

# In[64]:

# apply the "get_country_of_origin" function to the imdb_url column
match_df["country_name"] = match_df["imdb_url"].apply(get_country_of_origin)

# how many results have we successfully got?
match_df[match_df["country_name"].notna()].shape

# In[65]:

# map the found country names back into the missing_data dataframe
missing_data.loc[missing_data["country"].isnull(), "country"] = missing_data.loc[missing_data["country"].isnull(), "combo"].map(dict(match_df[["combo", "country_name"]].values))

# In[66]:

# missing value count
missing_data.isnull().sum()

# ### Finding Missing Country - Approach 2

# In[67]:

pd.options.display.max_colwidth = 200

# pattern work to match Step2B in Mike's Presentation
# pattern: title containing "some words (language)"
missing_data.loc[(missing_data["country"].isnull()) &
                 (missing_data["title"].str.contains(r"\(\w+\)")), ["title"]]

# In[68]:

# select the indexes where the titles matched the pattern
select_idx = missing_data.loc[(missing_data["country"].isnull()) &
                              (missing_data["title"].str.contains(r"\(\w+\)")), ["title"]].index

# substitute "India" for the country value for the rows at the selected indexes
missing_data.loc[select_idx, "country"] = "India"

# In[69]:

# select the indexes where the title or listed_in column contains the pattern "korean"
select_idx = missing_data.loc[(missing_data["country"].isnull()) &
                              ((missing_data["title"].str.contains(r"[Kk]orean", regex = True)) |
                               (missing_data["listed_in"].str.contains(r"[Kk]orean", regex = True))), ["title"]].index

# substitute "South Korea" for the country value for the rows at the selected indexes
missing_data.loc[select_idx, "country"] = "South Korea"

# In[70]:

# select the indexes where the listed_in column contains the pattern "british"
select_idx = missing_data.loc[(missing_data["country"].isnull()) &
                              (missing_data["listed_in"].str.contains(r"[Bb]ritish", regex = True)), ["title"]].index

# substitute "United Kingdom" for the country value for the rows at the selected indexes
missing_data.loc[select_idx, "country"] = "United Kingdom"

# In[71]:

# identify the rows where the listed_in column contains the pattern "spanish"
missing_data.loc[(missing_data["listed_in"].str.lower().str.contains("spanish")) &
                 (missing_data["country"].isnull())]["title"]

# In[72]:

# dictionary of Spanish-language titles with their countries
spanish_titles = {"Camarón Revolution": "Spain",
                  "Especial 20 años Fútbol de Primera": "Argentina",
                  "Los 10 años de Peter Capusotto": "Argentina",
                  "Surviving Escobar - Alias JJ": "Colombia",
                  "The Least Expected Day: Inside the Movistar Team 2019": "Spain"}

# In[73]:

# select the indexes of the titles listed in the spanish_titles dictionary
selected_idx = missing_data[missing_data["title"].isin(list(spanish_titles.keys()))].index

# replace the None values in the country column with the dictionary values
missing_data.loc[selected_idx, "country"] = missing_data.loc[selected_idx, "title"].map(spanish_titles)

# In[74]:

# replace the still-missing values in the missing_data dataframe with "Unknown"
missing_data["country"].fillna("Unknown", inplace = True)

# ### Combining the Found Values

# In[75]:

"""
- merge the non-null dataset with the dataset whose missing values were substituted
- the resultant dataset is named "netflix_clean_data"
"""

# concatenate the two datasets (dropping the helper "combo" column) and reset the index
netflix_clean_data = pd.concat([netflix_data, missing_data.iloc[:, :-1]]).reset_index(drop = True)

# ## Apriori Analysis - Learn Associations

# In[76]:

# import mlxtend modules to find the association rules
from mlxtend.frequent_patterns import association_rules, apriori
from mlxtend.preprocessing import TransactionEncoder

# In[77]:

# subset with only International TV Shows
inter_tv = netflix_clean_data.loc[(netflix_clean_data["type"] == "TV Show") &
                                  (netflix_clean_data["listed_in"].str.contains(r"International", regex = True))]
inter_tv.shape

# In[78]:

# subset with only International Movies
inter_mv = netflix_clean_data.loc[(netflix_clean_data["type"] == "Movie") &
                                  (netflix_clean_data["listed_in"].str.contains(r"International", regex = True))]
inter_mv.shape

# In[79]:

# subset with only non-International TV Shows
domestic_tv = netflix_clean_data.loc[(netflix_clean_data["type"] == "TV Show") &
                                     ~(netflix_clean_data["listed_in"].str.contains(r"International", regex = True))]
domestic_tv.shape

# In[80]:

# subset with only non-International Movies
domestic_mv = netflix_clean_data.loc[(netflix_clean_data["type"] == "Movie") &
                                     ~(netflix_clean_data["listed_in"].str.contains(r"International", regex = True))]
domestic_mv.shape
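# Before wrapping everything in a helper function (defined in the next cell), here is a minimal, hedged
# sketch of what TransactionEncoder and apriori do, on a tiny hand-made list of genre "transactions"
# (toy data, purely for illustration):

# In[ ]:

from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder

# each inner list plays the role of one title's genre list
toy_transactions = [["Dramas", "International Movies"],
                    ["Comedies", "International Movies"],
                    ["Dramas", "Comedies"],
                    ["Dramas", "International Movies"]]

# one-hot encode the transactions
toy_enc = TransactionEncoder()
toy_onehot = pd.DataFrame(toy_enc.fit(toy_transactions).transform(toy_transactions),
                          columns = toy_enc.columns_)

# itemsets that appear in at least half of the transactions
apriori(toy_onehot, min_support = 0.5, use_colnames = True)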
# In[81]:

"""
Function to identify similarities/associations between genres, cast members and directors
------------------------------------------------------------------------------------------
Name: calculate_association

Parameters:
- df: subset of the netflix_clean_data dataset
- column_name: column containing the attributes for which the association needs to be calculated
- split_char: string of characters used to split the attributes - default value ", "
- min_support: minimum support value to be passed to the apriori module
- max_length: maximum itemset length to be passed to the apriori module - helps to decide if paired association is required
- index_col: column that forms the index for the apriori analysis

Returns: a dataframe comprising support values and itemsets as its two columns

Additional Info:
- mlxtend's TransactionEncoder module has been used to transform the column values
"""

# function definition
def calculate_association(df, column_name, min_support, max_length, index_col = "show_id", split_char = ", "):

    # obtain a list of lists by splitting the values in the column using the split character
    col_val_list = list(df[column_name].str.split(split_char))

    # initialize the TransactionEncoder
    txn_enc = TransactionEncoder()

    # fit the list of lists
    txn_enc.fit(col_val_list)

    # obtain the transformed values using the encoder
    transformed_data = txn_enc.transform(col_val_list)

    # create an encoded df with the unique show_id column acting as the index
    encoded_df = pd.DataFrame(transformed_data, columns = txn_enc.columns_, index = df[index_col])

    # apply the apriori algorithm to the transformed dataframe and sort by support
    apriori_results = apriori(df = encoded_df[encoded_df.sum(axis = 1) >= 1], use_colnames = True,
                              min_support = min_support, max_len = max_length).sort_values(by = "support",
                                                                                           ascending = False)

    return apriori_results

# In[82]:

# calculate support values for international TV content
apriori_results = calculate_association(inter_tv, "listed_in", 0.05, 2)
apriori_results

# In[83]:

# display the association rules and values for international TV content
rules_df = association_rules(apriori_results, metric = "lift").sort_values(by = ["antecedent support", "lift", "support"],
                                                                           ascending = False)[:10]
rules_df

# In[84]:

# result dataframe
final_results = pd.DataFrame({"Genres": rules_df["consequents"].apply(lambda s: list(s)[0]),
                              "support": rules_df["support"]})
final_results

# In[85]:

# calculate support values for domestic TV content
apriori_results = calculate_association(domestic_tv, "listed_in", 0.05, 1)[:10]
apriori_results.rename({"itemsets": "Genres"}, axis = 1, inplace = True)

# to unfreeze the frozensets in the apriori_results dataframe
apriori_results["Genres"] = apriori_results["Genres"].apply(lambda s: list(s)[0])

# In[86]:

# merge the final results of both the domestic and international TV content
final_results = final_results.merge(apriori_results[["Genres", "support"]], how = "outer",
                                    left_on = "Genres", right_on = "Genres")
final_results

# In[87]:

# create figure with shared y-axis
fig, ax = plt.subplots(ncols = 2, sharey = True)

# create a lollipop chart, because why not!
# stem of sub-plot 1
ax[0].hlines(final_results["Genres"], xmin = 0, xmax = -1 * final_results["support_x"], lw = 3)

# dot of sub-plot 1
ax[0].scatter(-1 * final_results["support_x"], final_results["Genres"], marker = "o", s = 100)

# shift the y-axis to the right for sub-plot 1 so that the yticklabels appear in the center
ax[0].yaxis.tick_right()

# enable yticks
ax[0].set_yticks(final_results["Genres"].index)

# enable yticklabels
ax[0].set_yticklabels(list(final_results["Genres"].values))  # fontdict = {'horizontalalignment': "center"}

# hide ytick marks
ax[0].tick_params(axis = 'y', which = 'both', length = 0)

# title for sub-plot 1
ax[0].set_title("International")

# hide the x-axis
ax[0].get_xaxis().set_visible(False)

# stem of sub-plot 2
ax[1].hlines(final_results["Genres"], xmin = 0, xmax = final_results["support_y"], lw = 3)

# dot of sub-plot 2
ax[1].scatter(final_results["support_y"], final_results["Genres"], marker = "o", s = 100)

# title for sub-plot 2
ax[1].set_title("Domestic")

# hide tick marks for the y-axis
ax[1].tick_params(axis = 'y', which = 'both', length = 0)

# hide the x-axis for sub-plot 2
ax[1].get_xaxis().set_visible(False)

# despine the sub-plots
for i in ax[0].spines:
    ax[0].spines[i].set_visible(False)
for i in ax[1].spines:
    ax[1].spines[i].set_visible(False)

# extras
plt.subplots_adjust(wspace = 0.7)
plt.suptitle("Breakdown of Netflix Content by 10 Most Common Genres (for TV Shows)", y = 1.02)

# show the plot
plt.show()

# In[88]:

# calculate support values for international Movie content
apriori_results = calculate_association(inter_mv, "listed_in", 0.05, 2)
apriori_results

# In[89]:

# display the association rules and values for international Movie content
rules_df = association_rules(apriori_results, metric = "lift").sort_values(by = ["antecedent support", "lift", "support"],
                                                                           ascending = False)[:9]
rules_df

# In[90]:

# result dataframe
final_results = pd.DataFrame({"Genres": rules_df["consequents"].apply(lambda s: list(s)[0]),
                              "support": rules_df["support"]})
final_results

# In[91]:

# calculate support values for domestic Movie content
apriori_results = calculate_association(domestic_mv, "listed_in", 0.05, 1)[:10]
apriori_results.rename({"itemsets": "Genres"}, axis = 1, inplace = True)
apriori_results["Genres"] = apriori_results["Genres"].apply(lambda s: list(s)[0])

# In[92]:

# merge the final results of both the domestic and international Movie content
final_results = final_results.merge(apriori_results[["Genres", "support"]], how = "outer",
                                    left_on = "Genres", right_on = "Genres")
final_results

# In[93]:

# create figure with shared y-axis
fig, ax = plt.subplots(ncols = 2, sharey = True)

# create a lollipop chart, because why not!
# stem of sub-plot 1
ax[0].hlines(final_results["Genres"], xmin = 0, xmax = -1 * final_results["support_x"], lw = 3)

# dot of sub-plot 1
ax[0].scatter(-1 * final_results["support_x"], final_results["Genres"], marker = "o", s = 100)

# shift the y-axis to the right for sub-plot 1 so that the yticklabels appear in the center
ax[0].yaxis.tick_right()

# enable yticks
ax[0].set_yticks(final_results["Genres"].index)

# enable yticklabels
ax[0].set_yticklabels(list(final_results["Genres"].values))  # fontdict = {'horizontalalignment': "center"}

# hide ytick marks
ax[0].tick_params(axis = 'y', which = 'both', length = 0)

# title for sub-plot 1
ax[0].set_title("International")

# hide the x-axis
ax[0].get_xaxis().set_visible(False)

# stem of sub-plot 2
ax[1].hlines(final_results["Genres"], xmin = 0, xmax = final_results["support_y"], lw = 3)

# dot of sub-plot 2
ax[1].scatter(final_results["support_y"], final_results["Genres"], marker = "o", s = 100)

# title for sub-plot 2
ax[1].set_title("Domestic")

# hide tick marks for the y-axis
ax[1].tick_params(axis = 'y', which = 'both', length = 0)

# hide the x-axis for sub-plot 2
ax[1].get_xaxis().set_visible(False)

# despine the sub-plots
for i in ax[0].spines:
    ax[0].spines[i].set_visible(False)
for i in ax[1].spines:
    ax[1].spines[i].set_visible(False)

# extras
plt.subplots_adjust(wspace = 0.7)
plt.suptitle("Breakdown of Netflix Content by 10 Most Common Genres (for Movies)", y = 1.02)

# show the plot
plt.show()

# ## Compare English Language vs. Non-English Language Content

# In[94]:

# read the source Wikipedia page
response = requests.get("https://en.wikipedia.org/wiki/List_of_countries_and_territories_where_English_is_an_official_language")

# parse the response content as html
soup = BeautifulSoup(response.content, "html.parser")

# In[95]:

# read the first table, which lists the primarily English-speaking countries
df = pd.read_html(str(soup.find_all("table")[0]))[0]

# transform the dataframe column into a list
english_countries = list(df["Country"].str.extract(r"(\w+\s*\w+)")[0].values)

# print the extracted countries
# print(english_countries)

# read the second table, which lists the multi-language countries
df = pd.read_html(str(soup.find_all("table")[1]))[0]

# filter the countries where English is a primary language and add them to the list from the step above
english_countries = english_countries + list(df.loc[df["Primary language?"].str.contains("Yes"),
                                                    "Country"].str.replace(r"\[\d+\]", "", regex = True).values)

# print the final list
# english_countries

# In[96]:

# make a copy of the cleaned dataframe to avoid warnings
netflix_clean_data = netflix_clean_data.copy()

# based on the English-speaking countries, segregate the titles into English / Non-English / Unknown categories
netflix_clean_data["prim_lang"] = netflix_clean_data["country"].apply(lambda s: "E" if s.split(",")[0] in english_countries
                                                                      else ("U" if s.split(",")[0] == "Unknown" else "NE"))

# In[97]:

# plot the distribution
fig, ax = plt.subplots(figsize = (15, 6))

# donut chart for the breakdown of content by primary language
ax.pie(x = netflix_clean_data["prim_lang"].value_counts(), labels = ["English", "Non-English", "Unknown"],
       wedgeprops = {'width': 0.35}, autopct = "%.0f%%", pctdistance = 0.85, startangle = 180)

# extras
ax.text(-0.3, 0, "Language!")
ax.set_title("Breakdown of Netflix Content: English vs. Non-English Content")  # fontsize = "small"
# show the plot
plt.show()

# In[98]:

# calculate the most-credited cast members for English content
apriori_results_eng = calculate_association(netflix_clean_data[(netflix_clean_data["cast"] != "Unknown") &
                                                               (netflix_clean_data["prim_lang"] == "E")],
                                            "cast", 0.001, 1)

# top 10 most-credited cast members for English content
apriori_results_eng[:10]

# In[99]:

# calculate the most-credited cast members for Non-English content
apriori_results_neng = calculate_association(netflix_clean_data[(netflix_clean_data["cast"] != "Unknown") &
                                                                (netflix_clean_data["prim_lang"] == "NE")],
                                             "cast", 0.001, 1)

# top 10 most-credited cast members for Non-English content
apriori_results_neng[:10]

# In[100]:

# plot the top 5 most-credited cast members for English & Non-English content
fig, ax = plt.subplots(figsize = (12, 5))

# stems
ax.hlines(y = apriori_results_eng[:5]["itemsets"].apply(lambda s: list(s)[0]), xmin = 0,
          xmax = -1 * apriori_results_eng[:5]["support"], lw = 3, color = "#FF800E", label = "English")

# dots
ax.scatter(y = apriori_results_eng[:5]["itemsets"].apply(lambda s: list(s)[0]),
           x = -1 * apriori_results_eng[:5]["support"], color = "#FF800E", s = 100)

# stems
ax.hlines(apriori_results_neng[:5]["itemsets"].apply(lambda s: list(s)[0]), xmin = 0,
          xmax = -1 * apriori_results_neng[:5]["support"], lw = 3, label = "Non-English")

# dots
ax.scatter(y = apriori_results_neng[:5]["itemsets"].apply(lambda s: list(s)[0]),
           x = -1 * apriori_results_neng[:5]["support"], s = 100)

# annotations
for i in range(5):
    ax.annotate(text = apriori_results_eng[:5]["itemsets"].apply(lambda s: list(s)[0]).values[i],
                xy = (0.0001, i - 0.2))
for i in range(5):
    ax.annotate(text = apriori_results_neng[:5]["itemsets"].apply(lambda s: list(s)[0]).values[i],
                xy = (0.0001, (i + 5) - 0.2))

# extras
ax.set_title("Top 5 Headlining Cast Members Based on No. of Netflix Titles Credited", fontsize = "large")
ax.axis("off")
ax.legend(bbox_to_anchor = (0.3, 0.4))

# display the plot
plt.show()

# In[101]:

# calculate the most-credited directors for Non-English content
apriori_results_neng = calculate_association(netflix_clean_data[(netflix_clean_data["director"] != "Unknown") &
                                                                (netflix_clean_data["prim_lang"] == "NE")],
                                             "director", 0.001, 1)

# top 10 most-credited directors for Non-English content
apriori_results_neng[:10]

# In[102]:

# calculate the most-credited directors for English content
apriori_results_eng = calculate_association(netflix_clean_data[(netflix_clean_data["director"] != "Unknown") &
                                                               (netflix_clean_data["prim_lang"] == "E")],
                                            "director", 0.001, 1)

# top 10 most-credited directors for English content
apriori_results_eng[:10]

# In[103]:

# plot the top 5 most-credited directors for English & Non-English content
fig, ax = plt.subplots(figsize = (12, 5))

# stems
ax.hlines(y = apriori_results_eng[:5]["itemsets"].apply(lambda s: list(s)[0]), xmin = 0,
          xmax = -1 * apriori_results_eng[:5]["support"], lw = 3, color = "#FF800E", label = "English")

# dots
ax.scatter(y = apriori_results_eng[:5]["itemsets"].apply(lambda s: list(s)[0]),
           x = -1 * apriori_results_eng[:5]["support"], color = "#FF800E", s = 100)

# stems
ax.hlines(apriori_results_neng[:5]["itemsets"].apply(lambda s: list(s)[0]), xmin = 0,
          xmax = -1 * apriori_results_neng[:5]["support"], lw = 3, label = "Non-English")

# dots
ax.scatter(y = apriori_results_neng[:5]["itemsets"].apply(lambda s: list(s)[0]),
           x = -1 * apriori_results_neng[:5]["support"], s = 100)

# annotations
for i in range(5):
    ax.annotate(text = apriori_results_eng[:5]["itemsets"].apply(lambda s: list(s)[0]).values[i],
                xy = (0.0001, i - 0.2))
for i in range(5):
    ax.annotate(text = apriori_results_neng[:5]["itemsets"].apply(lambda s: list(s)[0]).values[i],
                xy = (0.0001, (i + 5) - 0.2))

# extras
ax.set_title("Top 5 Headlining Directors Based on No. of Netflix Titles Credited", fontsize = "large")
ax.axis("off")
ax.legend(bbox_to_anchor = (0.3, 0.4))

# show the plot
plt.show()

# In[104]:

# import wordcloud, stopwords and Image modules
from wordcloud import WordCloud, STOPWORDS
from PIL import Image

# In[105]:

# collect all the STOPWORDS
stopwords = set(STOPWORDS)

# In[106]:

# collect all the words from the description column of the Non-English content
title_words_ne = ""

"""
- select the descriptions from the rows classified as Non-English content
- remove any non-alphanumeric characters from the description of a title
- collect each word from each row of the description column
- join all the words to obtain one string collection
"""

# format the description column for the Non-English content
for row in netflix_clean_data[netflix_clean_data["prim_lang"] == "NE"]["description"].str.replace(pat = r"[^a-zA-Z0-9 ]",
                                                                                                  repl = "", regex = True):
    # split the words by space for each row
    for each_word in row.split():
        # collect all the words in one string object
        title_words_ne += each_word.lower() + " "

# In[107]:

# collect all the words from the description column of the English content
title_words_e = ""

"""
- select the descriptions from the rows classified as English content
- remove any non-alphanumeric characters from the description of a title
- collect each word from each row of the description column
- join all the words to obtain one string collection
"""

# format the description column for the English content
for row in netflix_clean_data[netflix_clean_data["prim_lang"] == "E"]["description"].str.replace(pat = r"[^a-zA-Z0-9 ]",
                                                                                                 repl = "", regex = True):
    # split the words by space for each row
    for each_word in row.split():
        # collect all the words in one string object
        title_words_e += each_word.lower() + " "

# In[108]:

# plot the wordclouds
fig, ax = plt.subplots(ncols = 2, figsize = (10, 5))

# to obtain the wordcloud in the shape of N
mask = np.array(Image.open("Netflix_icon2.png"))

# set the properties for, and generate, the word cloud for the Non-English content
wrd = WordCloud(mask = mask, stopwords = stopwords, background_color = "white", max_words = 100,
                colormap = "tab10_r", random_state = 1).generate(title_words_ne)

# plot the word cloud
ax[0].imshow(wrd)

# extras
ax[0].axis("off")
ax[0].set_title("For Non-English Content")

# set the properties for, and generate, the word cloud for the English content
wrd = WordCloud(mask = mask, stopwords = stopwords, background_color = "white", max_words = 100,
                colormap = "tab10_r", random_state = 1).generate(title_words_e)

# plot the word cloud
ax[1].imshow(wrd)

# extras
ax[1].axis("off")
ax[1].set_title("For English Content")

plt.tight_layout(pad = 0)
plt.suptitle("#Spoiler! What's The Plot About?", y = 1.1, fontsize = 20)

# show the plot
plt.show()
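# As a closing aside (essentially equivalent to the word-collection loops above, but not used by them):
# the per-language word string can also be built in a single pandas expression. A minimal, hedged sketch:

# In[ ]:

# join every Non-English description into one lower-cased string, stripped of punctuation
title_words_ne_alt = " ".join(netflix_clean_data.loc[netflix_clean_data["prim_lang"] == "NE", "description"]
                              .str.replace(r"[^a-zA-Z0-9 ]", "", regex = True)
                              .str.lower())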