#r "nuget:MathNet.Numerics"
#r "nuget:MathNet.Numerics.FSharp"
#r "nuget:FSharp.Data"

// Uncomment for the IFSharp Kernel
// #load "Paket.fsx"
// Paket.Package [ "FSharp.Data"; ]
// #load "Paket.Generated.Refs.fsx"

open System
open System.Text
open FSharp.Data
open MathNet.Numerics
open System.Collections.Generic
open Microsoft.FSharp.Collections

// Data obtained from: https://www.kaggle.com/tmdb/tmdb-movie-metadata
// NOTE(review): this is a machine-specific absolute path; adjust it to wherever the CSV lives.
[<Literal>]
let DataPath = "/Users/mukundraghavsharma/Desktop/F#/FSharp-Advent-2019/data/tmdb_5000_movies.csv"

/// The raw CSV, cached so that its rows can be enumerated more than once.
let data = CsvFile.Load(DataPath).Cache()

printfn "%A" data.Headers

/// Returns the value of column `colName` in the first row of `data`.
let getFirstItemInColumn (colName : string) =
    seq { for row in data.Rows -> row.GetColumn colName }
    |> Seq.head

getFirstItemInColumn "genres"
getFirstItemInColumn "keywords"
getFirstItemInColumn "production_companies"
getFirstItemInColumn "overview"
getFirstItemInColumn "popularity"

/// Domain object built from one CSV row.
/// `soupList` holds every cleaned token of the movie and `soup` is the
/// space-joined text form of those same tokens.
type MovieData =
    { title : string
      soup : string
      soupList : string list
      genreList : string list
      prodCompanyList : string list
      popularity : double }

/// Common stop words that don't add meaning. Built once instead of on every
/// call to `tokenizeAndClean`.
let commonStopWords =
    Set.ofList
        [ "ourselves"; "hers"; "between"; "yourself"; "but"; "again"; "there"; "about"
          "once"; "during"; "out"; "very"; "having"; "with"; "they"; "own"; "an"; "be"
          "some"; "for"; "do"; "its"; "yours"; "such"; "into"; "of"; "most"; "itself"
          "other"; "off"; "is"; "s"; "am"; "or"; "who"; "as"; "from"; "him"; "each"
          "the"; "themselves"; "until"; "below"; "are"; "we"; "these"; "your"; "his"
          "through"; "don"; "nor"; "me"; "were"; "her"; "more"; "himself"; "this"
          "down"; "should"; "our"; "their"; "while"; "above"; "both"; "up"; "to"
          "ours"; "had"; "she"; "all"; "no"; "when"; "at"; "any"; "before"; "them"
          "same"; "and"; "been"; "have"; "in"; "will"; "on"; "does"; "yourselves"
          "then"; "that"; "because"; "what"; "over"; "why"; "so"; "can"; "did"; "not"
          "now"; "under"; "he"; "you"; "herself"; "has"; "just"; "where"; "too"; "only"
          "myself"; "which"; "those"; "i"; "after"; "few"; "whom"; "t"; "being"; "if"
          "theirs"; "my"; "against"; "a"; "by"; "doing"; "it"; "how"; "further"; "was"
          "here"; "than" ]

/// Substrings removed from every token.
let punctuationMarks = [ "�"; "'"; ":"; "."; ","; "-"; "!"; "?"; "\""; ";" ]

/// Splits `words` on spaces and returns the cleaned tokens as a sorted list.
///
/// Each token is lowercased and stripped of punctuation; empty tokens and
/// stop words are then dropped. Punctuation is removed BEFORE the stop-word
/// check so that a token such as "the;" is recognised as the stop word "the"
/// instead of slipping through the filter.
let tokenizeAndClean (words : string) =
    let stripPunctuation (token : string) =
        punctuationMarks
        |> List.fold (fun (acc : string) mark -> acc.Replace(mark, "")) token

    words.Split(' ')
    |> Array.map (fun token -> stripPunctuation (token.ToLower()))
    |> Array.filter (fun token ->
        not (String.IsNullOrEmpty token)
        && not (Set.contains token commonStopWords))
    // Lexicographically sort
    |> Array.sort
    |> Array.toList

let sample = "The quick brown. fox jumps! over the; Lazy dog"
let tokenizedSample = tokenizeAndClean sample
tokenizedSample

// Genre
[<Literal>]
let SampleGenresJson = "[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 80, \"name\": \"Crime\"}]"
type GenreProvider = JsonProvider< SampleGenresJson >

/// Parses the JSON array in the "genres" column and returns the cleaned genre
/// names. Spaces inside a name are removed first so that each genre stays a
/// single token.
let sanitizeGenre (genres : string) : string list =
    GenreProvider.Parse(genres)
    |> Array.map (fun x -> x.Name.Replace(" ", ""))
    |> String.concat " "
    |> tokenizeAndClean

// Keywords
[<Literal>]
let SampleKeywordsJson = "[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\": 2964, \"name\": \"future\"}, {\"id\": 3386, \"name\": \"space war\"}]"
type KeywordsProvider = JsonProvider< SampleKeywordsJson >

/// Parses the JSON array in the "keywords" column and returns the cleaned
/// keywords. Spaces inside a keyword are removed first so that each keyword
/// stays a single token.
let sanitizeKeywords (keywords : string) : string list =
    KeywordsProvider.Parse(keywords)
    |> Array.map (fun x -> x.Name.Replace(" ", ""))
    |> String.concat " "
    |> tokenizeAndClean

// Overview
/// Tokenizes the free-text overview after round-tripping it through the ASCII
/// encoding. `tokenizeAndClean` already drops empty tokens, so no further
/// filtering is needed here.
let sanitizeOverview (overview : string) : string list =
    let nonAsciiRemoved = Encoding.ASCII.GetString(Encoding.ASCII.GetBytes(overview))
    tokenizeAndClean nonAsciiRemoved
// Production Company
[<Literal>]
let ProductionCompanyJson = "[{\"name\": \"Ingenious Film Partners\", \"id\": 289}, {\"name\": \"Twentieth Century Fox Film Corporation\", \"id\": 306}, {\"name\": \"Dune Entertainment\", \"id\": 444}, {\"name\": \"Lightstorm Entertainment\", \"id\": 574}]"
type ProductionCompanyProvider = JsonProvider< ProductionCompanyJson >

/// Parses the JSON array in the "production_companies" column and returns the
/// cleaned company names, one token per company.
let sanitizeProductionCompany (productionCompany : string) : string list =
    ProductionCompanyProvider.Parse(productionCompany)
    |> Array.map (fun x -> x.Name.Replace(" ", "")) // Lightstorm Entertainment -> LightstormEntertainment
    |> String.concat " "
    |> tokenizeAndClean

// Wrapper that gets a sequence of all the domain objects.
/// One `MovieData` per CSV row, in file order. This is a value, not a
/// function: the CSV is loaded and every row converted once, when the binding
/// is evaluated. The rows are collected with a list comprehension rather than
/// by repeatedly appending to a mutable list, which was quadratic in the
/// number of rows.
let extractData =
    let data = CsvFile.Load(DataPath).Cache()

    [ for row in data.Rows do
        let title = row.GetColumn "title"

        // Cleaned token lists for each text-bearing column
        let genres = sanitizeGenre (row.GetColumn "genres")
        let keywords = sanitizeKeywords (row.GetColumn "keywords")
        let overview = sanitizeOverview (row.GetColumn "overview")
        let productionCompany = sanitizeProductionCompany (row.GetColumn "production_companies")

        // Soup: each column's tokens joined by spaces, then the four
        // column strings joined by single spaces.
        let soup =
            [ genres; keywords; overview; productionCompany ]
            |> List.map (String.concat " ")
            |> String.concat " "

        let soupList = genres @ keywords @ overview @ productionCompany

        // Popularity
        let popularity = double (row.GetColumn "popularity")

        yield
            { title = title
              soup = soup
              popularity = popularity
              genreList = genres
              prodCompanyList = productionCompany
              soupList = soupList } ]
    |> Seq.ofList

// Function that wraps the extractData functionality in a dictionary for ease of use.
/// Movies keyed by title. When two rows share a title, the later row wins.
let getDictOfData =
    let out = Dictionary<string, MovieData>()
    for w in extractData do
        out.[w.title] <- w
    out

// Test Function
getDictOfData |> Seq.take 5

/// All distinct words found in any movie's soup list, in first-seen order.
/// Cached so that the full corpus is not re-walked every time a feature
/// vector is built.
let getAllWords =
    extractData
    |> Seq.collect (fun x -> x.soupList)
    |> Seq.distinct
    |> Seq.cache

/// Builds the feature dictionary for one movie: every word of `getAllWords`
/// mapped to the number of times it occurs in the movie's soup list, plus the
/// movie's popularity under the key ";popularity_score;".
let getFeatureDictByMovieData (movieCompare : MovieData) =
    // Count each of the movie's words once up front instead of re-scanning
    // its soup list for every word in the vocabulary.
    let counts = movieCompare.soupList |> List.countBy id |> dict
    let wordFrequency = new Dictionary<string, double>()
    for w in getAllWords do
        wordFrequency.[w] <-
            match counts.TryGetValue w with
            | true, count -> double count
            | _ -> 0.0
    wordFrequency.[";popularity_score;"] <- movieCompare.popularity
    wordFrequency

/// Feature dictionary for the movie titled `movieName`.
/// Fails with "Movie Not Found!" when the title is unknown.
let getFeatureDict (movieName : string) =
    match getDictOfData.TryGetValue movieName with
    | true, movieData -> getFeatureDictByMovieData movieData
    | _ -> failwith "Movie Not Found!"

/// Feature vector of a movie, i.e. the values of its feature dictionary in
/// the dictionary's enumeration order.
let getFeatureVector (movieName : string) =
    let featureDict = getFeatureDict movieName
    featureDict
    |> Seq.map (fun x -> x.Value)
    |> Seq.toArray

printfn "Avatar Feature Dictionary:"
let avatarVector = getFeatureDict "Avatar"
printfn "%A" avatarVector
printfn "Popularity Score: %A\n" (avatarVector.[";popularity_score;"])
printfn "%A" (getFeatureVector "Avatar")

printfn "The Dark Knight Rises Feature Dictionary:"
let darkKnightVector = getFeatureDict "The Dark Knight Rises"
printfn "%A" darkKnightVector
printfn "Popularity Score: %A" (darkKnightVector.[";popularity_score;"])
printfn "%A" (getFeatureVector "The Dark Knight Rises")

let x : double[] = [| 3.; 1. |]
let y : double[] = [| 3.; 3. |]
printfn "Cosine Distance when values are equal: %A" (Distance.Cosine(x, x))
printfn "Cosine Distance when values are different: %A" (Distance.Cosine(x, y))

/// Cosine distance between the feature vectors of two movies given by title.
/// Fails (via `getFeatureDict`) when either title is unknown.
let computeCosineDistance (movie1 : string) (movie2 : string) : double =
    // Feature Vector
    let movie1Vector = getFeatureVector movie1
    let movie2Vector = getFeatureVector movie2
    // Compute the Cosine distance in the case the movies exist
    Distance.Cosine(movie1Vector, movie2Vector)

computeCosineDistance "Avatar" "The Dark Knight Rises"

/// Returns up to `recommendationCount` `(title, cosine distance)` pairs for
/// movies related to `movie`, smallest distance first.
///
/// A candidate is considered only when it shares at least one genre AND at
/// least one production company with `movie`. Fewer pairs than requested are
/// returned when not enough candidates qualify. Fails with
/// "Movie not found!" when `movie` is not a known title.
let recommendMovies (movie : string) (recommendationCount : int) =
    // Check if two lists contain any intersection - used by the relatedness filter below.
    let setIntersect listA listB =
        Set.intersect (Set.ofList listA) (Set.ofList listB)
        |> Set.isEmpty
        |> not

    // Filter out any unrelated movies to improve computation: if the movies
    // don't have any genres in common, or don't have any production companies
    // in common, don't even consider them.
    let filterOutMoviesNotRelated (movie1 : MovieData) (movie2 : MovieData) =
        setIntersect movie1.genreList movie2.genreList
        && setIntersect movie1.prodCompanyList movie2.prodCompanyList

    match getDictOfData.TryGetValue movie with
    | true, movieData ->
        // The query movie's vector is the same for every candidate: build it once.
        let movieVector = getFeatureVector movie

        getDictOfData
        // Don't include the current item in question nor any unrelated movie
        |> Seq.filter (fun x -> x.Key <> movie && filterOutMoviesNotRelated movieData x.Value)
        // Grab a tuple of the title and cosine distance
        |> Seq.map (fun x -> (x.Value.title, Distance.Cosine(movieVector, getFeatureVector x.Value.title)))
        // Remove NaNs
        |> Seq.filter (fun (_, distance) -> not (Double.IsNaN distance))
        // Sort by distance
        |> Seq.sortBy snd
        // Take at most the specified recommendation count; unlike Seq.take,
        // Seq.truncate does not throw when fewer candidates are available.
        |> Seq.truncate recommendationCount
        // Convert to list
        |> Seq.toList
    | _ -> failwith "Movie not found!"

let results =
    recommendMovies "The Dark Knight Rises" 10
    |> List.iter (fun (title, distance) -> printfn "%A %A" title distance)

results