# Importing the libraries
import numpy as np
import re
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to [nltk_data] /Users/uzaycetin/nltk_data... [nltk_data] Package stopwords is already up-to-date!
True
# Unpickling dataset
# X holds the raw review documents and y the matching labels
# (presumably 0 = negative, 1 = positive -- confirm against the script that
# produced these pickles).
# NOTE(security): pickle.load can execute arbitrary code; only load these
# files if they come from a trusted source.
# The handles keep their original names (X_in, y_in) but are now closed
# as soon as each `with` block exits instead of leaking until interpreter exit.
with open('data/uX.pickle', 'rb') as X_in:
    X = pickle.load(X_in)
with open('data/uy.pickle', 'rb') as y_in:
    y = pickle.load(y_in)
type(X), len(X)
(list, 2000)
X[10]
b'it\'s ironic that the best films in cinema history are invariably the original director\'s cut of the film . \nfilms such as aliens , the abyss , the wild bunch , blade runner , and terminator 2 are all prime examples of a filmmaker\'s integrity , later chopped up or mucked with by the studio . \nthe advent of the dvd format has provided a more accessible way to get these original cuts to the public and provide to film freaks like myself the ability to become further enraptured by the extension of such classic films . \nthe dvd release of the original international version of luc besson\'s 1995 masterpiece the professional , which is known as l ? on around the world , is a prime example of how a good film can become an instant classic as a director\'s cut . \nfor years , i have heard of an " international " version available only in laserdisc format , which has eluded me for years . \ni even bought a laserdisc player from my uncle don for 100 bucks just to watch certain directors\' cuts - including l ? on . \nbut after countless searches in laserdisc stores , i could never find it . \nuntil now . \nthe film follows the story of l ? on , played by jean reno , a professional hit man for an italian mob crew run by danny aiello . \nhe lives next door to a ferociously independent 12-year-old girl named mathilda - played by then newcomer natalie portman , whose father is involved in drugs and crooked cops . \none day , a crew of the cops - lead by gary oldman in an over-the-top performance - kills her entire family while mathilda is out buying groceries . \ntaking pity on her , l ? on hides her in his apartment when she returns to save her life . \nmathilda learns of l ? on\'s hit man profession and decides to follow in l ? on\'s footsteps as a " cleaner . " \nshe then falls in love with him and manages to reawaken emotions within l ? on he has kept locked away as part of his profession . 
\nat this point , the differences in the american version and the international version really kick in . \nthe international version contains an additional 24 minutes of footage that pertains directly to the relationship of l ? on and mathilda , footage that american censors deemed " too explicit . " \ni assume that american censors feel that defining character development is too much for american audiences to handle . \nthe best part of this extra footage is that gives better understanding of both mathilda\'s and l ? on\'s motivations pertaining to later actions involving oldman and his crew of bad cops . \nthere\'s even a previously unseen cameo by the great french actor jean-hugues anglade - star of queen margot , besson\'s la femme nikita , and killing zoe . \nwith these additional scenes replaced , l ? on and matilda\'s relationship brings you emotionally closer to the film and draws stronger bonds between the characters that were murky in the american version . \nthe most ironic thing about the international version of the film is that with the inclusion of the missing scenes , the film becomes primarily a heavy , emotional drama punctuated with big action scenes at the beginning and the end of the film . \nthe film betters reflects the serious drama of french cinema but is laced with pieces reflecting the brutality of american cinema . \nmy advice to everyone out there is to throw away your copy of the professional , throw down twenty bucks , and pick up this newly restored cinematic masterpiece . \ndirector/writer : luc besson producer : luc besson , claude besson starring : jean reno , natalie portman , gary oldman , danny aiello \n'
y[10]
1
# Creating the corpus
# Patterns are compiled once here rather than on every loop iteration.
_NON_WORD_RE = re.compile(r'\W')
_ONLY_BR_RE = re.compile(r'^br$')
_BR_TOKEN_RE = re.compile(r'\s+br\s+')
# \b...\b (instead of \s+[a-z]\s+) does not consume the surrounding
# whitespace, so two single letters in a row ("i m") are both removed, and a
# single letter at the very start or end of the text is removed as well.
_SINGLE_LETTER_RE = re.compile(r'\b[a-z]\b')
_WHITESPACE_RE = re.compile(r'\s+')


def _clean_review(raw):
    """Return a lower-cased, whitespace-normalised version of one review.

    `raw` may be `bytes` or `str`.  Bytes are decoded instead of being passed
    through `str()`: `str(b"...")` returns the *repr* of the bytes object, so
    escape sequences such as ``\\n`` and ``\\'`` end up in the text as literal
    characters and leave a stray "n" glued to the first word of every line
    (e.g. "nthe", "ni").
    """
    if isinstance(raw, bytes):
        # Undecodable bytes become U+FFFD, which the \W pass below turns
        # into a space.
        text = raw.decode('utf-8', errors='replace')
    else:
        text = str(raw)
    review = _NON_WORD_RE.sub(' ', text)
    review = review.lower()
    # Drop leftover "br" tokens (residue of HTML <br> tags).
    review = _ONLY_BR_RE.sub(' ', review)
    review = _BR_TOKEN_RE.sub(' ', review)
    review = _SINGLE_LETTER_RE.sub(' ', review)
    review = _WHITESPACE_RE.sub(' ', review)
    return review


# Iterate over every document in X rather than a hard-coded count of 2000.
corpus = [_clean_review(doc) for doc in X]
corpus[5]
'don know how many other people have had the idea cross their mind that their life could be an ongoing television show watched by another world of people but it something used to wonder about when was younger ni can decide if first thought it because watched lot of tv or because my brother hit me in the head with baseball bat but m pretty sure andrew niccol screenwriter for the truman show has had the same curious thought nthe truman show is about man jim carrey whose entire life has been engineered by corporation and marketed to the public nsince birth he been living in the fictional island town of seahaven fla which actually exists as giant domed set just beyond the hollywood sign nall the people in truman burbank life are actors and the anonymous townfolk paid extras npeople watch truman life 24 hours day live with no commercial interruptions nrevenue comes instead from product placement staple of contemporary hollywood with truman friends and relatives describing their consumer items in cheerful and optimistic tones nthe sun rises and sets on cue and everyone likes everyone else neven if you haven seen the truman show you ve probably learned most of the crucial details from the commercials the trailer and other promotion blitzes nif anything ve written so far is surprise to you then pity you for making me primary source for your entertainment news neither way you should probably know the premise of the truman show going into it because otherwise you ll probably think it sucks nit begins with jim carrey looking into mirror reciting the crucial eat me scene from alive wishing his neighbors good day afternoon and evening and living in domestic bliss with his wife laura linney nbut weird things start happening na stage light comes falling out of the sky truman hears people tracking his movements on the radio and he swears he sees his dead father nwe learn through flashback that his father drowned when truman was child sailing through rainstorm nthe death was staged 
by director christof ed harris who wanted to put fear of water into truman to keep him from leaving the island nsimilar cruel manipulations keep him from stumbling upon the truth not the least of them truman best friend marlon noah emmerich who assures him he not in on any conspiracy because the last thing he ever do is hurt him nas truman trusting eyes tear over christof fades up emotional synthesizer music for worldwide audience of people who apparently never watch anything else nthere even daily behind the scenes update from harry shearer that captures the highlights of truman existence nthe key to movie like this is in finding the right balance between humor and drama because premise like this has its obvious implications in both directions nand it all hinges on carrey who has to balance comic naivet with real sense of longing and drive to find out what the hell is going on nhe come long way from talking butt cheeks in ace ventura nmost fundamentally it just plain interesting to follow along as director peter weir shows us just how they can capture person life without the person realizing it how they coordinate extras hide cameras and feed automatic lines to actors nthe most interesting thing about the truman show is that most of us can probably imagine some ambitious director pitching show like this nsure the most interesting tv genre of the 90s has been reality programming and jerry springer popularity is through the roof but those people deserve to be humiliated for turning their intimate details over to the airwaves nand when news shows use hidden cameras it done in the name of the greater good not entertainment nbut merge the two and you ve got the truman show nright now it implausible mean the fcc won let you say the word on the air do you think they let you broadcast every detail of man life to the entire world without his consent none day though it could happen nand even though it would mean the end of privacy people would eat it up nyou know they would 
'
# Building the TF-IDF features directly from the cleaned corpus
#
# max_features = 2000 : the vocabulary is capped at 2000 terms
# min_df = 3          : a term must occur in at least 3 documents to be kept
# max_df = 0.6        : a term occurring in more than 60% of the documents is dropped
from sklearn.feature_extraction.text import TfidfVectorizer
english_stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(
    max_features=2000,
    min_df=3,
    max_df=0.6,
    stop_words=english_stop_words,
)
# fit_transform returns a sparse matrix; it is converted to a dense array,
# which replaces the raw documents previously bound to X.
X = (
    vectorizer
    .fit_transform(corpus)
    .toarray()
)
X.shape
(2000, 2000)
X[0,:10]
array([0. , 0. , 0. , 0. , 0.06635601, 0. , 0. , 0. , 0. , 0. ])
# Hold out 20% of the samples as the test set and train on the rest.
# random_state is fixed so that the same split is produced on every run.
from sklearn.model_selection import train_test_split
text_train, text_test, sent_train, sent_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)
text_train.shape
(1600, 2000)
# Fit a logistic-regression sentiment classifier (library default
# hyper-parameters) on the training split.
from sklearn.linear_model import LogisticRegression
# fit() returns the estimator itself, so construction and training chain.
classifier = LogisticRegression().fit(text_train, sent_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
# Evaluating the classifier on the held-out test split
from sklearn.metrics import confusion_matrix, accuracy_score
sent_pred = classifier.predict(text_test)
# Confusion matrix of the true test labels against the predicted labels.
cm = confusion_matrix(y_true=sent_test, y_pred=sent_pred)
cm
array([[168, 40], [ 21, 171]])
accuracy_score(sent_test, sent_pred)
0.8475
# Saving our classifier
# Create the output directory first: open(..., 'wb') raises
# FileNotFoundError when 'pre-trained-model/' does not exist yet.
import os
os.makedirs('pre-trained-model', exist_ok=True)
with open('pre-trained-model/classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)
# Saving the Tf-Idf model
# The fitted vectorizer is stored next to the classifier so that new text can
# later be mapped into the same feature space the classifier was trained on.
with open('pre-trained-model/tfidfmodel.pickle', 'wb') as f:
    pickle.dump(vectorizer, f)