#!/usr/bin/env python
# coding: utf-8

# In[ ]:

get_ipython().system('pip3 install scikit-multilearn')

# In[ ]:

import csv
import datetime as dt
import os
import re
import sqlite3
import warnings
from datetime import datetime

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn import metrics
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from skmultilearn.adapt import mlknn
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from sqlalchemy import create_engine # database connection
from wordcloud import WordCloud

# # Stack Overflow: Tag Prediction

# In[ ]:

from google.colab import drive
drive.mount('/content/drive')

#
Description
#
# Stack Overflow is the largest, most trusted online community for developers to learn, share their programming knowledge, and build their careers.
#
# Stack Overflow is something that every programmer uses in one way or another. Each month, over 50 million developers come to Stack Overflow to learn, share their knowledge, and build their careers. It features questions and answers on a wide range of topics in computer programming. The website serves as a platform for users to ask and answer questions, and, through membership and active participation, to vote questions and answers up or down and edit questions and answers in a fashion similar to a wiki or Digg. As of April 2014 Stack Overflow had over 4,000,000 registered users, and it exceeded 10,000,000 questions in late August 2015. Based on the type of tags assigned to questions, the top eight most discussed topics on the site are: Java, JavaScript, C#, PHP, Android, jQuery, Python and HTML.
#
#
# Problem Statement
# Suggest the tags based on the content that was there in the question posted on Stackoverflow. #Source: https://www.kaggle.com/c/facebook-recruiting-iii-keyword-extraction/
# ## Train.csv contains 4 columns: Id,Title,Body,Tags.# The questions are randomized and contains a mix of verbose text sites as well as sites related to math and programming. The number of questions from each site may vary, and no filtering has been performed on the questions (such as closed questions).
# Test.csv contains the same columns but without the Tags, which you are to predict.
# Size of Train.csv - 6.75GB
# Size of Test.csv - 2GB
# Number of rows in Train.csv = 6034195
#
# Id - Unique identifier for each question# #
# Title - The question's title
# Body - The body of the question
# Tags - The tags associated with the question in a space-separated format (all lowercase, should not contain tabs '\t' or ampersands '&')
#
# Title: Implementing Boundary Value Analysis of Software Testing in a C++ program? # Body :#\n\n ## #include< # iostream>\n # #include< # stdlib.h>\n\n # using namespace std;\n\n # int main()\n # {\n # int n,a[n],x,c,u[n],m[n],e[n][4];\n # cout<<"Enter the number of variables";\n cin>>n;\n\n # cout<<"Enter the Lower, and Upper Limits of the variables";\n # for(int y=1; y<n+1; y++)\n # {\n # cin>>m[y];\n # cin>>u[y];\n # }\n # for(x=1; x<n+1; x++)\n # {\n # a[x] = (m[x] + u[x])/2;\n # }\n # c=(n*4)-4;\n # for(int a1=1; a1<n+1; a1++)\n # {\n\n # e[a1][0] = m[a1];\n # e[a1][1] = m[a1]+1;\n # e[a1][2] = u[a1]-1;\n # e[a1][3] = u[a1];\n # }\n # for(int i=1; i<n+1; i++)\n # {\n # for(int l=1; l<=i; l++)\n # {\n # if(l!=1)\n # {\n # cout<<a[l]<<"\\t";\n # }\n # }\n # for(int j=0; j<4; j++)\n # {\n # cout<<e[i][j];\n # for(int k=0; k<n-(i+1); k++)\n # {\n # cout<<a[k]<<"\\t";\n # }\n # cout<<"\\n";\n # }\n # } \n\n # system("PAUSE");\n # return 0; \n # }\n #
The answer should come in the form of a table like
\n\n #\n\n ## 1 50 50\n # 2 50 50\n # 99 50 50\n # 100 50 50\n # 50 1 50\n # 50 2 50\n # 50 99 50\n # 50 100 50\n # 50 50 1\n # 50 50 2\n # 50 50 99\n # 50 50 100\n #
if the no of inputs is 3 and their ranges are\n # 1,100\n # 1,100\n # 1,100\n # (could be varied too)
\n\n #The output is not coming,can anyone correct the code or tell me what\'s wrong?
\n' # Tags : 'c++ c' #
It is a multi-label classification problem
# Multi-label Classification: Multilabel classification assigns to each sample a set of target labels. This can be thought as predicting properties of a data-point that are not mutually exclusive, such as topics that are relevant for a document. A question on Stackoverflow might be about any of C, Pointers, FileIO and/or memory-management at the same time or none of these.
# __Credit__: http://scikit-learn.org/stable/modules/multiclass.html
#
' in question:
questions_with_code+=1
is_code = 1
x = len(question)+len(title)
len_pre+=x
code = str(re.findall(r'(.*?)
', question, flags=re.DOTALL))
question=re.sub('(.*?)
', '', question, flags=re.MULTILINE|re.DOTALL)
question=striphtml(question.encode('utf-8'))
title=title.encode('utf-8')
question=str(title)+" "+str(question)
question=re.sub(r'[^A-Za-z]+',' ',question)
words=word_tokenize(str(question.lower()))
#Removing all single letter and and stopwords from question exceptt for the letter 'c'
question=' '.join(str(stemmer.stem(j)) for j in words if j not in stop_words and (len(j)!=1 or j=='c'))
len_post+=len(question)
tup = (question,code,tags,x,len(question),is_code)
questions_proccesed += 1
writer.execute("insert into QuestionsProcessed(question,code,tags,words_pre,words_post,is_code) values (?,?,?,?,?,?)",tup)
if (questions_proccesed%100000==0):
print("number of questions completed=",questions_proccesed)
no_dup_avg_len_pre=(len_pre*1.0)/questions_proccesed
no_dup_avg_len_post=(len_post*1.0)/questions_proccesed
print( "Avg. length of questions(Title+Body) before processing: %d"%no_dup_avg_len_pre)
print( "Avg. length of questions(Title+Body) after processing: %d"%no_dup_avg_len_post)
print ("Percent of questions containing code: %d"%((questions_with_code*100.0)/questions_proccesed))
print("Time taken to run this cell :", datetime.now() - start)
# In[ ]:
# Commit any pending work and close both SQLite connections (conn_r and conn_w are
# opened in earlier cells). The original author's warning: leaving them open
# results in database locks.
conn_r.commit()
conn_w.commit()
conn_r.close()
conn_w.close()
# In[ ]:
# Print a sample of the preprocessed questions stored in the write database.
if os.path.isfile(write_db):
    conn_r = create_connection(write_db)
    if conn_r is not None:
        try:
            reader = conn_r.cursor()
            reader.execute("SELECT question From QuestionsProcessed LIMIT 10")
            print("Questions after preprocessed")
            print('='*100)
            # Iterate the cursor directly: a stray reader.fetchone() used to
            # throw away the first row, so only 9 of the 10 rows were shown.
            for row in reader:
                print(row)
                print('-'*100)
        finally:
            # Close only a connection that was actually opened here; the
            # read-only SELECT leaves nothing to commit.
            conn_r.close()
# In[ ]:
# Load the first 500,000 preprocessed questions (with their tags) into a DataFrame.
#check here
write_db = 'drive/My Drive/Stackoverflow/data/Processed.db'
if os.path.isfile(write_db):
    conn_r = create_connection(write_db)
    if conn_r is not None:
        try:
            preprocessed_data = pd.read_sql_query("""SELECT question, Tags FROM QuestionsProcessed LIMIT 500000 """, conn_r)
        finally:
            # Close inside the guard so a missing file or a failed connection
            # never reaches .close() on None or on a stale handle.
            conn_r.close()
# In[ ]:
preprocessed_data.head()
# In[ ]:
print("number of data points in sample :", preprocessed_data.shape[0])
print("number of dimensions :", preprocessed_data.shape[1])
# 4. Machine Learning Models
# 4.1 Converting tags for multilabel problems
#
#
# X y1 y2 y3 y4
#
#
# x1 0 1 1 0
#
#
# x1 1 0 0 0
#
#
# x1 0 1 0 0
#
#
# In[ ]:
# Encode the space-separated tag strings as a binary label matrix: one column per
# distinct tag. binary=True (a real boolean, not the string 'true', which newer
# scikit-learn releases reject) records presence/absence rather than counts.
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(), binary=True)
multilabel_y = vectorizer.fit_transform(preprocessed_data['tags'])
# __ We will sample the number of tags instead of considering all of them (due to limited computing power) __
# In[ ]:
def tags_to_choose(n, label_matrix=None):
    """Return the label matrix restricted to the ``n`` most frequent tags.

    Args:
        n: Number of tag columns to keep.
        label_matrix: Binary question-by-tag matrix (rows are questions,
            columns are tags). Defaults to the notebook-level
            ``multilabel_y`` so existing ``tags_to_choose(n)`` calls behave
            as before.

    Returns:
        The matrix with only the selected columns, ordered from most to
        least frequent tag. Ties keep their original column order.
    """
    if label_matrix is None:
        label_matrix = multilabel_y
    # Column sums give the number of questions carrying each tag.
    tag_counts = np.asarray(label_matrix.sum(axis=0)).ravel().tolist()
    by_frequency = sorted(
        range(len(tag_counts)), key=tag_counts.__getitem__, reverse=True
    )
    return label_matrix[:, by_frequency[:n]]


def questions_explained_fn(n, label_matrix=None):
    """Count the questions left with no tag when only the top ``n`` tags are kept.

    Despite the name, the value returned is the number of *uncovered*
    questions (rows whose selected tag columns are all zero).

    Args:
        n: Number of most frequent tags to keep.
        label_matrix: Optional binary question-by-tag matrix; defaults to the
            notebook-level ``multilabel_y``.

    Returns:
        Number of rows whose sum over the selected tag columns is zero.
    """
    selected = tags_to_choose(n, label_matrix)
    tags_per_question = selected.sum(axis=1)
    return np.count_nonzero(tags_per_question == 0)
# In[ ]:
# For every candidate tag budget (500, 600, ... below the total number of tags),
# record the percentage of questions that keep at least one tag.
total_tags = multilabel_y.shape[1]
total_qs = preprocessed_data.shape[0]
questions_explained = [
    np.round(((total_qs - questions_explained_fn(tag_budget)) / total_qs) * 100, 3)
    for tag_budget in range(500, total_tags, 100)
]
# In[ ]:
# Plot the coverage percentage against the number of tags kept.
fig, ax = plt.subplots()
# questions_explained holds one entry per budget in range(500, total_tags, 100),
# so the true x values are rebuilt from the list length. This replaces
# set_xticklabels() with hand-made labels, which relabelled whatever tick
# positions matplotlib happened to choose and could disagree with the data.
tag_counts = 500 + 100 * np.arange(len(questions_explained))
ax.plot(tag_counts, questions_explained)
plt.xlabel("Number of tags")
plt.ylabel("Number Questions coverd partially")
plt.grid()
plt.show()
# You can choose any number of tags based on your computing power; the author's
# note gives a minimum of 50 (said to cover 90% of the tags).
# Index 50 corresponds to 500 + 50 * 100 = 5500 tags.
print("with ",5500,"tags we are covering ",questions_explained[50],"% of questions")
# In[ ]:
# Keep the 500 most frequent tags and report how much of the data that leaves uncovered.
multilabel_yx = tags_to_choose(500)
uncovered_questions = questions_explained_fn(500)
print("number of questions that are not covered :", uncovered_questions,"out of 500000" )
# In[ ]:
tags_in_sample = multilabel_y.shape[1]
print("Number of tags in sample :", tags_in_sample)
print("number of tags taken :", multilabel_yx.shape[1],"(",(multilabel_yx.shape[1]/multilabel_y.shape[1])*100,"%)")
# __ We consider top 15% tags which covers 99% of the questions __
# 4.2 Split the data into test and train (80:20)
# In[ ]:
# Ordered (non-shuffled) 80:20 split: the first 80% of rows train, the rest test.
# Features and labels are cut at the same row index so they stay aligned.
total_size = preprocessed_data.shape[0]
train_size = int(0.80 * total_size)
test_size = total_size - train_size

x_train = preprocessed_data.head(train_size)
x_test = preprocessed_data.tail(test_size)

y_train = multilabel_yx[0:train_size, :]
y_test = multilabel_yx[train_size:total_size, :]
# In[ ]:
print("Number of data points in train data :", x_train.shape)
print("Number of data points in test data :", x_test.shape)
# In[ ]:
print("Number of data points in train data :", y_train.shape)
print("Number of data points in test data :", y_test.shape)
# 4.3 Featurizing data
# In[ ]:
# TF-IDF features over 1- to 3-grams of the whitespace-tokenised questions.
# The vocabulary is fitted on the training rows only and reused for the test rows.
start = datetime.now()
vectorizer = TfidfVectorizer(
    min_df=0.00009,
    max_features=200000,
    smooth_idf=True,
    norm="l2",
    tokenizer=lambda text: text.split(),
    sublinear_tf=False,
    ngram_range=(1, 3),
)
x_train_multilabel = vectorizer.fit_transform(x_train['question'])
x_test_multilabel = vectorizer.transform(x_test['question'])
print("Time taken to run this cell :", datetime.now() - start)
# In[ ]:
print("Dimensions of train data X:",x_train_multilabel.shape, "Y :",y_train.shape)
print("Dimensions of test data X:",x_test_multilabel.shape,"Y:",y_test.shape)
# In[ ]:
# https://www.analyticsvidhya.com/blog/2017/08/introduction-to-multi-label-classification/
#https://stats.stackexchange.com/questions/117796/scikit-multi-label-classification
# classifier = LabelPowerset(GaussianNB())
"""
from skmultilearn.adapt import MLkNN
classifier = MLkNN(k=21)
# train
classifier.fit(x_train_multilabel, y_train)
# predict
predictions = classifier.predict(x_test_multilabel)
print(accuracy_score(y_test,predictions))
print(metrics.f1_score(y_test, predictions, average = 'macro'))
print(metrics.f1_score(y_test, predictions, average = 'micro'))
print(metrics.hamming_loss(y_test,predictions))
"""
# we are getting memory error because the multilearn package
# is trying to convert the data into dense matrix
# ---------------------------------------------------------------------------
#MemoryError Traceback (most recent call last)
# in ()
#----> classifier.fit(x_train_multilabel, y_train)
# 4.4 Applying Logistic Regression with OneVsRest Classifier
# In[ ]:
# Author's note: this cell takes about 6-7 hours to run; prefer downloading the
# saved lr_with_equal_weight.pkl and using it for prediction instead of re-running.
# One-vs-rest wrapper: a separate SGD-trained classifier (log loss, L1 penalty) per
# selected tag, with n_jobs=-1 so the per-tag fits use all available cores.
classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l1'), n_jobs=-1)
classifier.fit(x_train_multilabel, y_train)
predictions = classifier.predict(x_test_multilabel)
# Report subset accuracy, macro/micro F1, Hamming loss and the per-tag report on the test split.
print("accuracy :",metrics.accuracy_score(y_test,predictions))
print("macro f1 score :",metrics.f1_score(y_test, predictions, average = 'macro'))
print("micro f1 scoore :",metrics.f1_score(y_test, predictions, average = 'micro'))
print("hamming loss :",metrics.hamming_loss(y_test,predictions))
print("Precision recall report :\n",metrics.classification_report(y_test, predictions))
# In[ ]:
# Persist the fitted one-vs-rest model so it can be reloaded without retraining.
try:
    # sklearn.externals.joblib is gone from current scikit-learn releases;
    # the standalone joblib package is the supported replacement.
    import joblib
except ImportError:
    # Fallback for old environments that only ship the vendored copy.
    from sklearn.externals import joblib
joblib.dump(classifier, 'lr_with_equal_weight.pkl')
# 4.5 Modeling with less data points (0.5M data points) and more weight to title and 500 tags only.
# In[ ]:
# Pass the CREATE TABLE IF NOT EXISTS statement for QuestionsProcessed to the
# create_database_table helper (defined elsewhere in the notebook) for the
# database used by the title-weighted experiment.
sql_create_table = """CREATE TABLE IF NOT EXISTS QuestionsProcessed (question text NOT NULL, code text, tags text, words_pre integer, words_post integer, is_code integer);"""
create_database_table("drive/My Drive/Stackoverflow/MyTitlemoreweight.db", sql_create_table)
# In[ ]:
# SQLAlchemy engine for the same database file. In the visible code it is only
# referenced by the disabled export of the de-duplicated DataFrame below.
title_more = create_engine("sqlite:///drive/My Drive/Stackoverflow/MyTitlemoreweight.db")
# no_dup = pd.DataFrame(df_no_dup, columns=['Title', 'Body', 'Tags'])
# no_dup.to_sql('no_dup_train',title_more)
# In[ ]:
# http://www.sqlitetutorial.net/sqlite-delete/
# https://stackoverflow.com/questions/2279706/select-random-row-from-a-sqlite-table
#check here
read_db = 'drive/My Drive/Stackoverflow/data/train_no_dup.db'
write_db = 'drive/My Drive/Stackoverflow/MyTitlemoreweight.db'
train_datasize = 400000

# Source side: open a cursor over the de-duplicated training questions.
if os.path.isfile(read_db):
    conn_r = create_connection(read_db)
    if conn_r is not None:
        reader = conn_r.cursor()
        # Select the first rows in table order. One more than 0.5M is requested
        # because the preprocessing cell discards the first fetched row with
        # reader.fetchone() before it starts looping.
        reader.execute("SELECT Title, Body, Tags From no_dup_train LIMIT 500001;")
        # for selecting random points
        #reader.execute("SELECT Title, Body, Tags From no_dup_train ORDER BY RANDOM() LIMIT 500001;")

# Destination side: open a cursor on the output database and empty the target
# table when the database already contains tables, so reruns do not append
# duplicate rows.
if os.path.isfile(write_db):
    conn_w = create_connection(write_db)
    if conn_w is not None:
        tables = checkTableExists(conn_w)
        writer = conn_w.cursor()
        if tables != 0:
            writer.execute("DELETE FROM QuestionsProcessed WHERE 1")
            print("Cleared All the rows")
# 4.5.1 Preprocessing of questions
#
# - Separate Code from Body
# - Remove special characters from the question title and description (not in code)
# - Give more weightage to title : Add title three times to the question
#
# - Remove stop words (Except 'C')
# - Remove HTML Tags
# - Convert all the characters into small letters
# - Use SnowballStemmer to stem the words
#
# In[ ]:
#http://www.bernzilla.com/2008/05/13/selecting-a-random-row-from-an-sqlite-table/
# Preprocess every question read from `reader` and insert the result through `writer`:
# split code from prose, strip HTML, repeat the title three times to weight it,
# drop stop words, stem, and store the cleaned text with a few length statistics.
import nltk
nltk.download('punkt')

start = datetime.now()
preprocessed_data_list=[]
# Discard the first fetched row (the reader query uses LIMIT 500001, leaving 500000).
reader.fetchone()
questions_with_code=0
len_pre=0
len_post=0
questions_proccesed = 0

# The HTML tag literals had been stripped from the patterns, leaving `'' in question`
# (always true) and a regex broken across lines. Restored here and compiled once,
# outside the loop; DOTALL lets a code block span several lines.
code_block = re.compile(r'<code>(.*?)</code>', flags=re.DOTALL)

for row in reader:
    is_code = 0
    title, question, tags = row[0], row[1], str(row[2])

    if '<code>' in question:
        questions_with_code+=1
        is_code = 1
    # Lengths are measured in characters (title + raw body), before any cleaning.
    x = len(question)+len(title)
    len_pre+=x

    # Keep the code snippets separately, then remove them from the prose.
    code = str(code_block.findall(question))
    question = code_block.sub('', question)
    # NOTE(review): striphtml is defined outside this cell; the bytes argument is
    # kept as-is because its input handling cannot be verified from here.
    question=striphtml(question.encode('utf-8'))

    # adding title three time to the data to increase its weight
    # The title stays a str: str() of an encoded bytes object would embed the
    # b'...' repr and backslash escapes into the text.
    question=str(title)+" "+str(title)+" "+str(title)+" "+question

    # Keep letters, digits and the characters # + . - (as in c#, c++, .net); everything
    # else becomes a space.
    question=re.sub(r'[^A-Za-z0-9#+.\-]+',' ',question)
    words=word_tokenize(str(question.lower()))

    # Remove stop words and all single-letter tokens except 'c', then stem.
    question=' '.join(str(stemmer.stem(j)) for j in words if j not in stop_words and (len(j)!=1 or j=='c'))

    len_post+=len(question)
    # words_pre / words_post receive character lengths, matching x and len(question).
    tup = (question,code,tags,x,len(question),is_code)
    questions_proccesed += 1
    writer.execute("insert into QuestionsProcessed(question,code,tags,words_pre,words_post,is_code) values (?,?,?,?,?,?)",tup)
    if (questions_proccesed%100000==0):
        print("number of questions completed=",questions_proccesed)

no_dup_avg_len_pre=(len_pre*1.0)/questions_proccesed
no_dup_avg_len_post=(len_post*1.0)/questions_proccesed

print( "Avg. length of questions(Title+Body) before processing: %d"%no_dup_avg_len_pre)
print( "Avg. length of questions(Title+Body) after processing: %d"%no_dup_avg_len_post)
print ("Percent of questions containing code: %d"%((questions_with_code*100.0)/questions_proccesed))

print("Time taken to run this cell :", datetime.now() - start)
# In[ ]:
# Commit the inserted rows and close both SQLite connections. The original
# author's warning: connections left open end up causing database locks.
conn_r.commit()
conn_w.commit()
conn_r.close()
conn_w.close()
# __ Sample questions after preprocessing of data __
# In[ ]:
# Print a sample of the title-weighted, preprocessed questions.
if os.path.isfile(write_db):
    conn_r = create_connection(write_db)
    if conn_r is not None:
        try:
            reader = conn_r.cursor()
            reader.execute("SELECT question From QuestionsProcessed LIMIT 50")
            print("Questions after preprocessed")
            print('='*100)
            # Iterate the cursor directly: a stray reader.fetchone() used to
            # discard the first row, so one fewer row than requested was shown.
            for row in reader:
                print(row)
                print('-'*100)
        finally:
            # Close only a connection that was actually opened here; the
            # read-only SELECT leaves nothing to commit.
            conn_r.close()
# __ Saving Preprocessed data to a Database __
# In[ ]:
# Debug cell: run the preprocessing steps on two rows and print the intermediate
# text. Nothing is written to a database here, but the running counters of the
# main preprocessing cell (questions_with_code, len_pre, len_post,
# questions_proccesed) are still incremented.
i=0
conn_r = create_connection(read_db)
if conn_r is not None:
    try:
        reader = conn_r.cursor()
        reader.execute("SELECT Title, Body, Tags From no_dup_train LIMIT 2;")
        # The HTML tag literals had been stripped from these patterns; restored.
        code_block = re.compile(r'<code>(.*?)</code>', flags=re.DOTALL)
        for row in reader:
            is_code = 0
            title, question, tags = row[0], row[1], str(row[2])
            if '<code>' in question:
                questions_with_code+=1
                is_code = 1
            # Character count of title + raw body.
            x = len(question)+len(title)
            len_pre+=x
            code = str(code_block.findall(question))
            question = code_block.sub('', question)
            # NOTE(review): striphtml is defined outside this cell; the bytes
            # argument is kept because its input handling is not visible here.
            question=striphtml(question.encode('utf-8'))
            # The title stays a str so the printed and concatenated text does not
            # carry the b'...' repr of an encoded bytes object.
            print("title ",title," title")
            # adding title three time to the data to increase its weight
            question=str(title)+" "+str(title)+" "+str(title)+" "+question
            print(question)
            question=re.sub(r'[^A-Za-z0-9#+.\-]+',' ',question)
            words=word_tokenize(str(question.lower()))
            # Remove stop words and all single-letter tokens except 'c', then stem.
            question=' '.join(str(stemmer.stem(j)) for j in words if j not in stop_words and (len(j)!=1 or j=='c'))
            len_post+=len(question)
            tup = (question,code,tags,x,len(question),is_code)
            questions_proccesed += 1
            print(" Final Question: ",question)
    finally:
        # Closed inside the guard so a failed connection never reaches .close().
        conn_r.close()
# In[ ]:
preprocessed_old.head()
# In[ ]:
start = datetime.now()
#check here
# Load every preprocessed, title-weighted question (0.5M rows were inserted) into a DataFrame.
write_db = 'drive/My Drive/Stackoverflow/MyTitlemoreweight.db'
if os.path.isfile(write_db):
    conn_r = create_connection(write_db)
    print(type(conn_r))
    if conn_r is not None:
        print(type(conn_r),"inside")
        try:
            preprocessed_data = pd.read_sql_query("""SELECT question, Tags FROM QuestionsProcessed""", conn_r)
            print("done")
        finally:
            # Close inside the guard so a missing file or a failed connection
            # never reaches .close() on None or on a stale handle.
            conn_r.close()
print("Time taken to run this cell :", datetime.now() - start)
# In[ ]:
# NOTE(review): conn_r is closed at the end of the preceding cell, so when the
# cells run in file order this call operates on a closed connection. Confirm
# whether this scratch call is still needed.
checkTableExists(conn_r)
def checkTableExists(dbcon):
    """Report the tables present in an SQLite database and return their count.

    Prints a header line followed by the name of the first table found, if
    there is one.

    Args:
        dbcon: An open DB-API connection to an SQLite database.

    Returns:
        The number of tables listed in ``sqlite_master``; 0 for an empty
        database, which callers compare against to decide whether there is
        anything to clear.
    """
    # Named `query` rather than `str` so the builtin is not shadowed.
    query = "select name from sqlite_master where type='table'"
    cursr = dbcon.cursor()
    try:
        tables = cursr.execute(query).fetchall()
    finally:
        cursr.close()
    print("Tables in the databse:")
    # An empty database used to raise IndexError on tables[0][0].
    if tables:
        print(tables[0][0])
    return len(tables)
# In[ ]:
# NOTE(review): scratch cell that looks like a pasted copy of the body of
# checkTableExists. In the visible code `cursr` is only a local variable of that
# function, so it is not defined at this level, and `str` here is the builtin
# type because the query text was assigned to `cou` instead. Confirm whether
# this cell should be fixed or removed.
cou = "select name from sqlite_master where type='table'"
table_names = cursr.execute(str)
print("Tables in the databse:")
tables =table_names.fetchall()
# In[ ]:
preprocessed_data.head()
# In[ ]:
print("number of data points in sample :", preprocessed_data.shape[0])
print("number of dimensions :", preprocessed_data.shape[1])
# __ Converting string tags to multilabel output variables __
# In[ ]:
# Encode the space-separated tag strings as a binary label matrix: one column per
# distinct tag. binary=True (a real boolean, not the string 'true', which newer
# scikit-learn releases reject) records presence/absence rather than counts.
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(), binary=True)
multilabel_y = vectorizer.fit_transform(preprocessed_data['tags'])
# In[ ]:
# __ Selecting 500 Tags __
# In[ ]:
# For every candidate tag budget (500, 600, ... below the total number of tags),
# record the percentage of questions that keep at least one tag.
total_tags = multilabel_y.shape[1]
total_qs = preprocessed_data.shape[0]
questions_explained = [
    np.round(((total_qs - questions_explained_fn(tag_budget)) / total_qs) * 100, 3)
    for tag_budget in range(500, total_tags, 100)
]
# In[ ]:
# Plot the coverage percentage against the number of tags kept.
fig, ax = plt.subplots()
# questions_explained holds one entry per budget in range(500, total_tags, 100),
# so the true x values are rebuilt from the list length. This replaces
# set_xticklabels() with hand-made labels, which relabelled whatever tick
# positions matplotlib happened to choose and could disagree with the data.
tag_counts = 500 + 100 * np.arange(len(questions_explained))
ax.plot(tag_counts, questions_explained)
plt.xlabel("Number of tags")
plt.ylabel("Number Questions coverd partially")
plt.grid()
plt.show()
# You can choose any number of tags based on your computing power; the author's
# note gives a minimum of 500 (said to cover 90% of the tags).
# Index 50 corresponds to 500 + 50 * 100 = 5500 tags, index 0 to 500 tags.
print("with ",5500,"tags we are covering ",questions_explained[50],"% of questions")
print("with ",500,"tags we are covering ",questions_explained[0],"% of questions")
# In[ ]:
# Restrict the label matrix to the 500 most frequent tags and report how many
# questions end up without any of those tags.
multilabel_yx = tags_to_choose(500)
uncovered_questions = questions_explained_fn(500)
print("number of questions that are not covered :", uncovered_questions,"out of 500000")
# In[ ]:
# Work on the first 200k questions only, split 80:20 in row order. The label
# matrix is sliced with the same row indices so features and labels stay aligned.
sub_size = 200000
preprocessed_data_sub = preprocessed_data.head(sub_size)
total_size = preprocessed_data_sub.shape[0]
train_size = int(0.80 * total_size)
test_size = total_size - train_size

x_train = preprocessed_data_sub.head(train_size)
x_test = preprocessed_data_sub.tail(test_size)

y_train = multilabel_yx[0:train_size, :]
y_test = multilabel_yx[train_size:total_size, :]
# In[ ]:
print("Number of data points in train data :", y_train.shape)
print("Number of data points in test data :", y_test.shape)
# 4.5.2 Featurizing data with TfIdf vectorizer
# In[ ]:
# TF-IDF features over 1- to 3-grams of the whitespace-tokenised questions.
# The vocabulary is fitted on the training rows only and reused for the test rows.
start = datetime.now()
vectorizer = TfidfVectorizer(
    min_df=0.00009,
    max_features=200000,
    smooth_idf=True,
    norm="l2",
    tokenizer=lambda text: text.split(),
    sublinear_tf=False,
    ngram_range=(1, 3),
)
x_train_multilabel = vectorizer.fit_transform(x_train['question'])
x_test_multilabel = vectorizer.transform(x_test['question'])
print("Time taken to run this cell :", datetime.now() - start)
# In[ ]:
# Ordered split of the full sample: the first 400k rows train, the remainder tests.
train_datasize = 400000
n_rows = preprocessed_data.shape[0]

x_train = preprocessed_data.head(train_datasize)
x_test = preprocessed_data.tail(n_rows - train_datasize)

y_train = multilabel_yx[0:train_datasize, :]
y_test = multilabel_yx[train_datasize:n_rows, :]
# In[ ]:
print("Number of data points in train data :", y_train.shape)
print("Number of data points in test data :", y_test.shape)
# 4.5.2 Featurizing data with a bag-of-words CountVectorizer (1- to 4-grams)
# In[ ]:
# Bag-of-words counts over 1- to 4-grams of the whitespace-tokenised questions.
# The vocabulary is fitted on the training rows only and reused for the test rows.
start = datetime.now()
vectorizer = CountVectorizer(
    min_df=0.00009,
    max_features=200000,
    tokenizer=lambda text: text.split(),
    ngram_range=(1, 4),
)
x_train_multilabel = vectorizer.fit_transform(x_train['question'])
x_test_multilabel = vectorizer.transform(x_test['question'])
print("Time taken to run this cell :", datetime.now() - start)
# In[ ]:
# NOTE(review): this stores the training feature matrix, not a fitted model,
# under a file name that reads like a model ("lr_with_..."); confirm the
# intended object and file name.
joblib.dump(x_train_multilabel, 'lr_with_more_title_weight.pkl')
# In[ ]:
print("Dimensions of train data X:",x_train_multilabel.shape, "Y :",y_train.shape)
print("Dimensions of test data X:",x_test_multilabel.shape,"Y:",y_test.shape)
# 4.5.3 Applying Logistic Regression with OneVsRest Classifier
# In[ ]:
# Fit a one-vs-rest SGD classifier (log loss, L1 penalty) on the current features
# and report accuracy, Hamming loss, micro/macro precision-recall-F1 and the
# per-tag classification report on the test split.
start = datetime.now()
classifier = OneVsRestClassifier(
    SGDClassifier(loss='log', alpha=0.00001, penalty='l1')
)
classifier.fit(x_train_multilabel, y_train)
predictions = classifier.predict(x_test_multilabel)

print("Accuracy :", metrics.accuracy_score(y_test, predictions))
print("Hamming loss ", metrics.hamming_loss(y_test, predictions))

# Same three scores for each averaging mode, micro first and macro second.
for averaging, heading in (('micro', "Micro-average quality numbers"),
                           ('macro', "Macro-average quality numbers")):
    precision = precision_score(y_test, predictions, average=averaging)
    recall = recall_score(y_test, predictions, average=averaging)
    f1 = f1_score(y_test, predictions, average=averaging)
    print(heading)
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print(metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)
# In[ ]:
# Fit a one-vs-rest L1-regularised LogisticRegression on the current features and
# report accuracy, Hamming loss, micro/macro precision-recall-F1 and the per-tag
# classification report on the test split.
start = datetime.now()
# solver='liblinear' is pinned because penalty='l1' is rejected by the lbfgs
# solver, which became scikit-learn's default; liblinear was the earlier default,
# so older environments behave as before.
classifier_2 = OneVsRestClassifier(
    LogisticRegression(penalty='l1', solver='liblinear'), n_jobs=-1
)
x_train_multilabel.sort_indices()
classifier_2.fit(x_train_multilabel, y_train)
predictions_2 = classifier_2.predict(x_test_multilabel)

print("Accuracy :",metrics.accuracy_score(y_test, predictions_2))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions_2))

# Same three scores for each averaging mode, micro first and macro second.
for averaging, heading in (('micro', "Micro-average quality numbers"),
                           ('macro', "Macro-average quality numbers")):
    precision = precision_score(y_test, predictions_2, average=averaging)
    recall = recall_score(y_test, predictions_2, average=averaging)
    f1 = f1_score(y_test, predictions_2, average=averaging)
    print(heading)
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions_2))
print("Time taken to run this cell :", datetime.now() - start)
# In[ ]:
# Persist the fitted one-vs-rest LogisticRegression model to Drive.
try:
    # sklearn.externals.joblib is gone from current scikit-learn releases;
    # the standalone joblib package is the supported replacement.
    import joblib
except ImportError:
    # Fallback for old environments that only ship the vendored copy.
    from sklearn.externals import joblib
joblib.dump(classifier_2, 'drive/My Drive/Stackoverflow/Mylrnew_with_more_title_weight.pkl')
# In[ ]:
# Fit a one-vs-rest L1-regularised LogisticRegression on the current features and
# report accuracy, Hamming loss, micro/macro precision-recall-F1 and the per-tag
# classification report on the test split.
start = datetime.now()
# solver='liblinear' is pinned because penalty='l1' is rejected by the lbfgs
# solver, which became scikit-learn's default; liblinear was the earlier default,
# so older environments behave as before.
classifier_2 = OneVsRestClassifier(
    LogisticRegression(penalty='l1', solver='liblinear'), n_jobs=-1
)
classifier_2.fit(x_train_multilabel, y_train)
predictions_2 = classifier_2.predict(x_test_multilabel)

print("Accuracy :",metrics.accuracy_score(y_test, predictions_2))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions_2))

# Same three scores for each averaging mode, micro first and macro second.
for averaging, heading in (('micro', "Micro-average quality numbers"),
                           ('macro', "Macro-average quality numbers")):
    precision = precision_score(y_test, predictions_2, average=averaging)
    recall = recall_score(y_test, predictions_2, average=averaging)
    f1 = f1_score(y_test, predictions_2, average=averaging)
    print(heading)
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions_2))
print("Time taken to run this cell :", datetime.now() - start)
# 5. Assignments
#
# - Use bag of words upto 4 grams and compute the micro f1 score with Logistic regression(OvR)
# - Perform hyperparam tuning on alpha (or lambda) for Logistic regression to improve the performance using GridSearch
# - Try OneVsRestClassifier with Linear-SVM (SGDClassifier with loss-hinge)
#
# In[ ]:
## Output
# 1st - bag of words upto 4 grams SGDClassifier(loss='log') 0.5M data points
# 2nd - bag of words upto 4 grams Logistic Regression 0.5M data points
# 3rd - tfidf(1 to 3 grams) 200k data points Logistic Regression grid search hyperparameter: alpha
# 4th - tfidf(1 to 3 grams) 200k data points (SGDClassifier with loss-hinge) grid search hyperparameter: alpha
# In[ ]:
# Fit a one-vs-rest SGD classifier (log loss, L1 penalty) on the current features
# and report accuracy, Hamming loss, micro/macro precision-recall-F1 and the
# per-tag classification report on the test split.
start = datetime.now()
classifier = OneVsRestClassifier(
    SGDClassifier(loss='log', alpha=0.00001, penalty='l1')
)
classifier.fit(x_train_multilabel, y_train)
predictions = classifier.predict(x_test_multilabel)

print("Accuracy :", metrics.accuracy_score(y_test, predictions))
print("Hamming loss ", metrics.hamming_loss(y_test, predictions))

# Same three scores for each averaging mode, micro first and macro second.
for averaging, heading in (('micro', "Micro-average quality numbers"),
                           ('macro', "Macro-average quality numbers")):
    precision = precision_score(y_test, predictions, average=averaging)
    recall = recall_score(y_test, predictions, average=averaging)
    f1 = f1_score(y_test, predictions, average=averaging)
    print(heading)
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print(metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)
# In[ ]:
# Persist the fitted one-vs-rest SGD model to Drive.
try:
    # sklearn.externals.joblib is gone from current scikit-learn releases;
    # the standalone joblib package is the supported replacement.
    import joblib
except ImportError:
    # Fallback for old environments that only ship the vendored copy.
    from sklearn.externals import joblib
joblib.dump(classifier, 'drive/My Drive/Stackoverflow/MySgd_with_more_title_weight.pkl')
# ### LR with 0.5M data points BOW
# In[ ]:
# Fit a one-vs-rest L1-regularised LogisticRegression on the current features and
# report accuracy, Hamming loss, micro/macro precision-recall-F1 and the per-tag
# classification report on the test split.
start = datetime.now()
# solver='liblinear' is pinned because penalty='l1' is rejected by the lbfgs
# solver, which became scikit-learn's default; liblinear was the earlier default,
# so older environments behave as before.
classifier_2 = OneVsRestClassifier(
    LogisticRegression(penalty='l1', solver='liblinear'), n_jobs=-1
)
x_train_multilabel.sort_indices()
classifier_2.fit(x_train_multilabel, y_train)
predictions_2 = classifier_2.predict(x_test_multilabel)

print("Accuracy :",metrics.accuracy_score(y_test, predictions_2))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions_2))

# Same three scores for each averaging mode, micro first and macro second.
for averaging, heading in (('micro', "Micro-average quality numbers"),
                           ('macro', "Macro-average quality numbers")):
    precision = precision_score(y_test, predictions_2, average=averaging)
    recall = recall_score(y_test, predictions_2, average=averaging)
    f1 = f1_score(y_test, predictions_2, average=averaging)
    print(heading)
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions_2))
print("Time taken to run this cell :", datetime.now() - start)
# In[ ]:
# Persist the fitted one-vs-rest LogisticRegression model to Drive.
try:
    # sklearn.externals.joblib is gone from current scikit-learn releases;
    # the standalone joblib package is the supported replacement.
    import joblib
except ImportError:
    # Fallback for old environments that only ship the vendored copy.
    from sklearn.externals import joblib
joblib.dump(classifier_2, 'drive/My Drive/Stackoverflow/Mylrnew_with_more_title_weight.pkl')
# ### Gridsearchcv on 200k data points Tfidf Vectors
# In[ ]:
# 3-fold grid search over the SGD regularisation strength alpha for the
# one-vs-rest log-loss model, scored by micro-averaged F1.
from sklearn.model_selection import GridSearchCV

start = datetime.now()
classifier_2 = OneVsRestClassifier(
    SGDClassifier(loss='log', penalty='l1'),
    n_jobs=-1,
)
# alpha candidates: 10**-5, 10**-4, ..., 10**1
param = {'estimator__alpha': [10**exponent for exponent in range(-5, 2)]}
grid_search_model = GridSearchCV(
    estimator=classifier_2,
    param_grid=param,
    cv=3,
    verbose=0,
    return_train_score=True,
    scoring='f1_micro',
    n_jobs=-1,
)
x_train_multilabel.sort_indices()
grid_search_model.fit(x_train_multilabel, y_train)
# In[ ]:
print("Time taken to run this cell :", datetime.now() - start)
# In[ ]:
# Plot mean train and cross-validation micro-F1 against alpha on a log axis,
# labelling each tick with the raw alpha value.
alpha = [10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**1]
log_alpha = np.log(alpha)

train_score = grid_search_model.cv_results_['mean_train_score']
test_score = grid_search_model.cv_results_['mean_test_score']

# Draw markers plus a connecting line for each score series, train first.
for scores, legend_name in ((train_score, "Train Score"), (test_score, "Test Score")):
    plt.scatter(log_alpha, scores, label=legend_name)
    plt.plot(log_alpha, scores)

plt.xlabel("Alpha : Hyperparameter")
plt.ylabel("Micro F1 Score")
plt.xticks(log_alpha, alpha)
plt.legend()
# In[ ]:
best_alpha = grid_search_model.best_estimator_.get_params()['estimator__alpha']
print('Best alpha: ',best_alpha)
# In[ ]:
# Persist the model selected by the grid search.
try:
    # sklearn.externals.joblib is gone from current scikit-learn releases;
    # the standalone joblib package is the supported replacement.
    import joblib
except ImportError:
    # Fallback for old environments that only ship the vendored copy.
    from sklearn.externals import joblib
# GridSearchCV fits clones of the estimator it is given, so classifier_2 itself
# stays unfitted; the refitted winner lives in best_estimator_.
joblib.dump(grid_search_model.best_estimator_, 'drive/My Drive/Stackoverflow/gridlog_with_more_title_weight.pkl')
# In[ ]:
# Fit a one-vs-rest SGD classifier (log loss, L1 penalty) on the current features
# and report accuracy, Hamming loss, micro/macro precision-recall-F1 and the
# per-tag classification report on the test split.
# NOTE(review): alpha is hard-coded to 0.00001 here rather than taken from the
# grid-search result (best_alpha); confirm that this is intended.
start = datetime.now()
classifier = OneVsRestClassifier(
    SGDClassifier(loss='log', alpha=0.00001, penalty='l1')
)
classifier.fit(x_train_multilabel, y_train)
predictions = classifier.predict(x_test_multilabel)

print("Accuracy :", metrics.accuracy_score(y_test, predictions))
print("Hamming loss ", metrics.hamming_loss(y_test, predictions))

# Same three scores for each averaging mode, micro first and macro second.
for averaging, heading in (('micro', "Micro-average quality numbers"),
                           ('macro', "Macro-average quality numbers")):
    precision = precision_score(y_test, predictions, average=averaging)
    recall = recall_score(y_test, predictions, average=averaging)
    f1 = f1_score(y_test, predictions, average=averaging)
    print(heading)
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print(metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)
# In[ ]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Persist the fitted log-loss one-vs-rest model.
joblib.dump(classifier, 'drive/My Drive/Stackoverflow/bestlog200k_with_more_title_weight.pkl')
# ### SGD with hinge loss
#
#
# In[ ]:
from sklearn.model_selection import GridSearchCV

# Tune alpha for a one-vs-rest SGD classifier with hinge loss and L1 penalty,
# using 3-fold cross-validation scored by micro-averaged F1.
start = datetime.now()
hinge_sgd = SGDClassifier(loss='hinge', penalty='l1')
classifier_2 = OneVsRestClassifier(hinge_sgd, n_jobs=-1)
param = {'estimator__alpha': [10**exponent for exponent in range(-5, 2)]}
grid_search_model = GridSearchCV(
    estimator=classifier_2,
    param_grid=param,
    scoring='f1_micro',
    cv=3,
    n_jobs=-1,
    return_train_score=True,  # needed for cv_results_['mean_train_score']
    verbose=0,
)
# NOTE(review): presumably x_train_multilabel is a scipy sparse matrix, in which
# case this sorts its indices in place before fitting - confirm.
x_train_multilabel.sort_indices()
grid_search_model.fit(x_train_multilabel, y_train)
print("Time taken to run this cell :", datetime.now() - start)
# In[ ]:
# Regularisation strengths explored by the grid search: 10^-5 ... 10^1.
alpha = [10**exponent for exponent in range(-5, 2)]
# In[ ]:
# Natural log of each alpha, used as evenly spaced x-coordinates when plotting.
log_alpha = np.log(alpha)
# In[ ]:
# Per-alpha scores averaged over the CV folds. 'mean_train_score' is only
# populated when the search was created with return_train_score=True.
cv_results = grid_search_model.cv_results_
train_score = cv_results['mean_train_score']
# In[ ]:
# 'mean_test_score' is the mean score on the held-out CV folds.
test_score = cv_results['mean_test_score']
# In[ ]:
# Draw train and CV score curves against log(alpha); each curve is a scatter
# of the evaluated points joined by a line.
for scores, legend_label in ((train_score, "Train Score"), (test_score, "Test Score")):
    plt.scatter(log_alpha, scores, label=legend_label)
    plt.plot(log_alpha, scores)
plt.xlabel("Alpha : Hyperparameter")
plt.ylabel("Micro F1 Score")
# Ticks sit at log(alpha) positions but are labelled with the raw alpha values.
plt.xticks(log_alpha, alpha)
plt.legend()
# In[ ]:
# Alpha value that achieved the best mean CV score in the grid search.
best_alpha = grid_search_model.best_params_['estimator__alpha']
print('Best alpha: ', best_alpha)
# In[ ]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Persist the fitted grid search (cv_results_, best_params_, best_estimator_).
# GridSearchCV fits clones of its estimator, so `classifier_2` itself is never
# fitted and pickling it would only save an untrained template.
joblib.dump(grid_search_model, 'drive/My Drive/Stackoverflow/gridhinge_with_more_title_weight.pkl')
# In[ ]:
# Shell out via IPython to count the lines in /proc/cpuinfo that begin with
# "processor" (Linux only).
# NOTE(review): presumably a check of how many cores n_jobs=-1 can use - confirm.
get_ipython().system('grep -c ^processor /proc/cpuinfo')
# In[ ]:
# Train a one-vs-rest SGD classifier with hinge loss and L1 penalty at
# alpha = 1e-5, then report test-set metrics.
start = datetime.now()
classifier = OneVsRestClassifier(
    SGDClassifier(loss='hinge', alpha=1e-5, penalty='l1'),
    n_jobs=-1,
)
classifier.fit(x_train_multilabel, y_train)
predictions = classifier.predict(x_test_multilabel)

# NOTE(review): assuming y_test is a multilabel indicator matrix, accuracy_score
# is the strict subset (exact-match) accuracy.
print("Accuracy :", metrics.accuracy_score(y_test, predictions))
print("Hamming loss ", metrics.hamming_loss(y_test, predictions))

# Same three metrics, first micro-averaged and then macro-averaged.
for averaging, heading in (
    ('micro', "Micro-average quality numbers"),
    ('macro', "Macro-average quality numbers"),
):
    precision = precision_score(y_test, predictions, average=averaging)
    recall = recall_score(y_test, predictions, average=averaging)
    f1 = f1_score(y_test, predictions, average=averaging)
    print(heading)
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print(metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)
# In[ ]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Persist the fitted hinge-loss one-vs-rest model.
joblib.dump(classifier, 'drive/My Drive/Stackoverflow/MySgdhinge_with_more_title_weight.pkl')
# ## Output
# ### 1st - bag of words up to 4 grams SGDClassifier(loss='log') 0.5M data points
# ### 2nd - bag of words up to 4 grams Logistic Regression 0.5M data points
# ### 3rd - tfidf(1 to 3 grams) 200k data points Logistic Regression grid search hyperparameter: alpha
# ### 4th - tfidf(1 to 3 grams) 200k data points SGDClassifier(loss='hinge') grid search hyperparameter: alpha
# In[ ]:
# Side-by-side summary of the four models, rendered with PrettyTable
# (http://zetcode.com/python/prettytable/). The metric values are entered by
# hand as literals; nothing here is computed.
# If this import raises ModuleNotFoundError, run: pip3 install prettytable
from prettytable import PrettyTable

x = PrettyTable()
x.field_names = [
    "Data points", "Vectorizer", "Model", "Hyper Parameter", "Penalty",
    "Micro F1 Score", "Hamming Loss", "Macro F1 Score", "Accuracy",
]
summary_rows = [
    ["500k", "BOW(upto 4 grams)", "SGDClassifier(loss='log')",
     "Fixed : 10^-5", "L1", 0.3544, 0.0059, 0.2634, 0.092],
    ["500k", "BOW(upto 4 grams)", "Logisitic Regression",
     "Fixed:10^-5", "L1", 0.4792, 0.0031, 0.3852, 0.2101],
    ["200k", "TFIDF(upto 3 grams)",
     "Logisitic Regression implementation using SGDclassifier(loss='log')",
     "Best Alpha:10^-5", "L1", 0.6968, 0.00205, 0.2415, 0.3487],
    ["200k", "TFIDF(upto 3 grams)", "SGDClassifier(loss='hinge')",
     "Best Alpha:10^-5", "L1", 0.6943, 0.0019605, 0.1880, 0.3644],
]
for summary_row in summary_rows:
    x.add_row(summary_row)
print(x)
# ### Conclusion:
#
# 1. Due to the high computational cost, we used 0.5M data points for two of the models and 0.2M for the other two, and considered 90% tags, i.e. 500 tags, for all models.
# 2. We get the highest accuracy with SGDClassifier(loss='hinge') | Best Alpha: 10^-5 | L1.
# 3. SGDClassifier with hinge loss performs better in terms of execution time and scores.
# 4. SGDClassifier with hinge loss took almost one third of the time on the same data.
# 5. SGDClassifier with log loss performs better than Logistic Regression in terms of execution time.