#!/usr/bin/env python
# coding: utf-8

# In[ ]:

get_ipython().system('pip3 install scikit-multilearn')


# In[ ]:

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import sqlite3
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import re
import os
from sqlalchemy import create_engine  # database connection
import datetime as dt

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from skmultilearn.adapt import MLkNN
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB
from datetime import datetime


# # Stack Overflow: Tag Prediction

# In[ ]:

from google.colab import drive
drive.mount('/content/drive')

# # 1. Business Problem

# ## 1.1 Description

# Stack Overflow is the largest, most trusted online community for developers to learn, share their programming knowledge, and build their careers.
#
# Stack Overflow is something that almost every programmer uses in one way or another. Each month, over 50 million developers come to Stack Overflow to learn, share their knowledge, and build their careers. It features questions and answers on a wide range of topics in computer programming. The website serves as a platform for users to ask and answer questions and, through membership and active participation, to vote questions and answers up or down and to edit them in a fashion similar to a wiki or Digg. As of April 2014, Stack Overflow had over 4,000,000 registered users, and it exceeded 10,000,000 questions in late August 2015. Based on the tags assigned to questions, the top eight most discussed topics on the site are: Java, JavaScript, C#, PHP, Android, jQuery, Python and HTML.
#
#

# ### Problem Statement

# Suggest the tags based on the content of the question posted on Stack Overflow.

# Source: https://www.kaggle.com/c/facebook-recruiting-iii-keyword-extraction/

# ## 1.2 Source / useful links

# Data Source : https://www.kaggle.com/c/facebook-recruiting-iii-keyword-extraction/data
# Youtube : https://youtu.be/nNDqbUhtIRg
# Research paper : https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tagging-1.pdf
# Research paper : https://dl.acm.org/citation.cfm?id=2660970&dl=ACM&coll=DL #

# ## 1.3 Real World / Business Objectives and Constraints

# 1. Predict as many tags as possible with high precision and recall.
# 2. Incorrect tags could impact customer experience on Stack Overflow.
# 3. No strict latency constraints.

# # 2. Machine Learning problem

# ## 2.1 Data

# ### 2.1.1 Data Overview

# Refer: https://www.kaggle.com/c/facebook-recruiting-iii-keyword-extraction/data #
# All of the data is in 2 files: Train and Test.
#
# Train.csv contains 4 columns: Id,Title,Body,Tags.
# Test.csv contains the same columns but without the Tags, which you are to predict.
# Size of Train.csv - 6.75GB
# Size of Test.csv - 2GB
# Number of rows in Train.csv = 6034195
#
# The questions are randomized and contain a mix of verbose text sites as well as sites related to math and programming. The number of questions from each site may vary, and no filtering has been performed on the questions (such as removing closed questions).
#
# __Data Field Explanation__
#
# The dataset contains 6,034,195 rows. The columns in the table are:
#
# Id - Unique identifier for each question
# Title - The question's title
# Body - The body of the question
# Tags - The tags associated with the question in a space-separated format (all lowercase, should not contain tabs '\t' or ampersands '&')
#
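#
# For illustration, a tiny sketch (with hypothetical rows, not the real data) of how the space-separated Tags field splits into a list of tags per question:

# In[ ]:

# Toy example: splitting the space-separated Tags field into per-question tag lists.
# The rows below are made up; the real Train.csv is read in Section 3.1.1.
import pandas as pd

toy = pd.DataFrame({'Id': [1, 2],
                    'Title': ['How to parse JSON in Python?', 'Segmentation fault with C pointers'],
                    'Body': ['<p>...</p>', '<p>...</p>'],
                    'Tags': ['python json parsing', 'c pointers segmentation-fault']})
toy['TagList'] = toy['Tags'].str.split()   # space-separated string -> list of tags
print(toy[['Id', 'TagList']])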
# #
#

# ### 2.1.2 Example Data point

#
# Title:  Implementing Boundary Value Analysis of Software Testing in a C++ program?
# Body : 

#         #include<iostream>\n
#         #include<stdlib.h>\n\n
#         using namespace std;\n\n
#         int main()\n
#         {\n
#                  int n,a[n],x,c,u[n],m[n],e[n][4];\n         
#                  cout<<"Enter the number of variables";\n         cin>>n;\n\n         
#                  cout<<"Enter the Lower, and Upper Limits of the variables";\n         
#                  for(int y=1; y<n+1; y++)\n         
#                  {\n                 
#                     cin>>m[y];\n                 
#                     cin>>u[y];\n         
#                  }\n         
#                  for(x=1; x<n+1; x++)\n         
#                  {\n                 
#                     a[x] = (m[x] + u[x])/2;\n         
#                  }\n         
#                  c=(n*4)-4;\n         
#                  for(int a1=1; a1<n+1; a1++)\n         
#                  {\n\n             
#                     e[a1][0] = m[a1];\n             
#                     e[a1][1] = m[a1]+1;\n             
#                     e[a1][2] = u[a1]-1;\n             
#                     e[a1][3] = u[a1];\n         
#                  }\n         
#                  for(int i=1; i<n+1; i++)\n         
#                  {\n            
#                     for(int l=1; l<=i; l++)\n            
#                     {\n                 
#                         if(l!=1)\n                 
#                         {\n                    
#                             cout<<a[l]<<"\\t";\n                 
#                         }\n            
#                     }\n            
#                     for(int j=0; j<4; j++)\n            
#                     {\n                
#                         cout<<e[i][j];\n                
#                         for(int k=0; k<n-(i+1); k++)\n                
#                         {\n                    
#                             cout<<a[k]<<"\\t";\n               
#                         }\n                
#                         cout<<"\\n";\n            
#                     }\n        
#                  }    \n\n        
#                  system("PAUSE");\n        
#                  return 0;    \n
#         }\n
#         
\n\n #

The answer should come in the form of a table like

\n\n #
       
#         1            50              50\n       
#         2            50              50\n       
#         99           50              50\n       
#         100          50              50\n       
#         50           1               50\n       
#         50           2               50\n       
#         50           99              50\n       
#         50           100             50\n       
#         50           50              1\n       
#         50           50              2\n       
#         50           50              99\n       
#         50           50              100\n
#         
\n\n #

if the no of inputs is 3 and their ranges are\n # 1,100\n # 1,100\n # 1,100\n # (could be varied too)

\n\n #

The output is not coming,can anyone correct the code or tell me what\'s wrong?

\n' # Tags : 'c++ c' #
#

# ## 2.2 Mapping the real-world problem to a Machine Learning Problem

# ### 2.2.1 Type of Machine Learning Problem

# It is a multi-label classification problem.
#
# Multi-label classification: multilabel classification assigns to each sample a set of target labels. This can be thought of as predicting properties of a data point that are not mutually exclusive, such as the topics that are relevant for a document. A question on Stack Overflow might be about any of C, Pointers, FileIO and/or memory-management at the same time, or about none of these.
#
# __Credit__: http://scikit-learn.org/stable/modules/multiclass.html
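#
# A tiny sketch (toy labels, not the real tag set) of what a multi-label target looks like: each question maps to a binary indicator vector over the tag vocabulary, and several entries can be 1 at once.

# In[ ]:

# Toy example of a multi-label target matrix: labels are not mutually exclusive,
# so each row may contain several 1s (or none at all).
from sklearn.preprocessing import MultiLabelBinarizer

toy_tags = [['c', 'pointers'], ['c', 'file-io', 'memory-management'], []]
mlb = MultiLabelBinarizer()
toy_y = mlb.fit_transform(toy_tags)
print(mlb.classes_)   # ['c' 'file-io' 'memory-management' 'pointers']
print(toy_y)
# [[1 0 0 1]
#  [1 1 1 0]
#  [0 0 0 0]]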

#

# ### 2.2.2 Performance metric

# __Micro-Averaged F1-Score (Mean F Score):__ the F1 score can be interpreted as a weighted average of precision and recall, where an F1 score reaches its best value at 1 and its worst at 0. The relative contributions of precision and recall to the F1 score are equal. The formula for the F1 score is:
#
# F1 = 2 * (precision * recall) / (precision + recall)
#
# In the multi-class and multi-label case, this is the weighted average of the F1 score of each class.
#
# __Micro F1 score:__ calculate metrics globally by counting the total true positives, false negatives and false positives. This is a better metric when we have class imbalance.
#
# __Macro F1 score:__ calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.
#
# https://www.kaggle.com/wiki/MeanFScore
# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
#
# __Hamming loss:__ the Hamming loss is the fraction of labels that are incorrectly predicted.
# https://www.kaggle.com/wiki/HammingLoss
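#
# A small sketch (hand-made toy predictions, not model output) of how these metrics are computed with sklearn:

# In[ ]:

# Toy example: micro/macro F1 and Hamming loss on a tiny hand-made multi-label prediction.
import numpy as np
from sklearn.metrics import f1_score, hamming_loss

toy_true = np.array([[1, 0, 1],
                     [0, 1, 0],
                     [1, 1, 0]])
toy_pred = np.array([[1, 0, 0],
                     [0, 1, 0],
                     [1, 0, 0]])

print("Micro F1     :", f1_score(toy_true, toy_pred, average='micro'))   # global TP/FP/FN counts
print("Macro F1     :", f1_score(toy_true, toy_pred, average='macro'))   # unweighted mean over labels
print("Hamming loss :", hamming_loss(toy_true, toy_pred))                # fraction of label bits predicted wrongly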
#

# # 3. Exploratory Data Analysis

# ## 3.1 Data Loading and Cleaning

# ### 3.1.1 Using Pandas with SQLite to Load the data

# In[ ]:

# Creating db file from csv
# Learn SQL: https://www.w3schools.com/sql/default.asp
if not os.path.isfile('train.db'):
    start = datetime.now()
    disk_engine = create_engine('sqlite:///train.db')
    chunksize = 180000
    j = 0
    index_start = 1
    for df in pd.read_csv('Train.csv', names=['Id', 'Title', 'Body', 'Tags'],
                          chunksize=chunksize, iterator=True, encoding='utf-8'):
        df.index += index_start
        j += 1
        print('{} rows'.format(j * chunksize))
        df.to_sql('data', disk_engine, if_exists='append')
        index_start = df.index[-1] + 1
    print("Time taken to run this cell :", datetime.now() - start)

# ### 3.1.2 Counting the number of rows

# In[ ]:

if os.path.isfile('train.db'):
    start = datetime.now()
    con = sqlite3.connect('train.db')
    num_rows = pd.read_sql_query("""SELECT count(*) FROM data""", con)
    # Always remember to close the database
    print("Number of rows in the database :", "\n", num_rows['count(*)'].values[0])
    con.close()
    print("Time taken to count the number of rows :", datetime.now() - start)
else:
    print("Please download the train.db file from drive or run the above cell to generate the train.db file")

# ### 3.1.3 Checking for duplicates

# In[ ]:

# Learn SQL: https://www.w3schools.com/sql/default.asp
if os.path.isfile('train.db'):
    start = datetime.now()
    con = sqlite3.connect('train.db')
    df_no_dup = pd.read_sql_query('SELECT Title, Body, Tags, COUNT(*) as cnt_dup FROM data GROUP BY Title, Body, Tags', con)
    con.close()
    print("Time taken to run this cell :", datetime.now() - start)
else:
    print("Please download the train.db file from drive or run the first cell to generate the train.db file")


# In[ ]:

df_no_dup.head()
# we can observe that there are duplicates


# In[ ]:

print("number of duplicate questions :", num_rows['count(*)'].values[0] - df_no_dup.shape[0],
      "(", (1 - ((df_no_dup.shape[0]) / (num_rows['count(*)'].values[0]))) * 100, "% )")


# In[ ]:

# number of times each question appeared in our database
df_no_dup.cnt_dup.value_counts()


# In[ ]:

start = datetime.now()
df_no_dup["tag_count"] = df_no_dup["Tags"].apply(lambda text: len(text.split(" ")))
# adding a new feature: number of tags per question
print("Time taken to run this cell :", datetime.now() - start)
df_no_dup.head()


# In[ ]:

# distribution of number of tags per question
df_no_dup.tag_count.value_counts()


# In[ ]:

# Creating a new database with no duplicates
if not os.path.isfile('train_no_dup.db'):
    disk_dup = create_engine("sqlite:///train_no_dup.db")
    no_dup = pd.DataFrame(df_no_dup, columns=['Title', 'Body', 'Tags'])
    no_dup.to_sql('no_dup_train', disk_dup)


# In[ ]:

# This method seems more appropriate to work with this much data.
# creating the connection with the database file.
if os.path.isfile('drive/My Drive/Stackoverflow/data/train_no_dup.db'):
    start = datetime.now()
    con = sqlite3.connect('train_no_dup.db')
    tag_data = pd.read_sql_query("""SELECT Tags FROM no_dup_train""", con)
    # Always remember to close the database
    con.close()
    # Let's now drop the unwanted first row.
    tag_data.drop(tag_data.index[0], inplace=True)
    # Printing the first 5 rows of our data frame
    tag_data.head()
    print("Time taken to run this cell :", datetime.now() - start)
else:
    print("Please download the train_no_dup.db file from drive or run the above cells to generate it")

# ## 3.2 Analysis of Tags

# ### 3.2.1 Total number of unique tags

# In[ ]: # Importing & Initializing the "CountVectorizer" object, which #is scikit-learn's bag of words tool. #by default 'split()' will tokenize each tag using space. vectorizer = CountVectorizer(tokenizer = lambda x: x.split()) # fit_transform() does two functions: First, it fits the model # and learns the vocabulary; second, it transforms our training data # into feature vectors. The input to fit_transform should be a list of strings. tag_dtm = vectorizer.fit_transform(tag_data['Tags']) # In[ ]: print("Number of data points :", tag_dtm.shape[0]) print("Number of unique tags :", tag_dtm.shape[1]) # In[ ]: #'get_feature_name()' gives us the vocabulary. tags = vectorizer.get_feature_names() #Lets look at the tags we have. print("Some of the tags we have :", tags[:10]) #

# ### 3.2.3 Number of times a tag appeared

# In[ ]: # https://stackoverflow.com/questions/15115765/how-to-access-sparse-matrix-elements #Lets now store the document term matrix in a dictionary. freqs = tag_dtm.sum(axis=0).A1 result = dict(zip(tags, freqs)) # In[ ]: #Saving this dictionary to csv files. if not os.path.isfile('tag_counts_dict_dtm.csv'): with open('tag_counts_dict_dtm.csv', 'w') as csv_file: writer = csv.writer(csv_file) for key, value in result.items(): writer.writerow([key, value]) tag_df = pd.read_csv("tag_counts_dict_dtm.csv", names=['Tags', 'Counts']) tag_df.head() # In[ ]: tag_df_sorted = tag_df.sort_values(['Counts'], ascending=False) tag_counts = tag_df_sorted['Counts'].values # In[ ]: plt.plot(tag_counts) plt.title("Distribution of number of times tag appeared questions") plt.grid() plt.xlabel("Tag number") plt.ylabel("Number of times tag appeared") plt.show() # In[ ]: plt.plot(tag_counts[0:10000]) plt.title('first 10k tags: Distribution of number of times tag appeared questions') plt.grid() plt.xlabel("Tag number") plt.ylabel("Number of times tag appeared") plt.show() print(len(tag_counts[0:10000:25]), tag_counts[0:10000:25]) # In[ ]: plt.plot(tag_counts[0:1000]) plt.title('first 1k tags: Distribution of number of times tag appeared questions') plt.grid() plt.xlabel("Tag number") plt.ylabel("Number of times tag appeared") plt.show() print(len(tag_counts[0:1000:5]), tag_counts[0:1000:5]) # In[ ]: plt.plot(tag_counts[0:500]) plt.title('first 500 tags: Distribution of number of times tag appeared questions') plt.grid() plt.xlabel("Tag number") plt.ylabel("Number of times tag appeared") plt.show() print(len(tag_counts[0:500:5]), tag_counts[0:500:5]) # In[ ]: plt.plot(tag_counts[0:100], c='b') plt.scatter(x=list(range(0,100,5)), y=tag_counts[0:100:5], c='orange', label="quantiles with 0.05 intervals") # quantiles with 0.25 difference plt.scatter(x=list(range(0,100,25)), y=tag_counts[0:100:25], c='m', label = "quantiles with 0.25 intervals") for x,y in zip(list(range(0,100,25)), tag_counts[0:100:25]): plt.annotate(s="({} , {})".format(x,y), xy=(x,y), xytext=(x-0.05, y+500)) plt.title('first 100 tags: Distribution of number of times tag appeared questions') plt.grid() plt.xlabel("Tag number") plt.ylabel("Number of times tag appeared") plt.legend() plt.show() print(len(tag_counts[0:100:5]), tag_counts[0:100:5]) # In[ ]: # Store tags greater than 10K in one list lst_tags_gt_10k = tag_df[tag_df.Counts>10000].Tags #Print the length of the list print ('{} Tags are used more than 10000 times'.format(len(lst_tags_gt_10k))) # Store tags greater than 100K in one list lst_tags_gt_100k = tag_df[tag_df.Counts>100000].Tags #Print the length of the list. print ('{} Tags are used more than 100000 times'.format(len(lst_tags_gt_100k))) # Observations:
# 1. There are 153 tags in total which are used more than 10000 times.
# 2. 14 tags are used more than 100000 times.
# 3. The most frequent tag (i.e. c#) is used 331505 times.
# 4. Since some tags occur much more frequently than others, the micro-averaged F1-score is the appropriate metric for this problem.

# ### 3.2.4 Tags Per Question

# In[ ]: #Storing the count of tag in each question in list 'tag_count' tag_quest_count = tag_dtm.sum(axis=1).tolist() #Converting list of lists into single list, we will get [[3], [4], [2], [2], [3]] and we are converting this to [3, 4, 2, 2, 3] tag_quest_count=[int(j) for i in tag_quest_count for j in i] print ('We have total {} datapoints.'.format(len(tag_quest_count))) print(tag_quest_count[:5]) # In[ ]: print( "Maximum number of tags per question: %d"%max(tag_quest_count)) print( "Minimum number of tags per question: %d"%min(tag_quest_count)) print( "Avg. number of tags per question: %f"% ((sum(tag_quest_count)*1.0)/len(tag_quest_count))) # In[ ]: sns.countplot(tag_quest_count, palette='gist_rainbow') plt.title("Number of tags in the questions ") plt.xlabel("Number of Tags") plt.ylabel("Number of questions") plt.show() # Observations:
# 1. Maximum number of tags per question: 5
# 2. Minimum number of tags per question: 1
# 3. Avg. number of tags per question: 2.899
# 4. Most questions have 2 or 3 tags.

# ### 3.2.5 Most Frequent Tags

# In[ ]: # Ploting word cloud start = datetime.now() # Lets first convert the 'result' dictionary to 'list of tuples' tup = dict(result.items()) #Initializing WordCloud using frequencies of tags. wordcloud = WordCloud( background_color='black', width=1600, height=800, ).generate_from_frequencies(tup) fig = plt.figure(figsize=(30,20)) plt.imshow(wordcloud) plt.axis('off') plt.tight_layout(pad=0) fig.savefig("tag.png") plt.show() print("Time taken to run this cell :", datetime.now() - start) # Observations:
# A look at the word cloud shows that "c#", "java", "php", "asp.net", "javascript", "c++" are some of the most frequent tags. #

# ### 3.2.6 The top 20 tags

# In[ ]: i=np.arange(30) tag_df_sorted.head(30).plot(kind='bar') plt.title('Frequency of top 20 tags') plt.xticks(i, tag_df_sorted['Tags']) plt.xlabel('Tags') plt.ylabel('Counts') plt.show() # Observations:
# 1. The majority of the most frequent tags are programming languages.
# 2. C# is the most frequent programming-language tag.
# 3. Android, iOS, Linux and Windows are among the most frequent operating-system tags.

# ## 3.3 Cleaning and preprocessing of Questions

# ### 3.3.1 Preprocessing

# 1. Sample 1M data points.
# 2. Separate out code snippets from the Body.
# 3. Remove special characters from the question title and description (not from the code).
# 4. Remove stop words (except 'C').
# 5. Remove HTML tags.
# 6. Convert all characters to lowercase.
# 7. Use SnowballStemmer to stem the words.
#
# A minimal sketch of the text-cleaning steps on a single toy question follows this list; the full pipeline over the sampled database is implemented in the cells after it.
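# In[ ]:

# Toy example of the cleaning steps on a single made-up question:
# separate <code> blocks, strip HTML, keep letters only, lowercase,
# drop stopwords/single letters (keeping 'c'), and stem.
import re
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

toy_title = "Reading files in C"
toy_body = '<p>How do I read a file in C?</p><code>FILE *fp = fopen("a.txt", "r");</code>'

toy_code = re.findall(r'<code>(.*?)</code>', toy_body, flags=re.DOTALL)     # code kept separately
toy_text = re.sub(r'<code>(.*?)</code>', '', toy_body, flags=re.DOTALL)     # code removed from body
toy_text = re.sub(r'<.*?>', ' ', toy_title + " " + toy_text)                # HTML tags stripped
toy_text = re.sub(r'[^A-Za-z]+', ' ', toy_text).lower()                     # letters only, lowercased

toy_stop_words = set(stopwords.words('english'))
toy_stemmer = SnowballStemmer("english")
toy_cleaned = ' '.join(toy_stemmer.stem(w) for w in word_tokenize(toy_text)
                       if w not in toy_stop_words and (len(w) != 1 or w == 'c'))
print(toy_cleaned)   # -> "read file c read file c"
print(toy_code)      # -> ['FILE *fp = fopen("a.txt", "r");']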
# In[ ]:

import nltk
nltk.download('stopwords')
nltk.download('punkt')   # needed by word_tokenize below

def striphtml(data):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(data))
    return cleantext

stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")


# In[ ]:

# http://www.sqlitetutorial.net/sqlite-python/create-tables/
from sqlite3 import Error   # needed by the except clauses below

def create_connection(db_file):
    """Create a database connection to the SQLite database specified by db_file.
    :param db_file: database file
    :return: Connection object or None
    """
    try:
        conn = sqlite3.connect(db_file)
        print(type(conn))
        return conn
    except Error as e:
        print(e)
    return None


def create_table(conn, create_table_sql):
    """Create a table from the create_table_sql statement.
    :param conn: Connection object
    :param create_table_sql: a CREATE TABLE statement
    """
    try:
        c = conn.cursor()
        c.execute(create_table_sql)
    except Error as e:
        print(e)


def checkTableExists(dbcon):
    cursr = dbcon.cursor()
    str = "select name from sqlite_master where type='table'"
    table_names = cursr.execute(str)
    print("Tables in the database:")
    tables = table_names.fetchall()
    print(tables[0][0])
    return(len(tables))


def create_database_table(database, query):
    conn = create_connection(database)
    if conn is not None:
        create_table(conn, query)
        checkTableExists(conn)
    else:
        print("Error! cannot create the database connection.")
    conn.close()

# sql_create_table = """CREATE TABLE IF NOT EXISTS QuestionsProcessed (question text NOT NULL, code text, tags text, words_pre integer, words_post integer, is_code integer);"""
# create_database_table("Processed.db", sql_create_table)


# In[ ]:

# http://www.sqlitetutorial.net/sqlite-delete/
# https://stackoverflow.com/questions/2279706/select-random-row-from-a-sqlite-table
start = datetime.now()
read_db = 'train_no_dup.db'
write_db = 'Processed.db'
if os.path.isfile(read_db):
    conn_r = create_connection(read_db)
    if conn_r is not None:
        reader = conn_r.cursor()
        reader.execute("SELECT Title, Body, Tags From no_dup_train ORDER BY RANDOM() LIMIT 1000000;")

if os.path.isfile(write_db):
    conn_w = create_connection(write_db)
    if conn_w is not None:
        tables = checkTableExists(conn_w)
        writer = conn_w.cursor()
        if tables != 0:
            writer.execute("DELETE FROM QuestionsProcessed WHERE 1")
            print("Cleared All the rows")

print("Time taken to run this cell :", datetime.now() - start)


# __We create a new database to store the sampled and preprocessed questions.__

# In[ ]:

# http://www.bernzilla.com/2008/05/13/selecting-a-random-row-from-an-sqlite-table/
start = datetime.now()
preprocessed_data_list = []
reader.fetchone()
questions_with_code = 0
len_pre = 0
len_post = 0
questions_proccesed = 0
for row in reader:
    is_code = 0
    title, question, tags = row[0], row[1], row[2]

    if '<code>' in question:
        questions_with_code += 1
        is_code = 1
    x = len(question) + len(title)
    len_pre += x

    # separate out the code snippets, then remove them from the question body
    code = str(re.findall(r'<code>(.*?)</code>', question, flags=re.DOTALL))
    question = re.sub('<code>(.*?)</code>', '', question, flags=re.MULTILINE | re.DOTALL)

    question = striphtml(question.encode('utf-8'))
    title = title.encode('utf-8')
    question = str(title) + " " + str(question)
    question = re.sub(r'[^A-Za-z]+', ' ', question)
    words = word_tokenize(str(question.lower()))

    # Removing all single-letter words and stopwords from the question, except for the letter 'c'
    question = ' '.join(str(stemmer.stem(j)) for j in words if j not in stop_words and (len(j) != 1 or j == 'c'))

    len_post += len(question)
    tup = (question, code, tags, x, len(question), is_code)
    questions_proccesed += 1
    writer.execute("insert into QuestionsProcessed(question,code,tags,words_pre,words_post,is_code) values (?,?,?,?,?,?)", tup)
    if (questions_proccesed % 100000 == 0):
        print("number of questions completed=", questions_proccesed)

no_dup_avg_len_pre = (len_pre * 1.0) / questions_proccesed
no_dup_avg_len_post = (len_post * 1.0) / questions_proccesed

print("Avg. length of questions(Title+Body) before processing: %d" % no_dup_avg_len_pre)
print("Avg. length of questions(Title+Body) after processing: %d" % no_dup_avg_len_post)
print("Percent of questions containing code: %d" % ((questions_with_code * 100.0) / questions_proccesed))

print("Time taken to run this cell :", datetime.now() - start)


# In[ ]:

# don't forget to close the connections, or else you will end up with locks
conn_r.commit()
conn_w.commit()
conn_r.close()
conn_w.close()


# In[ ]:

if os.path.isfile(write_db):
    conn_r = create_connection(write_db)
    if conn_r is not None:
        reader = conn_r.cursor()
        reader.execute("SELECT question From QuestionsProcessed LIMIT 10")
        print("Questions after preprocessing")
        print('=' * 100)
        reader.fetchone()
        for row in reader:
            print(row)
            print('-' * 100)
        conn_r.commit()
        conn_r.close()


# In[ ]:

# Taking 0.5 Million entries to a dataframe.
# check here
write_db = 'drive/My Drive/Stackoverflow/data/Processed.db'
if os.path.isfile(write_db):
    conn_r = create_connection(write_db)
    if conn_r is not None:
        preprocessed_data = pd.read_sql_query("""SELECT question, Tags FROM QuestionsProcessed LIMIT 500000""", conn_r)
        conn_r.commit()
        conn_r.close()


# In[ ]:

preprocessed_data.head()


# In[ ]:

print("number of data points in sample :", preprocessed_data.shape[0])
print("number of dimensions :", preprocessed_data.shape[1])

# # 4. Machine Learning Models

# ## 4.1 Converting tags for multilabel problems

# | X  | y1 | y2 | y3 | y4 |
# |----|----|----|----|----|
# | x1 | 0  | 1  | 1  | 0  |
# | x1 | 1  | 0  | 0  | 0  |
# | x1 | 0  | 1  | 0  | 0  |
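#
# A toy sketch (hypothetical tag strings, not the real data) of the conversion illustrated above: CountVectorizer with binary='true' and a whitespace tokenizer turns each space-separated tag string into a binary row over the tag vocabulary.

# In[ ]:

# Toy example in the spirit of the table above, with made-up tag strings.
from sklearn.feature_extraction.text import CountVectorizer

toy_tag_strings = ['y2 y3', 'y1 y4', 'y2']
toy_vectorizer = CountVectorizer(tokenizer=lambda x: x.split(), binary='true')
toy_multilabel_y = toy_vectorizer.fit_transform(toy_tag_strings)
print(sorted(toy_vectorizer.vocabulary_.items()))   # [('y1', 0), ('y2', 1), ('y3', 2), ('y4', 3)]
print(toy_multilabel_y.toarray())
# [[0 1 1 0]
#  [1 0 0 1]
#  [0 1 0 0]]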
# In[ ]: # binary='true' will give a binary vectorizer vectorizer = CountVectorizer(tokenizer = lambda x: x.split(), binary='true') multilabel_y = vectorizer.fit_transform(preprocessed_data['tags']) # __ We will sample the number of tags instead considering all of them (due to limitation of computing power) __ # In[ ]: def tags_to_choose(n): t = multilabel_y.sum(axis=0).tolist()[0] sorted_tags_i = sorted(range(len(t)), key=lambda i: t[i], reverse=True) multilabel_yn=multilabel_y[:,sorted_tags_i[:n]] return multilabel_yn def questions_explained_fn(n): multilabel_yn = tags_to_choose(n) x= multilabel_yn.sum(axis=1) return (np.count_nonzero(x==0)) # In[ ]: questions_explained = [] total_tags=multilabel_y.shape[1] total_qs=preprocessed_data.shape[0] for i in range(500, total_tags, 100): questions_explained.append(np.round(((total_qs-questions_explained_fn(i))/total_qs)*100,3)) # In[ ]: fig, ax = plt.subplots() ax.plot(questions_explained) xlabel = list(500+np.array(range(-50,450,50))*50) ax.set_xticklabels(xlabel) plt.xlabel("Number of tags") plt.ylabel("Number Questions coverd partially") plt.grid() plt.show() # you can choose any number of tags based on your computing power, minimun is 50(it covers 90% of the tags) print("with ",5500,"tags we are covering ",questions_explained[50],"% of questions") # In[ ]: multilabel_yx = tags_to_choose(500) print("number of questions that are not covered :", questions_explained_fn(500),"out of 500000" ) # In[ ]: print("Number of tags in sample :", multilabel_y.shape[1]) print("number of tags taken :", multilabel_yx.shape[1],"(",(multilabel_yx.shape[1]/multilabel_y.shape[1])*100,"%)") # __ We consider top 15% tags which covers 99% of the questions __ #

# ## 4.2 Split the data into test and train (80:20)

# In[ ]: total_size=preprocessed_data.shape[0] train_size=int(0.80*total_size) x_train=preprocessed_data.head(train_size) x_test=preprocessed_data.tail(total_size - train_size) y_train = multilabel_yx[0:train_size,:] y_test = multilabel_yx[train_size:total_size,:] # In[ ]: print("Number of data points in train data :", x_train.shape) print("Number of data points in test data :", x_test.shape) # In[ ]: print("Number of data points in train data :", y_train.shape) print("Number of data points in test data :", y_test.shape) #

# ## 4.3 Featurizing data

# In[ ]: start = datetime.now() vectorizer = TfidfVectorizer(min_df=0.00009, max_features=200000, smooth_idf=True, norm="l2", \ tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(1,3)) x_train_multilabel = vectorizer.fit_transform(x_train['question']) x_test_multilabel = vectorizer.transform(x_test['question']) print("Time taken to run this cell :", datetime.now() - start) # In[ ]: print("Dimensions of train data X:",x_train_multilabel.shape, "Y :",y_train.shape) print("Dimensions of test data X:",x_test_multilabel.shape,"Y:",y_test.shape) # In[ ]: # https://www.analyticsvidhya.com/blog/2017/08/introduction-to-multi-label-classification/ #https://stats.stackexchange.com/questions/117796/scikit-multi-label-classification # classifier = LabelPowerset(GaussianNB()) """ from skmultilearn.adapt import MLkNN classifier = MLkNN(k=21) # train classifier.fit(x_train_multilabel, y_train) # predict predictions = classifier.predict(x_test_multilabel) print(accuracy_score(y_test,predictions)) print(metrics.f1_score(y_test, predictions, average = 'macro')) print(metrics.f1_score(y_test, predictions, average = 'micro')) print(metrics.hamming_loss(y_test,predictions)) """ # we are getting memory error because the multilearn package # is trying to convert the data into dense matrix # --------------------------------------------------------------------------- #MemoryError Traceback (most recent call last) # in () #----> classifier.fit(x_train_multilabel, y_train) #

# ## 4.4 Applying Logistic Regression with OneVsRest Classifier

# In[ ]: # this will be taking so much time try not to run it, download the lr_with_equal_weight.pkl file and use to predict # This takes about 6-7 hours to run. classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l1'), n_jobs=-1) classifier.fit(x_train_multilabel, y_train) predictions = classifier.predict(x_test_multilabel) print("accuracy :",metrics.accuracy_score(y_test,predictions)) print("macro f1 score :",metrics.f1_score(y_test, predictions, average = 'macro')) print("micro f1 scoore :",metrics.f1_score(y_test, predictions, average = 'micro')) print("hamming loss :",metrics.hamming_loss(y_test,predictions)) print("Precision recall report :\n",metrics.classification_report(y_test, predictions)) # In[ ]: from sklearn.externals import joblib joblib.dump(classifier, 'lr_with_equal_weight.pkl') #

# ## 4.5 Modeling with fewer data points (0.5M), more weight to the title, and only 500 tags

# In[ ]: sql_create_table = """CREATE TABLE IF NOT EXISTS QuestionsProcessed (question text NOT NULL, code text, tags text, words_pre integer, words_post integer, is_code integer);""" create_database_table("drive/My Drive/Stackoverflow/MyTitlemoreweight.db", sql_create_table) # In[ ]: title_more = create_engine("sqlite:///drive/My Drive/Stackoverflow/MyTitlemoreweight.db") # no_dup = pd.DataFrame(df_no_dup, columns=['Title', 'Body', 'Tags']) # no_dup.to_sql('no_dup_train',title_more) # In[ ]: # http://www.sqlitetutorial.net/sqlite-delete/ # https://stackoverflow.com/questions/2279706/select-random-row-from-a-sqlite-table #check here read_db = 'drive/My Drive/Stackoverflow/data/train_no_dup.db' write_db = 'drive/My Drive/Stackoverflow/MyTitlemoreweight.db' train_datasize = 400000 if os.path.isfile(read_db): conn_r = create_connection(read_db) if conn_r is not None: reader =conn_r.cursor() # for selecting first 0.5M rows reader.execute("SELECT Title, Body, Tags From no_dup_train LIMIT 500001;") # for selecting random points #reader.execute("SELECT Title, Body, Tags From no_dup_train ORDER BY RANDOM() LIMIT 500001;") if os.path.isfile(write_db): conn_w = create_connection(write_db) if conn_w is not None: tables = checkTableExists(conn_w) writer =conn_w.cursor() if tables != 0: writer.execute("DELETE FROM QuestionsProcessed WHERE 1") print("Cleared All the rows") #

# ### 4.5.1 Preprocessing of questions

#
# 1. Separate code from the Body.
# 2. Remove special characters from the question title and description (not from the code).
# 3. Give more weight to the title: add the title three times to the question.
# 4. Remove stop words (except 'C').
# 5. Remove HTML tags.
# 6. Convert all characters to lowercase.
# 7. Use SnowballStemmer to stem the words.
# In[ ]: #http://www.bernzilla.com/2008/05/13/selecting-a-random-row-from-an-sqlite-table/ import nltk nltk.download('punkt') start = datetime.now() preprocessed_data_list=[] reader.fetchone() questions_with_code=0 len_pre=0 len_post=0 questions_proccesed = 0 for row in reader: is_code = 0 title, question, tags = row[0], row[1], str(row[2]) if '' in question: questions_with_code+=1 is_code = 1 x = len(question)+len(title) len_pre+=x code = str(re.findall(r'(.*?)', question, flags=re.DOTALL)) question=re.sub('(.*?)', '', question, flags=re.MULTILINE|re.DOTALL) question=striphtml(question.encode('utf-8')) title=title.encode('utf-8') # adding title three time to the data to increase its weight # add tags string to the training data question=str(title)+" "+str(title)+" "+str(title)+" "+question # if questions_proccesed<=train_datasize: # question=str(title)+" "+str(title)+" "+str(title)+" "+question+" "+str(tags) # else: # question=str(title)+" "+str(title)+" "+str(title)+" "+question question=re.sub(r'[^A-Za-z0-9#+.\-]+',' ',question) words=word_tokenize(str(question.lower())) #Removing all single letter and and stopwords from question exceptt for the letter 'c' question=' '.join(str(stemmer.stem(j)) for j in words if j not in stop_words and (len(j)!=1 or j=='c')) len_post+=len(question) tup = (question,code,tags,x,len(question),is_code) questions_proccesed += 1 writer.execute("insert into QuestionsProcessed(question,code,tags,words_pre,words_post,is_code) values (?,?,?,?,?,?)",tup) if (questions_proccesed%100000==0): print("number of questions completed=",questions_proccesed) no_dup_avg_len_pre=(len_pre*1.0)/questions_proccesed no_dup_avg_len_post=(len_post*1.0)/questions_proccesed print( "Avg. length of questions(Title+Body) before processing: %d"%no_dup_avg_len_pre) print( "Avg. 
length of questions(Title+Body) after processing: %d"%no_dup_avg_len_post) print ("Percent of questions containing code: %d"%((questions_with_code*100.0)/questions_proccesed)) print("Time taken to run this cell :", datetime.now() - start) # In[ ]: # never forget to close the conections or else we will end up with database locks conn_r.commit() conn_w.commit() conn_r.close() conn_w.close() # __ Sample quesitons after preprocessing of data __ # In[ ]: if os.path.isfile(write_db): conn_r = create_connection(write_db) if conn_r is not None: reader =conn_r.cursor() reader.execute("SELECT question From QuestionsProcessed LIMIT 50") print("Questions after preprocessed") print('='*100) reader.fetchone() for row in reader: print(row) print('-'*100) conn_r.commit() conn_r.close() # __ Saving Preprocessed data to a Database __ # In[ ]: i=0 conn_r = create_connection(read_db) if conn_r is not None: reader =conn_r.cursor() # for selecting first 0.5M rows reader.execute("SELECT Title, Body, Tags From no_dup_train LIMIT 2;") for row in reader: is_code = 0 title, question, tags = row[0], row[1], str(row[2]) if '' in question: questions_with_code+=1 is_code = 1 x = len(question)+len(title) len_pre+=x code = str(re.findall(r'(.*?)', question, flags=re.DOTALL)) question=re.sub('(.*?)', '', question, flags=re.MULTILINE|re.DOTALL) question=striphtml(question.encode('utf-8')) title=title.encode('utf-8') # adding title three time to the data to increase its weight # add tags string to the training data print("title ",title," title") question=str(title)+" "+str(title)+" "+str(title)+" "+question print(question) # if questions_proccesed<=train_datasize: # question=str(title)+" "+str(title)+" "+str(title)+" "+question+" "+str(tags) # else: # question=str(title)+" "+str(title)+" "+str(title)+" "+question question=re.sub(r'[^A-Za-z0-9#+.\-]+',' ',question) words=word_tokenize(str(question.lower())) #Removing all single letter and and stopwords from question exceptt for the letter 'c' question=' '.join(str(stemmer.stem(j)) for j in words if j not in stop_words and (len(j)!=1 or j=='c')) len_post+=len(question) tup = (question,code,tags,x,len(question),is_code) questions_proccesed += 1 print(" Final Question: ",question) conn_r.commit() conn_r.close() # In[ ]: preprocessed_old.head() # In[ ]: start = datetime.now() #check here #Taking 0.5 Million entries to a dataframe. 
write_db = 'drive/My Drive/Stackoverflow/MyTitlemoreweight.db' if os.path.isfile(write_db): conn_r = create_connection(write_db) print(type(conn_r)) if conn_r is not None: print(type(conn_r),"inside") preprocessed_data = pd.read_sql_query("""SELECT question, Tags FROM QuestionsProcessed""", conn_r) print("done") conn_r.commit() conn_r.close() print("Time taken to run this cell :", datetime.now() - start) # In[ ]: checkTableExists(conn_r) def checkTableExists(dbcon): cursr = dbcon.cursor() str = "select name from sqlite_master where type='table'" table_names = cursr.execute(str) print("Tables in the databse:") tables =table_names.fetchall() print(tables[0][0]) return(len(tables)) # In[ ]: cou = "select name from sqlite_master where type='table'" table_names = cursr.execute(str) print("Tables in the databse:") tables =table_names.fetchall() # In[ ]: preprocessed_data.head() # In[ ]: print("number of data points in sample :", preprocessed_data.shape[0]) print("number of dimensions :", preprocessed_data.shape[1]) # __ Converting string Tags to multilable output variables __ # In[ ]: vectorizer = CountVectorizer(tokenizer = lambda x: x.split(), binary='true') multilabel_y = vectorizer.fit_transform(preprocessed_data['tags']) # In[ ]: # __ Selecting 500 Tags __ # In[ ]: questions_explained = [] total_tags=multilabel_y.shape[1] total_qs=preprocessed_data.shape[0] for i in range(500, total_tags, 100): questions_explained.append(np.round(((total_qs-questions_explained_fn(i))/total_qs)*100,3)) # In[ ]: fig, ax = plt.subplots() ax.plot(questions_explained) xlabel = list(500+np.array(range(-50,450,50))*50) ax.set_xticklabels(xlabel) plt.xlabel("Number of tags") plt.ylabel("Number Questions coverd partially") plt.grid() plt.show() # you can choose any number of tags based on your computing power, minimun is 500(it covers 90% of the tags) print("with ",5500,"tags we are covering ",questions_explained[50],"% of questions") print("with ",500,"tags we are covering ",questions_explained[0],"% of questions") # In[ ]: # we will be taking 500 tags multilabel_yx = tags_to_choose(500) print("number of questions that are not covered :", questions_explained_fn(500),"out of 500000") # In[ ]: sub_size = 200000 preprocessed_data_sub = preprocessed_data.head(sub_size) total_size=preprocessed_data_sub.shape[0] train_size=int(0.80*total_size) x_train=preprocessed_data_sub.head(train_size) x_test=preprocessed_data_sub.tail(total_size - train_size) y_train = multilabel_yx[0:train_size,:] y_test = multilabel_yx[train_size:total_size,:] # In[ ]: print("Number of data points in train data :", y_train.shape) print("Number of data points in test data :", y_test.shape) #

# ### 4.5.2 Featurizing data with TfIdf vectorizer

# In[ ]: start = datetime.now() vectorizer = TfidfVectorizer(min_df=0.00009, max_features=200000, smooth_idf=True, norm="l2", \ tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(1,3)) x_train_multilabel = vectorizer.fit_transform(x_train['question']) x_test_multilabel = vectorizer.transform(x_test['question']) print("Time taken to run this cell :", datetime.now() - start) # In[ ]: train_datasize = 400000 x_train=preprocessed_data.head(train_datasize) x_test=preprocessed_data.tail(preprocessed_data.shape[0] - 400000) y_train = multilabel_yx[0:train_datasize,:] y_test = multilabel_yx[train_datasize:preprocessed_data.shape[0],:] # In[ ]: print("Number of data points in train data :", y_train.shape) print("Number of data points in test data :", y_test.shape) #

# ### 4.5.3 Featurizing data with BoW vectorizer (CountVectorizer, up to 4-grams)

# In[ ]: start = datetime.now() vectorizer = CountVectorizer(min_df=0.00009, max_features=200000, \ tokenizer = lambda x: x.split(), ngram_range=(1,4)) x_train_multilabel = vectorizer.fit_transform(x_train['question']) x_test_multilabel = vectorizer.transform(x_test['question']) print("Time taken to run this cell :", datetime.now() - start) # In[ ]: joblib.dump(x_train_multilabel, 'lr_with_more_title_weight.pkl') # In[ ]: print("Dimensions of train data X:",x_train_multilabel.shape, "Y :",y_train.shape) print("Dimensions of test data X:",x_test_multilabel.shape,"Y:",y_test.shape) #

# ### 4.5.4 Applying Logistic Regression with OneVsRest Classifier

# In[ ]: start = datetime.now() classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l1')) classifier.fit(x_train_multilabel, y_train) # clf.fit(x_train_multilabel.copy(),y_train) predictions = classifier.predict (x_test_multilabel) print("Accuracy :",metrics.accuracy_score(y_test, predictions)) print("Hamming loss ",metrics.hamming_loss(y_test,predictions)) precision = precision_score(y_test, predictions, average='micro') recall = recall_score(y_test, predictions, average='micro') f1 = f1_score(y_test, predictions, average='micro') print("Micro-average quality numbers") print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1)) precision = precision_score(y_test, predictions, average='macro') recall = recall_score(y_test, predictions, average='macro') f1 = f1_score(y_test, predictions, average='macro') print("Macro-average quality numbers") print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1)) print (metrics.classification_report(y_test, predictions)) print("Time taken to run this cell :", datetime.now() - start) # In[ ]: start = datetime.now() classifier_2 = OneVsRestClassifier(LogisticRegression(penalty='l1'), n_jobs=-1) x_train_multilabel.sort_indices() classifier_2.fit(x_train_multilabel, y_train) predictions_2 = classifier_2.predict(x_test_multilabel) print("Accuracy :",metrics.accuracy_score(y_test, predictions_2)) print("Hamming loss ",metrics.hamming_loss(y_test,predictions_2)) precision = precision_score(y_test, predictions_2, average='micro') recall = recall_score(y_test, predictions_2, average='micro') f1 = f1_score(y_test, predictions_2, average='micro') print("Micro-average quality numbers") print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1)) precision = precision_score(y_test, predictions_2, average='macro') recall = recall_score(y_test, predictions_2, average='macro') f1 = f1_score(y_test, predictions_2, average='macro') print("Macro-average quality numbers") print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1)) print (metrics.classification_report(y_test, predictions_2)) print("Time taken to run this cell :", datetime.now() - start) # In[ ]: from sklearn.externals import joblib joblib.dump(classifier_2, 'drive/My Drive/Stackoverflow/Mylrnew_with_more_title_weight.pkl') # In[ ]: start = datetime.now() classifier_2 = OneVsRestClassifier(LogisticRegression(penalty='l1'), n_jobs=-1) classifier_2.fit(x_train_multilabel, y_train) predictions_2 = classifier_2.predict(x_test_multilabel) print("Accuracy :",metrics.accuracy_score(y_test, predictions_2)) print("Hamming loss ",metrics.hamming_loss(y_test,predictions_2)) precision = precision_score(y_test, predictions_2, average='micro') recall = recall_score(y_test, predictions_2, average='micro') f1 = f1_score(y_test, predictions_2, average='micro') print("Micro-average quality numbers") print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1)) precision = precision_score(y_test, predictions_2, average='macro') recall = recall_score(y_test, predictions_2, average='macro') f1 = f1_score(y_test, predictions_2, average='macro') print("Macro-average quality numbers") print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1)) print (metrics.classification_report(y_test, predictions_2)) print("Time taken to run this cell :", datetime.now() - start) #

# # 5. Assignments

# 1. Use bag of words up to 4-grams and compute the micro F1 score with Logistic Regression (OvR).
# 2. Perform hyperparameter tuning on alpha (or lambda) for Logistic Regression using GridSearch to improve the performance.
# 3. Try OneVsRestClassifier with Linear-SVM (SGDClassifier with loss='hinge').
# In[ ]: ## Output # 1st - bag of words upto 4 grams SGDClassifier(loss='log') 0.5M data points # 2nd - bag of words upto 4 grams Logistic Regression 0.5M data points # 3rd - tfidf(1 to 3 grams) 200k data points Logistic Regression grid search hyperparameter: alpha # 4th - tfidf(1 to 3 grams) 200k data points (SGDClassifier with loss-hinge) grid search hyperparameter: alpha # In[ ]: start = datetime.now() classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l1')) classifier.fit(x_train_multilabel, y_train) # clf.fit(x_train_multilabel.copy(),y_train) predictions = classifier.predict (x_test_multilabel) print("Accuracy :",metrics.accuracy_score(y_test, predictions)) print("Hamming loss ",metrics.hamming_loss(y_test,predictions)) precision = precision_score(y_test, predictions, average='micro') recall = recall_score(y_test, predictions, average='micro') f1 = f1_score(y_test, predictions, average='micro') print("Micro-average quality numbers") print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1)) precision = precision_score(y_test, predictions, average='macro') recall = recall_score(y_test, predictions, average='macro') f1 = f1_score(y_test, predictions, average='macro') print("Macro-average quality numbers") print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1)) print (metrics.classification_report(y_test, predictions)) print("Time taken to run this cell :", datetime.now() - start) # In[ ]: from sklearn.externals import joblib joblib.dump(classifier, 'drive/My Drive/Stackoverflow/MySgd_with_more_title_weight.pkl') # ### LR with 0.5M data points BOW # In[ ]: start = datetime.now() classifier_2 = OneVsRestClassifier(LogisticRegression(penalty='l1'), n_jobs=-1) x_train_multilabel.sort_indices() classifier_2.fit(x_train_multilabel, y_train) predictions_2 = classifier_2.predict(x_test_multilabel) print("Accuracy :",metrics.accuracy_score(y_test, predictions_2)) print("Hamming loss ",metrics.hamming_loss(y_test,predictions_2)) precision = precision_score(y_test, predictions_2, average='micro') recall = recall_score(y_test, predictions_2, average='micro') f1 = f1_score(y_test, predictions_2, average='micro') print("Micro-average quality numbers") print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1)) precision = precision_score(y_test, predictions_2, average='macro') recall = recall_score(y_test, predictions_2, average='macro') f1 = f1_score(y_test, predictions_2, average='macro') print("Macro-average quality numbers") print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1)) print (metrics.classification_report(y_test, predictions_2)) print("Time taken to run this cell :", datetime.now() - start) # In[ ]: from sklearn.externals import joblib joblib.dump(classifier_2, 'drive/My Drive/Stackoverflow/Mylrnew_with_more_title_weight.pkl') # ### Gridsearchcv on 200k data points Tfidf Vectors # In[ ]: from sklearn.model_selection import GridSearchCV start = datetime.now() classifier_2 = OneVsRestClassifier(SGDClassifier(loss='log',penalty='l1'), n_jobs=-1) param={'estimator__alpha': [10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**1]} grid_search_model = GridSearchCV(estimator = classifier_2, param_grid=param, cv=3, verbose=0,return_train_score=True,scoring='f1_micro',n_jobs=-1) x_train_multilabel.sort_indices() grid_search_model.fit(x_train_multilabel, y_train) # In[ ]: print("Time taken to run this cell :", datetime.now() - 
start) # In[ ]: alpha = [10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**1] # In[ ]: log_alpha = np.log([10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**1]) # In[ ]: train_score = grid_search_model.cv_results_['mean_train_score'] # In[ ]: test_score = grid_search_model.cv_results_['mean_test_score'] # In[ ]: plt.scatter(log_alpha,train_score,label="Train Score") plt.plot(log_alpha,train_score) plt.scatter(log_alpha,test_score,label="Test Score") plt.plot(log_alpha,test_score) plt.xlabel("Alpha : Hyperparameter") plt.ylabel("Micro F1 Score") plt.xticks(log_alpha,alpha) plt.legend() # In[ ]: best_alpha = grid_search_model.best_estimator_.get_params()['estimator__alpha'] print('Best alpha: ',best_alpha) # In[ ]: from sklearn.externals import joblib joblib.dump(classifier_2, 'drive/My Drive/Stackoverflow/gridlog_with_more_title_weight.pkl') # In[ ]: start = datetime.now() classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l1')) classifier.fit(x_train_multilabel, y_train) # clf.fit(x_train_multilabel.copy(),y_train) predictions = classifier.predict (x_test_multilabel) print("Accuracy :",metrics.accuracy_score(y_test, predictions)) print("Hamming loss ",metrics.hamming_loss(y_test,predictions)) precision = precision_score(y_test, predictions, average='micro') recall = recall_score(y_test, predictions, average='micro') f1 = f1_score(y_test, predictions, average='micro') print("Micro-average quality numbers") print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1)) precision = precision_score(y_test, predictions, average='macro') recall = recall_score(y_test, predictions, average='macro') f1 = f1_score(y_test, predictions, average='macro') print("Macro-average quality numbers") print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1)) print (metrics.classification_report(y_test, predictions)) print("Time taken to run this cell :", datetime.now() - start) # In[ ]: from sklearn.externals import joblib joblib.dump(classifier, 'drive/My Drive/Stackoverflow/bestlog200k_with_more_title_weight.pkl') # ### SGD with hinge loss # # # In[ ]: from sklearn.model_selection import GridSearchCV start = datetime.now() classifier_2 = OneVsRestClassifier(SGDClassifier(loss='hinge',penalty='l1'), n_jobs=-1) param={'estimator__alpha': [10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**1]} grid_search_model = GridSearchCV(estimator = classifier_2, param_grid=param, cv=3, verbose=0,return_train_score=True,scoring='f1_micro',n_jobs=-1) x_train_multilabel.sort_indices() grid_search_model.fit(x_train_multilabel, y_train) print("Time taken to run this cell :", datetime.now() - start) # In[ ]: alpha = [10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**1] # In[ ]: log_alpha = np.log([10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**1]) # In[ ]: train_score = grid_search_model.cv_results_['mean_train_score'] # In[ ]: test_score = grid_search_model.cv_results_['mean_test_score'] # In[ ]: plt.scatter(log_alpha,train_score,label="Train Score") plt.plot(log_alpha,train_score) plt.scatter(log_alpha,test_score,label="Test Score") plt.plot(log_alpha,test_score) plt.xlabel("Alpha : Hyperparameter") plt.ylabel("Micro F1 Score") plt.xticks(log_alpha,alpha) plt.legend() # In[ ]: best_alpha = grid_search_model.best_estimator_.get_params()['estimator__alpha'] print('Best alpha: ',best_alpha) # In[ ]: from sklearn.externals import joblib joblib.dump(classifier_2, 'drive/My 
Drive/Stackoverflow/gridhinge_with_more_title_weight.pkl') # In[ ]: get_ipython().system('grep -c ^processor /proc/cpuinfo') # In[ ]: start = datetime.now() classifier = OneVsRestClassifier(SGDClassifier(loss='hinge', alpha=0.00001, penalty='l1'),n_jobs=-1) classifier.fit(x_train_multilabel, y_train) predictions = classifier.predict (x_test_multilabel) print("Accuracy :",metrics.accuracy_score(y_test, predictions)) print("Hamming loss ",metrics.hamming_loss(y_test,predictions)) precision = precision_score(y_test, predictions, average='micro') recall = recall_score(y_test, predictions, average='micro') f1 = f1_score(y_test, predictions, average='micro') print("Micro-average quality numbers") print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1)) precision = precision_score(y_test, predictions, average='macro') recall = recall_score(y_test, predictions, average='macro') f1 = f1_score(y_test, predictions, average='macro') print("Macro-average quality numbers") print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1)) print (metrics.classification_report(y_test, predictions)) print("Time taken to run this cell :", datetime.now() - start) # In[ ]: from sklearn.externals import joblib joblib.dump(classifier, 'drive/My Drive/Stackoverflow/MySgdhinge_with_more_title_weight.pkl') # ## Output # ### 1st - bag of words upto 4 grams SGDClassifier(loss='log') 0.5M data points # ### 2nd - bag of words upto 4 grams Logistic Regression 0.5M data points # ### 3rd - tfidf(1 to 3 grams) 200k data points Logistic Regression grid search hyperparameter: alpha # ### 4th - tfidf(1 to 3 grams) 200k data points (SGDClassifier with loss-hinge) grid search hyperparameter: alpha # In[ ]: # Please compare all your models using Prettytable library # http://zetcode.com/python/prettytable/ from prettytable import PrettyTable #If you get a ModuleNotFoundError error , install prettytable using: pip3 install prettytable x = PrettyTable() x.field_names = ["Data points","Vectorizer", "Model", "Hyper Parameter","Penalty", "Micro F1 Score","Hamming Loss","Macro F1 Score","Accuracy"] x.add_row(["500k","BOW(upto 4 grams)", "SGDClassifier(loss='log')","Fixed : 10^-5","L1",0.3544 ,0.0059,0.2634,0.092 ]) x.add_row(["500k","BOW(upto 4 grams)", "Logisitic Regression","Fixed:10^-5","L1", 0.4792 ,0.0031,0.3852,0.2101]) x.add_row(["200k","TFIDF(upto 3 grams)", "Logisitic Regression implementation using SGDclassifier(loss='log')","Best Alpha:10^-5" ,"L1", 0.6968,0.00205,0.2415,0.3487]) x.add_row(["200k","TFIDF(upto 3 grams)", "SGDClassifier(loss='hinge')","Best Alpha:10^-5","L1", 0.6943,0.0019605,0.1880,0.3644]) print(x) # ### Conclusion: # # 1. Due to high computation requirement , We have taken 0.5 M points for 2 models and 0.2 M for other two models and have considerd 90% tags i.e. 500 tags for all models. # 2. We have highest accuracy using SGDClassifier(loss='hinge') | Best Alpha:10^-5 | L1 # 3. Performance of SGDClassifier with hinge loss is better in terms of time required for execution and scores . # 4. It took almost 1/3 rd time for same data with SGDClassifier with hinge loss . # 5. Performance of SGDClassifier with log loss is better in terms of time required for execution Logistic Regression .