import os
import platform
# work with a tiny subset of the data for quick iteration
TINY = True
# set the root path depending on whether the notebook is running in Colab or locally
ROOT = '/content' if os.path.exists('/content') else os.path.expanduser('~')
name = 'jigsaw-unintended-bias-in-toxicity-classification'
path = os.path.join(ROOT, name)
# set env variables to be used for data extraction later
os.environ['name'] = name
os.environ['path'] = path
if platform.system() == 'Darwin':
    # work around the SSL context issue when making secure requests on macOS
    import ssl
    ssl._create_default_https_context = ssl._create_unverified_context
    # keep the tiny subset on a mac
    TINY = True
# make sure the Kaggle API credentials are present
if not os.path.exists(os.path.join(os.path.expanduser('~'), ".kaggle/kaggle.json")):
    raise Exception('kaggle.json not found in ~/.kaggle')
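If that check fails, the API token downloaded from the Kaggle account page needs to be written to `~/.kaggle/kaggle.json`. A minimal sketch (the username and key values are placeholders):

import json, stat
kaggle_dir = os.path.join(os.path.expanduser('~'), '.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)
token_path = os.path.join(kaggle_dir, 'kaggle.json')
with open(token_path, 'w') as f:
    json.dump({'username': 'YOUR_USERNAME', 'key': 'YOUR_API_KEY'}, f)
# the kaggle CLI warns unless the token is readable only by the owner
os.chmod(token_path, stat.S_IRUSR | stat.S_IWUSR)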
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
device(type='cuda')
import fastai
from fastai.basics import *
from fastai import text as fastai_text
if not os.path.exists(path):
    os.mkdir(path)
print("path is = {}".format(path))
path is = /home/ubuntu/jigsaw-unintended-bias-in-toxicity-classification
train_path = os.path.join(path, "train.csv")
if not os.path.exists(train_path):
    !echo 'path is = {path}'
    !cd $path && kaggle competitions download -c $name
    !ls $path/*.zip | xargs -I {} unzip {} -d $path
    !chmod +r $path/*.csv
df = pd.read_csv(train_path)
if TINY:
    df = df[:1000]
df.head()
 | id | target | comment_text | severe_toxicity | obscene | identity_attack | insult | threat | asian | atheist | ... | article_id | rating | funny | wow | sad | likes | disagree | sexual_explicit | identity_annotator_count | toxicity_annotator_count
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 59848 | 0.000000 | This is so cool. It's like, 'would you want yo... | 0.000000 | 0.0 | 0.000000 | 0.00000 | 0.0 | NaN | NaN | ... | 2006 | rejected | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 4 |
1 | 59849 | 0.000000 | Thank you!! This would make my life a lot less... | 0.000000 | 0.0 | 0.000000 | 0.00000 | 0.0 | NaN | NaN | ... | 2006 | rejected | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 4 |
2 | 59852 | 0.000000 | This is such an urgent design problem; kudos t... | 0.000000 | 0.0 | 0.000000 | 0.00000 | 0.0 | NaN | NaN | ... | 2006 | rejected | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 4 |
3 | 59855 | 0.000000 | Is this something I'll be able to install on m... | 0.000000 | 0.0 | 0.000000 | 0.00000 | 0.0 | NaN | NaN | ... | 2006 | rejected | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 4 |
4 | 59856 | 0.893617 | haha you guys are a bunch of losers. | 0.021277 | 0.0 | 0.021277 | 0.87234 | 0.0 | 0.0 | 0.0 | ... | 2006 | rejected | 0 | 0 | 0 | 1 | 0 | 0.0 | 4 | 47 |
5 rows × 45 columns
df['toxic'] = df['target'] >= 0.5
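The competition's `target` is the fraction of annotators who marked a comment toxic, so thresholding at 0.5 yields a binary label. Toxic comments are a small minority, so a quick balance check is worthwhile (a sketch, not run above):

# fraction of comments labelled toxic under the 0.5 threshold
df['toxic'].value_counts(normalize=True)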
df.columns
Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual', 'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu', 'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability', 'jewish', 'latino', 'male', 'muslim', 'other_disability', 'other_gender', 'other_race_or_ethnicity', 'other_religion', 'other_sexual_orientation', 'physical_disability', 'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date', 'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes', 'disagree', 'sexual_explicit', 'identity_annotator_count', 'toxicity_annotator_count', 'toxic'], dtype='object')
train_df, valid_df = train_test_split(df, test_size=0.2)
train_df.shape
(800, 46)
valid_df.shape
(200, 46)
train_df.iloc[0]['comment_text']
'Sadly, episodes I-III exist which put Star Wars in a pretty hard spot.'
train_df.columns
Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual', 'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu', 'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability', 'jewish', 'latino', 'male', 'muslim', 'other_disability', 'other_gender', 'other_race_or_ethnicity', 'other_religion', 'other_sexual_orientation', 'physical_disability', 'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date', 'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes', 'disagree', 'sexual_explicit', 'identity_annotator_count', 'toxicity_annotator_count', 'toxic'], dtype='object')
train_df.iloc[4]
id                                                                  240494
target                                                                   0
comment_text             With a light reduction of plomeek leaves and n...
severe_toxicity                                                          0
obscene                                                                  0
identity_attack                                                          0
insult                                                                   0
threat                                                                   0
asian                                                                  NaN
atheist                                                                NaN
bisexual                                                               NaN
black                                                                  NaN
buddhist                                                               NaN
christian                                                              NaN
female                                                                 NaN
heterosexual                                                           NaN
hindu                                                                  NaN
homosexual_gay_or_lesbian                                              NaN
intellectual_or_learning_disability                                    NaN
jewish                                                                 NaN
latino                                                                 NaN
male                                                                   NaN
muslim                                                                 NaN
other_disability                                                       NaN
other_gender                                                           NaN
other_race_or_ethnicity                                                NaN
other_religion                                                         NaN
other_sexual_orientation                                               NaN
physical_disability                                                    NaN
psychiatric_or_mental_illness                                          NaN
transgender                                                            NaN
white                                                                  NaN
created_date                                 2015-10-13 17:05:48.912469+00
publication_id                                                         111
parent_id                                                           240491
article_id                                                           32846
rating                                                            approved
funny                                                                    0
wow                                                                      0
sad                                                                      0
likes                                                                    0
disagree                                                                 0
sexual_explicit                                                          0
identity_annotator_count                                                 0
toxicity_annotator_count                                                 4
toxic                                                                False
Name: 423, dtype: object
data_bunch = fastai_text.data.TextClasDataBunch.from_df(
path=path,
train_df=train_df,
valid_df=valid_df,
label_cols=-1,
text_cols=2,
)
data_bunch.train_ds.classes
[False, True]
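fastai has numericalized the comments against its own vocabulary, which can be inspected directly (a quick sketch):

# the first ids are reserved for special tokens such as xxunk and xxpad
data_bunch.x.vocab.itos[:10]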
import torchtext.vocab as vocab
embedding_dim = 100
glove = vocab.GloVe(name='6B', dim=embedding_dim)
glove.vectors.shape
torch.Size([400000, 100])
vocab_size = len(data_bunch.x.vocab.itos)
vocab_size
3047
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim=64, dropout_rate=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2, 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.out = nn.Linear(64, 1)

    def forward(self, x):
        embeds = self.embedding(x)
        # hidden holds the BiLSTM output for every position in the sequence
        hidden, (last_hidden, last_cell) = self.lstm(embeds)
        out = self.linear(hidden)
        out = self.dropout(F.relu(out))
        out = self.out(out)
        # note: this squeezes to one score per position, not one per comment
        return out.squeeze()

    def init_embeds(self, embeddings):
        self.embedding.weight = nn.Parameter(embeddings)
model = BiLSTM(vocab_size, embedding_dim=embedding_dim, hidden_dim=64, dropout_rate=0.5)
model.init_embeds(glove.vectors)
del glove
model = model.to(device)
model
BiLSTM(
  (embedding): Embedding(3047, 100)
  (lstm): LSTM(100, 64, bidirectional=True)
  (linear): Linear(in_features=128, out_features=64, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5)
  (out): Linear(in_features=64, out_features=1, bias=True)
)
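One caveat worth flagging: `glove.vectors` is the full 400,000 × 100 GloVe matrix, and its row order has nothing to do with fastai's vocabulary ids, so the `init_embeds(glove.vectors)` call above pairs each vocab index with an arbitrary GloVe row. A sketch of an aligned initialization (it relies on `glove.stoi`, which torchtext's vectors expose, and must run before `del glove`; not what was run above):

# copy each fastai vocab token's GloVe vector into row i of a
# (vocab_size, embedding_dim) matrix; tokens missing from GloVe stay zero
aligned = torch.zeros(vocab_size, embedding_dim)
for i, token in enumerate(data_bunch.x.vocab.itos):
    if token in glove.stoi:
        aligned[i] = glove.vectors[glove.stoi[token]]
model.init_embeds(aligned)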
"""
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

epochs = 5
steps = 0

train_losses, test_losses = [], []
for e in range(epochs):
    running_loss = 0
    for x, labels in data_bunch.train_dl:
        optimizer.zero_grad()
        log_ps = model(x)
        loss = criterion(log_ps, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    else:
        for x, labels in data_bunch.valid_dl:
            log_ps = model(x)
            top_p, top_class = torch.exp(log_ps).topk(1, dim=1)
            accuracy = torch.mean(
                (top_class == labels.view(*top_class.shape)).type(
                    torch.FloatTensor)
            )
        print(f'epoch = {e} Accuracy = {accuracy.item()*100}% Running loss = {running_loss}')
"""
learner = Learner(data_bunch, model, loss_func=nn.NLLLoss(), metrics=[accuracy])
learner.lr_find()
LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.
learner.recorder.plot()
learner.fit_one_cycle(1)
epoch | train_loss | valid_loss | accuracy | time |
---|---|---|---|---|
0 | -1.517934 | -3.193337 | 0.740000 | 00:01 |
learner.fit(epochs=25)
epoch | train_loss | valid_loss | accuracy | time |
---|---|---|---|---|
0 | -11.986995 | -22.695547 | 0.825000 | 00:01 |
1 | -25.095106 | -51.403030 | 0.925000 | 00:01 |
2 | -43.715351 | -92.846794 | 0.925000 | 00:01 |
3 | -69.478432 | -148.985580 | 0.080000 | 00:01 |
4 | -103.488358 | -221.308472 | 0.925000 | 00:01 |
5 | -148.444092 | -310.500793 | 0.120000 | 00:01 |
6 | -203.879211 | -417.515259 | 0.925000 | 00:01 |
7 | -271.384247 | -541.972046 | 0.925000 | 00:01 |
8 | -352.546753 | -684.316467 | 0.925000 | 00:01 |
9 | -450.111816 | -844.593750 | 0.910000 | 00:01 |
10 | -560.828064 | -1022.578735 | 0.265000 | 00:01 |
11 | -688.058777 | -1217.918091 | 0.260000 | 00:01 |
12 | -827.425049 | -1429.867065 | 0.165000 | 00:01 |
13 | -985.060547 | -1658.507446 | 0.855000 | 00:01 |
14 | -1161.532349 | -1903.640991 | 0.205000 | 00:01 |
15 | -1352.526367 | -2164.700195 | 0.235000 | 00:01 |
16 | -1558.553467 | -2441.044434 | 0.210000 | 00:01 |
17 | -1780.528564 | -2732.953369 | 0.210000 | 00:01 |
18 | -2016.040161 | -3039.396240 | 0.790000 | 00:01 |
19 | -2270.288574 | -3360.536865 | 0.040000 | 00:01 |
20 | -2543.219238 | -3698.862305 | 0.230000 | 00:01 |
21 | -2825.636963 | -4050.409668 | 0.200000 | 00:01 |
22 | -3126.056152 | -4415.383301 | 0.890000 | 00:01 |
23 | -3440.700195 | -4794.647461 | 0.890000 | 00:01 |
24 | -3782.953125 | -5187.447266 | 0.890000 | 00:01 |
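The ever more negative losses are a red flag rather than progress: `nn.NLLLoss` expects log-probabilities (e.g. the output of `log_softmax`), but this model emits raw unbounded scores, so the objective can be pushed toward minus infinity while accuracy jumps around at random. A sketch of a setup whose loss is actually bounded for raw scores (hypothetical, not run above; it pools the per-position scores into one score per comment):

# BCEWithLogitsLoss applies the sigmoid internally, so raw scores are fine
criterion = nn.BCEWithLogitsLoss()
x, labels = next(iter(data_bunch.train_dl))
logits = model(x).mean(dim=1)            # (batch,) after averaging positions
loss = criterion(logits, labels.float())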
for x, label in data_bunch.train_dl:
    print(f'type of x = {type(x)}')
    print(f'type of label = {type(label)}')
    out = model(x)
    print(type(out))
    print(len(out))
    break
type of x = <class 'torch.Tensor'>
type of label = <class 'torch.Tensor'>
<class 'torch.Tensor'>
64
out.shape
torch.Size([64, 400])
label.shape
torch.Size([64])
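The shape mismatch confirms the diagnosis: the model returns 400 scores per comment (one per token position; note also that `nn.LSTM` defaults to sequence-first input, so the batch and sequence axes are silently swapped as well) against a single label per comment. A per-comment head would read the final hidden states of the two directions instead. A sketch of an alternative `forward`, assuming the LSTM is constructed with `batch_first=True` (not the model used above):

def forward(self, x):
    embeds = self.embedding(x)                   # (batch, seq, emb)
    _, (last_hidden, _) = self.lstm(embeds)      # last_hidden: (2, batch, hidden)
    # concatenate the last forward and backward hidden states
    h = torch.cat([last_hidden[0], last_hidden[1]], dim=1)   # (batch, hidden*2)
    out = self.dropout(F.relu(self.linear(h)))
    return self.out(out).squeeze(1)              # one score per comment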