import os
import platform
# work with a tiny subset of the data for quick iteration
TINY = True
# set the root path depending on whether the notebook is running in Colab or locally
ROOT = '/content' if os.path.exists('/content') else os.path.expanduser('~')
name = 'jigsaw-unintended-bias-in-toxicity-classification'
path = os.path.join(ROOT, name)
# set env variables to be used for data extraction later
os.environ['name'] = name
os.environ['path'] = path
if platform.system() == 'Darwin':
    # work around the SSL context issue when making secure requests on macOS
    import ssl
    ssl._create_default_https_context = ssl._create_unverified_context
    # keep the tiny subset on a mac
    TINY = True
# make sure the Kaggle API credentials are present
if not os.path.exists(os.path.join(os.path.expanduser('~'), ".kaggle/kaggle.json")):
    raise Exception('kaggle.json not found in ~/.kaggle')
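If that check fails, the API token downloaded from the Kaggle account page needs to be written to `~/.kaggle/kaggle.json`. A minimal sketch (the username and key values are placeholders):

import json, stat
kaggle_dir = os.path.join(os.path.expanduser('~'), '.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)
token_path = os.path.join(kaggle_dir, 'kaggle.json')
with open(token_path, 'w') as f:
    json.dump({'username': 'YOUR_USERNAME', 'key': 'YOUR_API_KEY'}, f)
# the kaggle CLI warns unless the token is readable only by the owner
os.chmod(token_path, stat.S_IRUSR | stat.S_IWUSR)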
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
device(type='cuda')
import fastai
from fastai.basics import *
from fastai import text as fastai_text
if not os.path.exists(path):
    os.mkdir(path)
print("path is = {}".format(path))
path is = /home/ubuntu/jigsaw-unintended-bias-in-toxicity-classification
train_path = os.path.join(path, "train.csv")
if not os.path.exists(train_path):
    !echo 'path is = {path}'
    !cd $path && kaggle competitions download -c $name
    !ls $path/*.zip | xargs -I {} unzip {} -d $path
    !chmod +r $path/*.csv
df = pd.read_csv(train_path)
if TINY:
    df = df[:1000]
df.head()
 | id | target | comment_text | severe_toxicity | obscene | identity_attack | insult | threat | asian | atheist | ... | article_id | rating | funny | wow | sad | likes | disagree | sexual_explicit | identity_annotator_count | toxicity_annotator_count
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 59848 | 0.000000 | This is so cool. It's like, 'would you want yo... | 0.000000 | 0.0 | 0.000000 | 0.00000 | 0.0 | NaN | NaN | ... | 2006 | rejected | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 4 |
1 | 59849 | 0.000000 | Thank you!! This would make my life a lot less... | 0.000000 | 0.0 | 0.000000 | 0.00000 | 0.0 | NaN | NaN | ... | 2006 | rejected | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 4 |
2 | 59852 | 0.000000 | This is such an urgent design problem; kudos t... | 0.000000 | 0.0 | 0.000000 | 0.00000 | 0.0 | NaN | NaN | ... | 2006 | rejected | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 4 |
3 | 59855 | 0.000000 | Is this something I'll be able to install on m... | 0.000000 | 0.0 | 0.000000 | 0.00000 | 0.0 | NaN | NaN | ... | 2006 | rejected | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 4 |
4 | 59856 | 0.893617 | haha you guys are a bunch of losers. | 0.021277 | 0.0 | 0.021277 | 0.87234 | 0.0 | 0.0 | 0.0 | ... | 2006 | rejected | 0 | 0 | 0 | 1 | 0 | 0.0 | 4 | 47 |
5 rows × 45 columns
df['toxic'] = df['target'] >= 0.5
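The competition's `target` is the fraction of annotators who marked a comment toxic, so thresholding at 0.5 yields a binary label. Toxic comments are a small minority, so a quick balance check is worthwhile (a sketch, not run above):

# fraction of comments labelled toxic under the 0.5 threshold
df['toxic'].value_counts(normalize=True)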
df.columns
Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual', 'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu', 'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability', 'jewish', 'latino', 'male', 'muslim', 'other_disability', 'other_gender', 'other_race_or_ethnicity', 'other_religion', 'other_sexual_orientation', 'physical_disability', 'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date', 'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes', 'disagree', 'sexual_explicit', 'identity_annotator_count', 'toxicity_annotator_count', 'toxic'], dtype='object')
train_df, valid_df = train_test_split(df, test_size=0.2)
train_df.shape
(800, 46)
valid_df.shape
(200, 46)
train_df.iloc[0]['comment_text']
'Sadly, episodes I-III exist which put Star Wars in a pretty hard spot.'
train_df.columns
Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual', 'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu', 'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability', 'jewish', 'latino', 'male', 'muslim', 'other_disability', 'other_gender', 'other_race_or_ethnicity', 'other_religion', 'other_sexual_orientation', 'physical_disability', 'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date', 'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes', 'disagree', 'sexual_explicit', 'identity_annotator_count', 'toxicity_annotator_count', 'toxic'], dtype='object')
train_df.iloc[4]
id                                                                  240494
target                                                                   0
comment_text             With a light reduction of plomeek leaves and n...
severe_toxicity                                                          0
obscene                                                                  0
identity_attack                                                          0
insult                                                                   0
threat                                                                   0
asian                                                                  NaN
atheist                                                                NaN
bisexual                                                               NaN
black                                                                  NaN
buddhist                                                               NaN
christian                                                              NaN
female                                                                 NaN
heterosexual                                                           NaN
hindu                                                                  NaN
homosexual_gay_or_lesbian                                              NaN
intellectual_or_learning_disability                                    NaN
jewish                                                                 NaN
latino                                                                 NaN
male                                                                   NaN
muslim                                                                 NaN
other_disability                                                       NaN
other_gender                                                           NaN
other_race_or_ethnicity                                                NaN
other_religion                                                         NaN
other_sexual_orientation                                               NaN
physical_disability                                                    NaN
psychiatric_or_mental_illness                                          NaN
transgender                                                            NaN
white                                                                  NaN
created_date                                 2015-10-13 17:05:48.912469+00
publication_id                                                         111
parent_id                                                           240491
article_id                                                           32846
rating                                                            approved
funny                                                                    0
wow                                                                      0
sad                                                                      0
likes                                                                    0
disagree                                                                 0
sexual_explicit                                                          0
identity_annotator_count                                                 0
toxicity_annotator_count                                                 4
toxic                                                                False
Name: 423, dtype: object
data_bunch = fastai_text.data.TextClasDataBunch.from_df(
path=path,
train_df=train_df,
valid_df=valid_df,
label_cols=-1,
text_cols=2,
)
data_bunch.train_ds.classes
[False, True]
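fastai has numericalized the comments against its own vocabulary, which can be inspected directly (a quick sketch):

# the first ids are reserved for special tokens such as xxunk and xxpad
data_bunch.x.vocab.itos[:10]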
import torchtext.vocab as vocab
embedding_dim = 100
glove = vocab.GloVe(name='6B', dim=embedding_dim)
glove.vectors.shape
torch.Size([400000, 100])
vocab_size = len(data_bunch.x.vocab.itos)
vocab_size
3047
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim=64, dropout_rate=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2, 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.out = nn.Linear(64, 1)

    def forward(self, x):
        embeds = self.embedding(x)
        # hidden holds the BiLSTM output for every position in the sequence
        hidden, (last_hidden, last_cell) = self.lstm(embeds)
        out = self.linear(hidden)
        out = self.dropout(F.relu(out))
        out = self.out(out)
        # note: this squeezes to one score per position, not one per comment
        return out.squeeze()

    def init_embeds(self, embeddings):
        self.embedding.weight = nn.Parameter(embeddings)
model = BiLSTM(vocab_size, embedding_dim=embedding_dim, hidden_dim=64, dropout_rate=0.5)
model.init_embeds(glove.vectors)
del glove
model = model.to(device)
model
BiLSTM(
  (embedding): Embedding(3047, 100)
  (lstm): LSTM(100, 64, bidirectional=True)
  (linear): Linear(in_features=128, out_features=64, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5)
  (out): Linear(in_features=64, out_features=1, bias=True)
)
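One caveat worth flagging: `glove.vectors` is the full 400,000 × 100 GloVe matrix, and its row order has nothing to do with fastai's vocabulary ids, so the `init_embeds(glove.vectors)` call above pairs each vocab index with an arbitrary GloVe row. A sketch of an aligned initialization (it relies on `glove.stoi`, which torchtext's vectors expose, and must run before `del glove`; not what was run above):

# copy each fastai vocab token's GloVe vector into row i of a
# (vocab_size, embedding_dim) matrix; tokens missing from GloVe stay zero
aligned = torch.zeros(vocab_size, embedding_dim)
for i, token in enumerate(data_bunch.x.vocab.itos):
    if token in glove.stoi:
        aligned[i] = glove.vectors[glove.stoi[token]]
model.init_embeds(aligned)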
"""
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

epochs = 5
steps = 0

train_losses, test_losses = [], []
for e in range(epochs):
    running_loss = 0
    for x, labels in data_bunch.train_dl:
        optimizer.zero_grad()
        log_ps = model(x)
        loss = criterion(log_ps, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    else:
        for x, labels in data_bunch.valid_dl:
            log_ps = model(x)
            top_p, top_class = torch.exp(log_ps).topk(1, dim=1)
            accuracy = torch.mean(
                (top_class == labels.view(*top_class.shape)).type(
                    torch.FloatTensor)
            )
        print(f'epoch = {e} Accuracy = {accuracy.item()*100}% Running loss = {running_loss}')
"""
learner = Learner(data_bunch, model, loss_func=nn.NLLLoss(), metrics=[accuracy])
learner.lr_find()
LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.
learner.recorder.plot()
learner.fit_one_cycle(1)
epoch | train_loss | valid_loss | accuracy | time |
---|---|---|---|---|
0 | -1.517934 | -3.193337 | 0.740000 | 00:01 |
learner.fit(epochs=25)
epoch | train_loss | valid_loss | accuracy | time |
---|---|---|---|---|
0 | -11.986995 | -22.695547 | 0.825000 | 00:01 |
1 | -25.095106 | -51.403030 | 0.925000 | 00:01 |
2 | -43.715351 | -92.846794 | 0.925000 | 00:01 |
3 | -69.478432 | -148.985580 | 0.080000 | 00:01 |
4 | -103.488358 | -221.308472 | 0.925000 | 00:01 |
5 | -148.444092 | -310.500793 | 0.120000 | 00:01 |
6 | -203.879211 | -417.515259 | 0.925000 | 00:01 |
7 | -271.384247 | -541.972046 | 0.925000 | 00:01 |
8 | -352.546753 | -684.316467 | 0.925000 | 00:01 |
9 | -450.111816 | -844.593750 | 0.910000 | 00:01 |
10 | -560.828064 | -1022.578735 | 0.265000 | 00:01 |
11 | -688.058777 | -1217.918091 | 0.260000 | 00:01 |
12 | -827.425049 | -1429.867065 | 0.165000 | 00:01 |
13 | -985.060547 | -1658.507446 | 0.855000 | 00:01 |
14 | -1161.532349 | -1903.640991 | 0.205000 | 00:01 |
15 | -1352.526367 | -2164.700195 | 0.235000 | 00:01 |
16 | -1558.553467 | -2441.044434 | 0.210000 | 00:01 |
17 | -1780.528564 | -2732.953369 | 0.210000 | 00:01 |
18 | -2016.040161 | -3039.396240 | 0.790000 | 00:01 |
19 | -2270.288574 | -3360.536865 | 0.040000 | 00:01 |
20 | -2543.219238 | -3698.862305 | 0.230000 | 00:01 |
21 | -2825.636963 | -4050.409668 | 0.200000 | 00:01 |
22 | -3126.056152 | -4415.383301 | 0.890000 | 00:01 |
23 | -3440.700195 | -4794.647461 | 0.890000 | 00:01 |
24 | -3782.953125 | -5187.447266 | 0.890000 | 00:01 |
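The ever more negative losses are a red flag rather than progress: `nn.NLLLoss` expects log-probabilities (e.g. the output of `log_softmax`), but this model emits raw unbounded scores, so the objective can be pushed toward minus infinity while accuracy jumps around at random. A sketch of a setup whose loss is actually bounded for raw scores (hypothetical, not run above; it pools the per-position scores into one score per comment):

# BCEWithLogitsLoss applies the sigmoid internally, so raw scores are fine
criterion = nn.BCEWithLogitsLoss()
x, labels = next(iter(data_bunch.train_dl))
logits = model(x).mean(dim=1)            # (batch,) after averaging positions
loss = criterion(logits, labels.float())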
for x, label in data_bunch.train_dl:
    print(f'type of x = {type(x)}')
    print(f'type of label = {type(label)}')
    out = model(x)
    print(type(out))
    print(len(out))
    break
type of x = <class 'torch.Tensor'>
type of label = <class 'torch.Tensor'>
<class 'torch.Tensor'>
64
out.shape
torch.Size([64, 400])
label.shape
torch.Size([64])
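The shape mismatch confirms the diagnosis: the model returns 400 scores per comment (one per token position; note also that `nn.LSTM` defaults to sequence-first input, so the batch and sequence axes are silently swapped as well) against a single label per comment. A per-comment head would read the final hidden states of the two directions instead. A sketch of an alternative `forward`, assuming the LSTM is constructed with `batch_first=True` (not the model used above):

def forward(self, x):
    embeds = self.embedding(x)                   # (batch, seq, emb)
    _, (last_hidden, _) = self.lstm(embeds)      # last_hidden: (2, batch, hidden)
    # concatenate the last forward and backward hidden states
    h = torch.cat([last_hidden[0], last_hidden[1]], dim=1)   # (batch, hidden*2)
    out = self.dropout(F.relu(self.linear(h)))
    return self.out(out).squeeze(1)              # one score per comment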