%%capture
%load_ext autoreload
%autoreload 2
%matplotlib inline
# %cd ..
import sys
sys.path.append("..")
import statnlpbook.util as util
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
import random
from collections import defaultdict
random.seed(2)
%%HTML
<style>
td,th {
font-size: x-large;
text-align: left;
}
</style>
Automatically classify input text into a set of atomic classes
Simplest instance of structured prediction
Let us focus on a specific task: sentiment analysis
from os import listdir
from os.path import isfile, join
def load_from_dir(directory, label):
    """
    Load documents from a directory, and give them all the same label `label`.
    Params:
        directory: the directory to load the documents from.
        label: the label to assign to each document.
    Returns:
        a list of (x,y) pairs where x is a tokenised document (list of words), and y is the label `label`.
    """
    result = []
    for file in listdir(directory):
        with open(directory + file, 'r') as f:
            text = f.read()
            tokens = [t.strip() for t in text.split()]
            result.append((tokens, label))
    return result
data_pos = load_from_dir('../data/rt-2k/txt_sentoken/pos/', 'pos')
data_neg = load_from_dir('../data/rt-2k/txt_sentoken/neg/', 'neg')
data_all = data_pos + data_neg
Let us look at some example data ...
" ".join(data_neg[11][0][:200])
"in 1970s , many european intellectuals , especially those on the left political hemisphere , became obsessed with the rise of fascism . which wasn't so hard to expect , because the social turmoil of 1960s and economic decline of 1970s seemed to be the breeding ground for many dangerous ideologies . in such times , when political involvement could be associated with noble passion , many filmmakers tried to warn the present generations of dangers that lurk ahead by giving the look of pre-war europe and circumstances that led to phenomena like fascist italy and nazi germany . of course , there were authors who jumped on the bandwagon for other , less noble reasons . for them , moral depravity of fascism could be explained to the audience by explicitly showing sexual depravity of those era . which , naturally , made some of those films very popular among teen audience . one of such filmmakers was italian director tinto brass , who later made career shooting expensive , stylish soft porn . salon kitty , his 1976 film , is very losely based on the novel by peter nordern , book that deals with bizarre yet true"
Divide the dataset into training, development and test sets
random.seed(0)
shuffled = list(data_all)
random.shuffle(shuffled)
train, dev, test = shuffled[:1600], shuffled[1600:1800], shuffled[1800:]
len([(x,y) for (x,y) in train if y == 'pos']) # check balance
815
The simplest and most well-known text classification model is Naive Bayes. It
uses a distribution $p^{\text{NB}}_{\params}$ for $s_\params$:
\begin{equation} s_{\params}(\x,y) = p^{\text{NB}}_{\params}(\x,y) \end{equation}
Naive Bayes is an instance of a generative model: it models the joint distribution of inputs $\x$ and labels $y$.
It makes a naive conditional independence assumption:
\begin{equation} p^{\text{NB}}_{\params}(\x|y) = \prod_{i=1}^{\text{length}(\x)} p^{\text{NB}}_{\params}(x_i|y) \end{equation}
that is, the observed words are conditionally independent of each other given the label $y$.
For example:
$$ p^{\text{NB}}_{\params}(\text{great,tremendous}\bar+) = p^{\text{NB}}_{\params}(\text{great}\bar+) p^{\text{NB}}_{\params}(\text{tremendous}\bar+) $$
The NB model has the parameters $\params=(\balpha,\bbeta)$ where
\begin{split} p^{\text{NB}}_{\params}(w|y) & = \alpha_{w,y} \\\\ p^{\text{NB}}_{\params}(y) & = \beta_{y}. \end{split}
Maximum Likelihood estimation:
\begin{split} \alpha_{w,y} & = \frac{\counts{\train}{w,y}}{\sum_{w'}\counts{\train}{w',y}}\\\\ \beta_{y} & = \frac{\counts{\train}{y}}{\left| \train \right|} \end{split}
MLE can produce zero probabilities for unseen events in two ways: a word $w$ may never appear with a label $y$ in the training set (so $\alpha_{w,y}=0$), or a label $y$ may never appear at all (so $\beta_y=0$).
Let $V$ be the full training set vocabulary, and $\gamma$ a pseudo-count, then
\begin{split} \alpha^\gamma_{w,y} & = \frac{\counts{\train}{w,y} + \gamma}{|V|\gamma + \sum_{w'}\counts{\train}{w',y}}\\\\ \end{split}
def train_nb(data, pseudo_count=0.0):
    alpha = defaultdict(float)
    beta = defaultdict(float)
    vocab = set(w for x,_ in data for w in x)
    labels = set(y for _,y in data)
    norm = 0
    for x,y in data:
        for w in x:
            beta[y] += 1.0       # count of tokens per label
            alpha[w,y] += 1      # count of word w with label y
            norm += 1            # total number of tokens
    for y in labels:
        for w in vocab:
            alpha[w,y] = (alpha[w,y] + pseudo_count) / (beta[y] + len(vocab) * pseudo_count)
    for y in list(beta.keys()):
        beta[y] = beta[y] / norm
    return (alpha, beta)
Train NB on data:
theta = (alpha, beta) = train_nb(train)
Inspect the learned parameters of the NB model!
The class prior $\bbeta$ looks sensible:
beta
defaultdict(float, {'neg': 0.46393907123093997, 'pos': 0.53606092876906})
The per-class word distributions $\balpha$:
def plot_top_k(alpha, label='pos', k=10):
    positive_words = [w for (w,y) in alpha.keys() if y == label]
    sorted_positive_words = sorted(positive_words, key=lambda w: -alpha[w,label])[:k]
    util.plot_bar_graph([alpha[w,label] for w in sorted_positive_words], sorted_positive_words, rotation=45)
plot_top_k(alpha, 'pos')
Fairly uninformative! The most likely words are common function words that appear in reviews of both classes.
Remove such words a priori, using a so-called stop-word list:
import string
stop_words = set(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', '\n', 'the'] + list(string.punctuation))
def filter_dataset(data):
    """
    Removes stop words from a dataset of (x,y) pairs.
    """
    return [([w for w in x if w not in stop_words], y) for x,y in data]
train_filtered = filter_dataset(train)
dev_filtered = filter_dataset(dev)
test_filtered = filter_dataset(test)
theta_filtered = (alpha_filtered, beta_filtered) = train_nb(train_filtered,pseudo_count=0.01)
Let us look at $\balpha$ again:
plot_top_k(alpha_filtered,'pos',k=20)
We now see some words that one would expect in positive movie reviews.
How about negative reviews?
plot_top_k(alpha_filtered,'neg', 20)
The negative reviews look fairly similar.
They mention "good" just about as often as the positive reviews.
There are subtle differences:
"bad" appears with high probability only in the negative reviews.
Why does "good" appear so often in negative reviews?
def show_context(word, label="neg", index=0, window=5):
    docs_with_word = [x for x,y in train if word in x and y==label]
    word_index = docs_with_word[index].index(word)
    return docs_with_word[index][max(word_index-window,0):min(word_index+window,len(docs_with_word[index]))]
" ".join(show_context("good", "neg", 2, 20))
'to be censored in some manner . in an early scene of the film , for example , the lead good guy and bad guy have a minor confrontation in the streets . when the good guy spouts out a'
Looking at the most likely words is not that helpful. Instead, look at the words with the largest difference in probability between the two classes:
def diff(alpha, w):
    return alpha[w,'pos'] - alpha[w,'neg']

def plot_discriminative_features(alpha, threshold=0.0, reverse=False):
    frequent_words = {w for ((w,y),p) in alpha.items() if p > threshold}
    sorted_by_ratio = sorted(frequent_words, key=lambda w: diff(alpha, w), reverse=reverse)[-20:]
    util.plot_bar_graph([diff(alpha,w) for w in sorted_by_ratio], sorted_by_ratio, rotation=45)
plot_discriminative_features(alpha_filtered,reverse=False)
Many of these words seem to match our intuition for words indicating positive reviews.
Given a trained NB model, how do we predict the class of a given text?
Like in MT and parsing, search for the $y\in\Ys$ with maximum a posteriori probability:
$$ \argmax_{y\in\Ys} \prob_\params(y|\x) = \argmax_{y\in\Ys} \frac{\prob(\x|y) \prob(y) }{ \prob(\x) } =\\ \argmax_{y\in\Ys} \prob(\x|y) \prob(y) $$
What to do with words outside of the training vocabulary?
from math import log, exp
def log_prob_nb(theta, x, y):
    """
    Calculates the log joint probability log(p_theta(x,y)).
    """
    alpha, beta = theta
    result = util.safe_log(beta[y])
    for w in x:
        if (w,y) in alpha:           # words outside the training vocabulary are skipped
            result += util.safe_log(alpha[w,y])
    return result
def predict_nb(theta, x):
    """
    Finds y^* = argmax_y p_theta(y|x)
    """
    if log_prob_nb(theta, x, 'pos') > log_prob_nb(theta, x, 'neg'):
        return 'pos'
    else:
        return 'neg'
i = 0
predict_nb(theta_filtered,train_filtered[i][0]), train_filtered[i][1]
('neg', 'neg')
Use accuracy, the ratio of the number of correctly predicted labels to the total number of instances.
$\y^*$ is the predicted sequence of labels, $\y$ the true labels:
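As a formula, for a dataset $D$ of $(\x,y)$ pairs, writing $y^*(\x)$ for the predicted label and $\delta(\cdot,\cdot)$ for the indicator that two labels agree:
$$ \text{accuracy}(D) = \frac{1}{\left|D\right|} \sum_{(\x,y)\in D} \delta\!\left(y^*(\x), y\right) $$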
def accuracy(data, guess):
    correct = 0
    for (x,y), y_guess in zip(data, guess):
        if y_guess == y:
            correct += 1
    return correct / len(data)

def batch_predict_nb(theta, data):
    return [predict_nb(theta, x) for x,_ in data]

def accuracy_nb(theta, data):
    return accuracy(data, batch_predict_nb(theta, data))
theta_filtered = (alpha_smoothed, beta_smoothed) = train_nb(train_filtered,pseudo_count=1.0)
accuracy_nb(theta_filtered, train_filtered), \
accuracy_nb(theta_filtered, dev_filtered)
(0.98625, 0.805)
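As a side check, one can sweep the smoothing strength and watch the development accuracy. A minimal sketch reusing `train_nb` and `accuracy_nb` from above; the grid of pseudo-counts is an arbitrary choice:
# Sketch: development accuracy as a function of the smoothing pseudo-count.
# Reuses train_nb, accuracy_nb, train_filtered and dev_filtered defined above.
for gamma in [0.001, 0.01, 0.1, 1.0, 10.0]:   # arbitrary grid, for illustration only
    theta_gamma = train_nb(train_filtered, pseudo_count=gamma)
    print(gamma, accuracy_nb(theta_gamma, dev_filtered))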
The Naive Bayes assumption can make sense, e.g.:
$\prob(\text{... great ... awesome}\bar +) \approx \ldots \prob(\text{great}\bar +) \ldots \prob(\text{awesome}\bar +) \ldots$
When is it violated?
What is $p(\text{Fiction}\bar \text{Pulp},+)$ according to NB?
$\x$ | $y$ |
---|---|
Pulp Fiction | + |
Pulp Fiction | + |
Pulp Fiction | + |
Fiction | - |
Fiction | - |
Pulp Fiction | - |
What is $p(\text{Fiction}\bar \text{Pulp},+)$ according to this data? What about $p(\text{Fiction}\bar +)$?
$1$, $\frac{1}{2}$
Therefore NB will underestimate $$ p(\text{Pulp Fiction}\bar +) $$
which should be
$$ p(\text{Pulp}\bar+,\text{Fiction})\, p(\text{Fiction}\bar +) = p(\text{Pulp}\bar+) $$
but is
$$ p(\text{Pulp}\bar +)\,p(\text{Fiction}\bar +) $$
This means a positive instance with "Pulp Fiction" may be misclassified
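A quick numeric check on the toy table above (a sketch; the counts are read off its six rows):
# Positive documents: three copies of "Pulp Fiction" -> 6 word tokens (3x Pulp, 3x Fiction).
p_pulp_pos = 3 / 6       # empirical p(Pulp | +)
p_fiction_pos = 3 / 6    # empirical p(Fiction | +)
nb_estimate = p_pulp_pos * p_fiction_pos     # NB estimate: 0.25
empirical = p_pulp_pos * 1.0                 # p(Pulp | +) * p(Fiction | Pulp, +) = 0.5
nb_estimate, empirical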
The problem can be partially addressed by using a bag of bigrams:
turn each document from a list of unigrams
into a list of bigrams, with the original unigrams appended:
def bigram_dataset(data):
    return [([tuple(x[i:i+2]) for i in range(0, len(x)-1)] + x, y) for x,y in data]
train_bigram = bigram_dataset(train_filtered)
dev_bigram = bigram_dataset(dev_filtered)
test_bigram = bigram_dataset(test_filtered)
train_bigram[0][0][:10]
[('paul', 'verhoeven'), ('verhoeven', 'dutch'), ('dutch', 'auteur'), ('auteur', 'dragged'), ('dragged', 'violent'), ('violent', 'sexually'), ('sexually', 'aggressive'), ('aggressive', 'aesthetic'), ('aesthetic', 'american'), ('american', 'film')]
theta_bigram = (alpha_bigram, beta_bigram) = train_nb(train_bigram, 1.0)
accuracy_nb(theta_bigram, dev_bigram)
0.82
Let us look at the most discriminative bigram features:
plot_discriminative_features({(f,y):p for (f,y),p in alpha_bigram.items() if isinstance(f, tuple)}, reverse=True)
# alpha_bigram
Observe that specific movie titles can become features.
For example, many reviews mention "Pulp Fiction".
What do such reviews look like?
docs_with_good = [x for x,y in train if 'pulp' in x and 'fiction' in x and y=='pos']
for doc_index in range(0, 14):
    good_index = docs_with_good[doc_index].index("pulp")
    print(" ".join(docs_with_good[doc_index][good_index-10:good_index+10]))
in essence , this is the science fiction equivolence of pulp fiction . the easiest way to write a review than anything actually shown--like most of the violence in " pulp fiction " . " se7en " is gory , , the film is paced at half the speed of pulp fiction , which avary co-wrote with quentin tarantino . this statement , but you need look no further than pulp fiction for an example ) . and , while since this is a gritty crime comedy , flashbacks of pulp fiction should arise ) . at first , i , stock & two smoking barrels ( 8/10 ) - pulp fiction ( 8/10 ) - reservoir dogs ( 9/10 incredibly fun to watch . it starts off as a pulp fiction-type crime story , with criminal brothers george clooney can be even more graphic ( e . g . pulp fiction ) . pulp fiction did it in a you couldn't find in any of the two dozen " pulp fiction " wannabes ; fortunately , the reappearance of but if most of those movies have their roots in pulp fiction , exploring a modern myth of the doomed the three years since the release of the groundbreaking success pulp fiction , the cinematic output from its creator , first movie quentin tarantino has directed since the highly touted pulp fiction . to say he has been inactive in me give you an example . do you know in pulp fiction where jules and vincent go on brain detail
There are still problems, like longer-distance dependencies:
$$ p(\text{Quentin} \bar \text{Pulp},+) $$
Can we do more with unigrams?
Let $\hat{\prob}$ be the empirical distribution
The problem is: $\pnb(\text{Pulp Fiction}\bar +)$ is too small
\begin{split} \pnb(\text{Pulp Fiction}\bar +) &= \pnb(\text{Pulp}\bar +)\pnb(\text{Fiction} \bar +) \\\\ &\approx \hat{\prob}(\text{Pulp}\bar +) \hat{\prob}(\text{Fiction} \bar +) \\\\ & < \hat{\prob}(\text{Pulp}\bar +) \hat{\prob}(\text{Fiction}\bar \Pulp, +) \\\\ &\approx \hat{\prob}(\text{Pulp}\bar +) \\\\ \end{split}
How to fix this for a unigram model?
If
$$ \pnb(\text{Pulp}\bar +)\approx \frac{\hat{\prob}(\text{Pulp}\bar +)}{\hat{\prob}(\text{Fiction}\bar +)} $$
we'd get
\begin{split} \pnb(\text{Pulp Fiction}\bar +) & \approx \frac{\hat{\prob}(\text{Pulp}\bar +)}{\hat{\prob}(\text{Fiction}\bar +)} \hat{\prob}(\text{Fiction}\bar +) \\\\ & = \hat{\prob}(\text{Pulp}\bar +) \end{split}
Note: $\pnb(\text{Pulp}\bar +)$ may be larger than 1.
But this is not the maximum likelihood estimate...
If you were to generate data from it, it could look like
$\x$ | $y$ |
---|---|
... Pulp Pulp ... | + |
... Pulp Pulp ... | + |
... Pulp Pulp ... | + |
... Fiction ... | - |
... Fiction ... | - |
... Pulp Fiction ... | - |
But it doesn't matter for us, because we get $\prob(\text{Pulp Fiction}|+)$ right and we may never encounter "Pulp Pulp" anyway.
Ignore generation, care about discrimination
How can we set probabilities algorithmically?
Turn it into an optimisation problem and
maximise $\prob(+ \bar \text{...Pulp Fiction...})$ directly
Directly optimise the conditional likelihood of the correct labels
$$ \mathit{CL}(\params) = \sum_{(\x,y) \in \train} \log(\prob_\params(y|\x)) = \sum_{(\x,y) \in \train} \log\left(\frac{\prob_\params(y,\x)}{\sum_{y'} \prob_\params(y',\x)}\right) $$
Unfortunately, this is less trivial to optimise (no closed-form solution).
We will present NB (and the models that follow) in log-space.
For log-linear models we need feature functions $f_i(\x)$ mapping input to a real value, e.g.
$$ f_{\text{Pulp}}(\x) = \counts{\x}{\text{Pulp}} $$i.e., the number of times Pulp appears in the input $\x$.
Features don't have to correspond to single words $$ f_{\text{Pulp Fiction}}(\x) = \counts{\x}{\text{Pulp Fiction}} $$
or even correspond to n-grams $$ f_{\text{RT}}(\x) = \text{Lowest rotten tomatoes score of all mentioned movies in }\x $$
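For instance, the word-count feature from above can be written directly. A minimal sketch; `f_pulp` is just an illustrative name (the documents are lowercased, hence "pulp"):
# Illustrative count feature: how often "pulp" occurs in the tokenised input x.
def f_pulp(x):
    return float(x.count("pulp"))

f_pulp(['pulp', 'fiction', 'pulp'])   # 2.0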
Assume we have a weight vector $\weights_y$ for each class $y$; then a log-linear model defines
$$ p_{\weights}(\x,y)= \frac{1}{Z} \exp \left( \sum_{i \in \mathcal{I}} f_i(\x) w_{y,i} \right) = \frac{1}{Z} \exp \langle \mathbf{f}(\x), \mathbf{w}_y \rangle $$
where $Z = \sum_{y,\x} \exp \langle \mathbf{f}(\x), \mathbf{w}_y \rangle$ is the partition function (or Zustandssumme), which is intractable in general.
Note: weights do not need to normalise
You can convert a Naive Bayes model into log-linear form with $$ f_{\text{w}}(\x) = \counts{\x}{w} $$ and $$ \weights_{w,y} = \log(\alpha_{w,y}) $$
and a bias feature $f_0(\x)=1$
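As a sanity check of this correspondence, here is a minimal sketch that rewrites the trained NB model as a linear score over bag-of-words counts and compares it with `log_prob_nb` (reusing `theta_filtered` and `util.safe_log` from above; `linear_score` is an illustrative helper):
# Sketch: the NB log joint probability as a linear score <f(x), w_y>.
alpha_nb, beta_nb = theta_filtered
def linear_score(x, y):
    counts = defaultdict(float)                   # f(x): bag-of-words counts
    for w in x:
        counts[w] += 1.0
    score = util.safe_log(beta_nb[y])             # bias feature f_0(x)=1 with weight log beta_y
    for w, c in counts.items():
        if (w, y) in alpha_nb:
            score += c * util.safe_log(alpha_nb[w, y])   # weight w_{w,y} = log alpha_{w,y}
    return score

x0 = train_filtered[0][0]
linear_score(x0, 'pos'), log_prob_nb(theta_filtered, x0, 'pos')   # should be equal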
We care only about conditional probabilities: \begin{equation} p_{\params}(y|\x)= \frac{1}{Z_\x} \exp \langle \mathbf{f}(\x), \mathbf{w}_y \rangle = \frac{1}{Z_\x} \exp s_\weights(\x,y) \end{equation}
with a tractable conditional normalizer:
$$ Z_\x=\sum_{y\in\Ys} \exp s_\weights(\x,y) $$where $s_\weights(\x,y)= \langle \mathbf{f}(\x), \mathbf{w}_y \rangle$ is the linear score of $\x$ and $y$.
For binary tasks this corresponds to the logistic regression model:
\begin{split} \frac{1}{Z_\x} \exp s_\weights(\x,y) & = \frac{\exp s_\weights(\x,y)}{\exp s_\weights(\x,+)+\exp s_\weights(\x,-)} \\\\ & = \operatorname{sigmoid}\left(s_\weights(\x,y) - s_\weights(\x,y')\right) \end{split}
where $y'$ denotes the other label.
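A quick numeric check of this identity (a sketch with arbitrary example scores):
# The softmax over two scores equals the sigmoid of the score difference.
from math import exp
def softmax2(s_plus, s_minus):
    return exp(s_plus) / (exp(s_plus) + exp(s_minus))
def sigmoid(z):
    return 1.0 / (1.0 + exp(-z))

s_plus, s_minus = 1.3, -0.4                            # arbitrary example scores
softmax2(s_plus, s_minus), sigmoid(s_plus - s_minus)   # the two values are equal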
The conditional log-likelihood in log-linear form:
$$ \mathit{CL}(\weights) = \sum_{(\x,y) \in \train} \log(\prob_\params(y|\x)) = \sum_{(\x,y) \in \train} \log \left (\frac{1}{Z_\x} \exp s_\weights(\x,y) \right) =\\ \sum_{(\x,y) \in \train} s_\weights(\x,y) - \log Z_\x. $$
p, f = "Pulp", "Fiction"
data = [((p,f),True),((p,f),True),((p,f),True), ((f,),False), ((f,),False),((p,f),False)]
from math import log, exp
def cl(data, w_p_true, w_f_true, w_true, l=0.0):
    # (Regularised) conditional log-likelihood of the toy data under a binary
    # log-linear model; the weights of the negative class are fixed to zero.
    loss = 0.0
    for x,y in data:
        count_p = len([w for w in x if w==p])
        count_f = len([w for w in x if w==f])
        score_true = w_true + count_p * w_p_true + count_f * w_f_true
        log_z = log(exp(score_true) + exp(0))
        # print(count_p)
        # print(count_f)
        # print(log_z)
        if y:
            loss += score_true - log_z
        else:
            loss += 0 - log_z
    return loss - l * (w_p_true * w_p_true + w_f_true * w_f_true + w_true * w_true)
def jl(data, w_p_true, w_f_true, w_true, w_p_false, w_f_false, w_false):
    # Joint log-likelihood: the scores are treated as log joint probabilities
    # (as in NB), so no normaliser is needed.
    loss = 0.0
    for x,y in data:
        count_p = len([w for w in x if w==p])
        count_f = len([w for w in x if w==f])
        score_true = w_true + count_p * w_p_true + count_f * w_f_true
        score_false = w_false + count_p * w_p_false + count_f * w_f_false
        if y:
            loss += score_true
        else:
            loss += score_false
    return loss
# cl(data,log(0.5),log(0.5),log(0.5))
import matplotlib.pyplot as plt
import mpld3
import numpy as np
# x = np.linspace(-10, 10, 100)
# cl_loss = np.vectorize(lambda w: cl(data,w,log(0.5),log(0.5),1))
Consider a log-linear model $$ \mathbf{f}(\x) = \begin{bmatrix} \counts{\x}{\text{Pulp}} \\ \counts{\x}{\text{Fiction}} \\ 1 \end{bmatrix} $$
and loss on the following data:
import pandas as pd
pd.DataFrame(data, columns = ["x","y"])
x | y | |
---|---|---|
0 | (Pulp, Fiction) | True |
1 | (Pulp, Fiction) | True |
2 | (Pulp, Fiction) | True |
3 | (Fiction,) | False |
4 | (Fiction,) | False |
5 | (Pulp, Fiction) | False |
What is $\pnb(\Pulp \bar +)$?
Set $w_{\Fiction,+}=\log 0.5$ and $w_{+}=\log 0.5$, and vary
$$ \prob(\Pulp \bar +) = \exp(w_{\Pulp,+}) $$
to show that increasing $w_{\Pulp,+}$ beyond $\log(0.5)$ improves the CL.
x = np.linspace(0.01, 20.0, 100)
cl_loss = np.vectorize(lambda p: cl(data,log(p),log(0.5),log(0.5)))
plt.plot(x, cl_loss(x))
Compare to the joint likelihood we optimised before:
$$ \sum_{(\x,y) \in \train} \log(\prob_\params(y,\x)) $$
and see that MLE encourages $w=\log(0.5)$.
x = np.linspace(0.01, 0.999, 100)
jl_loss = np.vectorize(lambda p: jl(data,log(p),log(1-p),log(0.5),
log(0.1),log(0.9),log(0.5)))
plt.plot(x, jl_loss(x))
Other structured prediction losses replace $\log Z_\x$ with other terms:
$$ \mathit{PERCEPTRON}(\weights) = \sum_{(\x,y) \in \train} s_\weights(\x,y) - s_\weights(\x,y^*(\x)) $$where $y^*(\x)=\argmax_{y\in\Ys} s_\weights(\x,y)$
Aka structured perceptron loss!
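For the toy example above, this objective can be written in the same style as `cl` and `jl`. A minimal sketch; as in `cl`, the score of the negative class is fixed to zero:
def perceptron(data, w_p_true, w_f_true, w_true):
    # Structured perceptron objective: gold score minus the highest score.
    loss = 0.0
    for x, y in data:
        count_p = len([w for w in x if w == p])
        count_f = len([w for w in x if w == f])
        score_true = w_true + count_p * w_p_true + count_f * w_f_true
        score_false = 0.0                      # score of the negative class, as in cl
        gold = score_true if y else score_false
        loss += gold - max(score_true, score_false)
    return loss

perceptron(data, log(0.5), log(0.5), log(0.5))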
Estimating a log-linear model can lead to overfitting.
We therefore regularise the model by penalising large weights, e.g. by maximising $\mathit{CL}(\weights) - \frac{1}{C}\left\|\weights\right\|^2_2$.
$C$ controls the inverse strength of the regularisation.
Both L1 and L2 regularisation have their strengths and weaknesses.
Vary regularisation:
x = np.linspace(0.01, 100.0, 100)
C = 10000.0
cl_loss = np.vectorize(lambda p: cl(data,log(p),log(0.5),log(0.5),1/C))
plt.plot(x, cl_loss(x))
Regularisation can be understood as maximum a posteriori (MAP) inference of the parameters under specific priors on the weights.
For example, L2 regularisation assumes a Gaussian prior.
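To see why, note that the log-density of a zero-mean Gaussian prior over the weights is a quadratic penalty (a standard derivation, sketched briefly):
$$ \log \mathcal{N}(\weights; \mathbf{0}, \sigma^2 I) = -\frac{1}{2\sigma^2}\left\|\weights\right\|_2^2 + \text{const} $$
so maximising $\mathit{CL}(\weights) + \log \mathcal{N}(\weights; \mathbf{0}, \sigma^2 I)$ amounts to the L2-regularised objective with $\frac{1}{C} = \frac{1}{2\sigma^2}$.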
There is no closed-form solution, so we use iterative methods such as gradient ascent.
This is "easy" because the objective is concave in the weights $\weights$.
x = np.linspace(-10, 10, 100)
C = 1
cl_loss = np.vectorize(lambda w: cl(data,w,log(0.5),log(0.5),1/C))
plt.plot(x, cl_loss(x))
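The plot shows the concave objective. To actually find its maximum, here is a minimal gradient-ascent sketch using a finite-difference gradient for simplicity (the step size, iteration count and regularisation constant are arbitrary choices):
# Sketch: maximise cl over w_{Pulp,+} by gradient ascent with a numerical gradient.
def finite_diff_ascent(objective, w_init, lr=0.1, steps=200, eps=1e-4):
    w = w_init
    for _ in range(steps):
        grad = (objective(w + eps) - objective(w - eps)) / (2 * eps)
        w += lr * grad                      # ascent step on the concave objective
    return w

C = 10.0                                    # arbitrary regularisation constant
objective = lambda w: cl(data, w, log(0.5), log(0.5), 1/C)
w_star = finite_diff_ascent(objective, w_init=log(0.5))
w_star, objective(w_star)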
Can we characterise the optimum?
def prob_log_reg(x, w_p_true, w_f_true, w_true):
    count_p = len([w for w in x if w==p])
    count_f = len([w for w in x if w==f])
    score_true = w_true + count_p * w_p_true + count_f * w_f_true
    log_z = log(exp(score_true) + exp(0))
    return exp(score_true - log_z)

def indi(pred, value=1.0):
    return value if pred else 0.0

def expectations(data, w_p_true, w_f_true, w_true):
    data_probs = [(x,
                   indi(y),
                   prob_log_reg(x, w_p_true, w_f_true, w_true),
                   indi(p in x and y),
                   indi(p in x, prob_log_reg(x, w_p_true, w_f_true, w_true)),
                   indi(f in x and y),
                   indi(f in x, prob_log_reg(x, w_p_true, w_f_true, w_true)),
                   cl([(x,y)], w_p_true, w_f_true, w_true))
                  for x,y in data]
    last_row = [('[Avg]',
                 sum(d[1] for d in data_probs)/6,
                 sum(d[2] for d in data_probs)/6,
                 sum(d[3] for d in data_probs)/6,
                 sum(d[4] for d in data_probs)/6,
                 sum(d[5] for d in data_probs)/6,
                 sum(d[6] for d in data_probs)/6,
                 sum(d[7] for d in data_probs)/6
                )]
    return pd.DataFrame(data_probs + last_row,
                        columns=['x','y','p(+)','#(Pulp) * p_data(+)','#(Pulp) * p(+)',
                                 '#(Fiction) * p_data(+)','#(Fiction) * p(+)','CL'])
#expectations(data, w_p_true=10, w_f_true=-10, w_true=1.1)
expectations(data, w_p_true=10, w_f_true=-10, w_true=1.1)
x | y | p(+) | #(Pulp) * p_data(+) | #(Pulp) * p(+) | #(Fiction) * p_data(+) | #(Fiction) * p(+) | CL | |
---|---|---|---|---|---|---|---|---|
0 | (Pulp, Fiction) | 1.0 | 0.750260 | 1.0 | 0.750260 | 1.0 | 0.750260 | -0.287335 |
1 | (Pulp, Fiction) | 1.0 | 0.750260 | 1.0 | 0.750260 | 1.0 | 0.750260 | -0.287335 |
2 | (Pulp, Fiction) | 1.0 | 0.750260 | 1.0 | 0.750260 | 1.0 | 0.750260 | -0.287335 |
3 | (Fiction,) | 0.0 | 0.000136 | 0.0 | 0.000000 | 0.0 | 0.000136 | -0.000136 |
4 | (Fiction,) | 0.0 | 0.000136 | 0.0 | 0.000000 | 0.0 | 0.000136 | -0.000136 |
5 | (Pulp, Fiction) | 0.0 | 0.750260 | 0.0 | 0.750260 | 0.0 | 0.750260 | -1.387335 |
6 | [Avg] | 0.5 | 0.500219 | 0.5 | 0.500173 | 0.5 | 0.500219 | -0.374936 |
Let's look at the $\mathit{CL}$ gradient:
\begin{split} \nabla_{\weights_{y'}} CL(\params) &= \sum_{(\x,y) \in \train} \mathbf{f}(\x) \delta(y,y') - p_\params(y'|\x) \mathbf{f}(\x) \\\\ &= \sum_{(\x,y) \in \train} \mathbf{f}(\x) \delta(y,y') - \sum_{(\x,y) \in \train} p_\params(y'|\x) \mathbf{f}(\x). \end{split}
At the solution $\weights^*$ this gradient is zero: the empirical feature counts (the first term) match the expected feature counts under the model (the second term).
In fact, the Conditional Likelihood solution is the unique maximum-entropy distribution whose feature expectations match the empirical ones.
In practice one uses one of the existing implementations; here: scikit-learn.
A log-linear model trained by maximising the CL corresponds to a logistic regression model,
so we can use the logistic regression implementation of scikit-learn.
Convert $\x \in \Xs$ to a (sparse) feature vector $\mathbf{f}(\x)$:
def feats(x):
    result = defaultdict(float)
    for w in x:
        result[w] += 1.0
    return result
feats(['pulp','fiction','fiction'])
defaultdict(float, {'fiction': 2.0, 'pulp': 1.0})
Apply to training and test instances:
from sklearn.feature_extraction import DictVectorizer
vectorizer = DictVectorizer()
train_X = vectorizer.fit_transform([feats(x) for x,_ in train_filtered])
dev_X = vectorizer.transform([feats(x) for x,_ in dev_filtered])
dev_X
<200x45329 sparse matrix of type '<class 'numpy.float64'>' with 52209 stored elements in Compressed Sparse Row format>
scikit-learn prefers numbers as classes:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train_Y = label_encoder.fit_transform([y for _,y in train_filtered])
dev_Y = label_encoder.transform([y for _,y in dev_filtered])
dev_Y[:10]
array([1, 1, 0, 0, 0, 1, 0, 0, 1, 1])
Train the logistic regression model with L1 regularisation and $C=1000$:
from sklearn.linear_model import LogisticRegression
import numpy as np
lr = LogisticRegression(C=1000, penalty="l1", random_state=1)
lr.fit(train_X, train_Y)
LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l1', random_state=1, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
Weights learned:
weights = vectorizer.inverse_transform(lr.coef_)[0]
sorted_weights = sorted(weights.items(), key=lambda t: t[1])
util.plot_bar_graph([w for _,w in sorted_weights[:20]],
[f for f,_ in sorted_weights[:20]],rotation=45)
More obvious discriminative features for the negative class
Positive weights?
util.plot_bar_graph([w for _,w in sorted_weights[-20:]],
[f for f,_ in sorted_weights[-20:]],rotation=45)
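The development-set accuracy of this L1-regularised model can be computed in the same way as for Naive Bayes (the exact number may depend on the scikit-learn version and solver):
lr_guess = label_encoder.inverse_transform(lr.predict(dev_X))
accuracy(dev_filtered, lr_guess)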
Discriminative training helps!
(Compare with 0.805 for the unigram NB model and 0.82 for the bigram NB model above.)
lr = LogisticRegression(C=1000, penalty="l2",random_state=1, tol=0.00001)
lr.fit(train_X, train_Y)
lr_guess = label_encoder.inverse_transform(lr.predict(dev_X))
accuracy(dev_filtered, lr_guess)
0.805