%%capture
%load_ext autoreload
%autoreload 2
%matplotlib inline
# %cd ..
import sys
sys.path.append("..")
import statnlpbook.util as util
import statnlpbook.sequence as seq
import pandas as pd
import matplotlib
import warnings
warnings.filterwarnings('ignore')
matplotlib.rcParams['figure.figsize'] = (8.0, 5.0)
%%HTML
<style>
.rendered_html td {
    font-size: x-large;
    text-align: left !important;
}
.rendered_html th {
    font-size: x-large;
    text-align: left !important;
}
</style>
Sequence labelling assigns labels to each element in a sequence. For example, part-of-speech (PoS) tagging assigns each token in a sentence its Part-of-Speech tag:
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8
---|---|---|---|---|---|---|---
I | predict | I | won't | win | a | single | game
O | V | O | V | V | D | A | N
Another sequence labelling task is Named Entity Recognition: label tokens as beginning (B), inside (I), or outside (O) a named entity:
Barack | Obama | was | born | in | |
---|---|---|---|---|---|
B-PER | I-PER | O | O | O | B-LOC |
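For illustration, converting entity spans to a BIO label sequence can be done with a small helper (hypothetical code; it assumes spans are given as (start, end, type) triples with exclusive end):

def spans_to_bio(tokens, spans):
    labels = ["O"] * len(tokens)  # every token starts outside any entity
    for start, end, etype in spans:
        labels[start] = "B-" + etype          # entity-initial token
        for i in range(start + 1, end):
            labels[i] = "I-" + etype          # tokens inside the entity
    return labels

spans_to_bio(["Barack", "Obama", "was", "born", "in"], [(0, 2, "PER")])
# ['B-PER', 'I-PER', 'O', 'O', 'O']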
We model probability distributions over label sequences $\y$ conditioned on input sequences $\x$:

$$ s_{\params}(\x,\y) = \prob_\params(\y|\x) $$
Example data: PoS tagging for tweets from the Tweebank dataset.
def show_instance(x,y,begin=0,end=-1):
return pd.DataFrame([x[begin:end],y[begin:end]])
train = seq.load_tweebank("../data/oct27.splits/oct27.train")
dev = seq.load_tweebank("../data/oct27.splits/oct27.dev")
test = seq.load_tweebank("../data/oct27.splits/oct27.test")
show_instance(*train[0],0,12)
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11
---|---|---|---|---|---|---|---|---|---|---|---|---
0 | I | predict | I | won't | win | a | single | game | I | bet | on | . |
1 | O | V | O | V | V | D | A | N | O | V | P | , |
Tags (such as "O", "V" and "^") are described in the Tweebank annotation guidelines.
# count how often each tag appears, and collect an example context per tag
from collections import defaultdict
import pandas as pd
examples = {}              # one example context per tag
counts = defaultdict(int)  # tag frequencies
words = defaultdict(set)   # distinct words observed with each tag
for x,y in train:
for i in range(0, len(x)):
if y[i] not in examples:
examples[y[i]] = [x[j] + "/" + y[j] if i == j else x[j] for j in range(max(i-2,0),min(i+2,len(x)-1))]
counts[y[i]] += 1
words[y[i]].add(x[i])
sorted_tags = sorted(counts.items(),key=lambda x:-x[1])
sorted_tags_with_examples = [(t,c,len(words[t])," ".join(examples[t])) for t,c in sorted_tags]
sorted_tags_table = pd.DataFrame(sorted_tags_with_examples, columns=['Tag','Count','Unique Words','Example'])
sorted_tags_table[:10]
 | Tag | Count | Unique Words | Example
---|---|---|---|---
0 | V | 2219 | 873 | I predict/V I |
1 | N | 2003 | 1377 | a single game/N I |
2 | , | 1715 | 84 | bet on ./, Got |
3 | P | 1252 | 126 | I bet on/P . |
4 | O | 1063 | 97 | I/O predict |
5 | ^ | 890 | 741 | . Got Cliff/^ Lee |
6 | D | 869 | 68 | won't win a/D single |
7 | A | 755 | 449 | win a single/A game |
8 | @ | 713 | 694 | me RT @e_one/@ : |
9 | R | 689 | 217 | but I still/R hate |
A fully factorised or local model:
$$ p_\params(\y|\x) = \prod_{i=1}^n p_\params(y_i|\x,i) $$

We use a log-linear classifier $p_\params(y|\x,i)$ to predict the class for sentence $\x$ and position $i$:

$$ p_\params(y|\x,i) = \frac{1}{Z_{\x,i}} \exp \langle \repr(\x,i),\params_y \rangle $$

What are good features $\repr(\x,i)$?
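Before designing features, here is a minimal sketch of how such a log-linear classifier turns a sparse feature dictionary into label probabilities (a toy illustration, not the implementation inside seq.LocalSequenceLabeler; the weight layout is an assumption):

import math

def local_log_linear(feats, weights, labels):
    # score each label y via the dot product <repr(x,i), theta_y>
    scores = {y: sum(v * weights[y].get(f, 0.0) for f, v in feats.items())
              for y in labels}
    # normalise with the partition function Z
    z = sum(math.exp(s) for s in scores.values())
    return {y: math.exp(s) / z for y, s in scores.items()}

toy_weights = {'O': {'bias': 0.5, 'word:I': 2.0}, 'V': {'bias': 0.1}}
local_log_linear({'bias': 1.0, 'word:I': 1.0}, toy_weights, ['O', 'V'])
# {'O': 0.917..., 'V': 0.083...}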
show_instance(*train[0],0,12)
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11
---|---|---|---|---|---|---|---|---|---|---|---|---
0 | I | predict | I | won't | win | a | single | game | I | bet | on | . |
1 | O | V | O | V | V | D | A | N | O | V | P | , |
Bias: $$ \repr_0(\x,i) = 1 $$
Word at the token to be tagged: $$ \repr_w(\x,i) = \begin{cases}1 \text{ if }x_i=w \\\\ 0 \text{ else} \end{cases} $$
def feat_1(x,i):
return {
'bias':1.0,
'word:' + x[i]: 1.0,
}
local_1 = seq.LocalSequenceLabeler(feat_1, train, class_weight='balanced')
We can assess the accuracy of this model on the development set.
seq.accuracy(dev, local_1.predict(dev))
0.6964544889073191
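For reference, token-level accuracy is simply the fraction of tokens whose predicted label matches the gold label. A minimal re-implementation (a sketch, assuming this is what seq.accuracy computes) could look like:

def token_accuracy(data, guesses):
    # data: list of (x, y) pairs; guesses: list of predicted label sequences
    correct = sum(gold == guess
                  for (_, y), y_hat in zip(data, guesses)
                  for gold, guess in zip(y, y_hat))
    total = sum(len(y) for _, y in data)
    return correct / total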
Let us look at the confusion matrix:
seq.plot_confusion_matrix(dev, local_1.predict(dev))
The confusion matrix shows:

* "N" receives a lot of wrong counts
* "@" is a complete failure

Let us start with "@" ...
local_1.plot_lr_weights('@')
Features for specific users such as "word=@justinbieber" do not generalise well
How to address this?
def feat_2(x,i):
return {
**feat_1(x,i),
'first_at:' + str(x[i][0:1] == '@'): 1.0,
}
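To see what this feature function produces, we can apply it to a made-up token (the expected dictionary is shown as a comment):

feat_2(["@justinbieber", "rocks"], 0)
# {'bias': 1.0, 'word:@justinbieber': 1.0, 'first_at:True': 1.0}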
local_2 = seq.LocalSequenceLabeler(feat_2, train)
seq.accuracy(dev, local_2.predict(dev))
0.7484967862326353
To confirm that these results actually come from improved '@' prediction, let us look at the confusion matrix again:
seq.plot_confusion_matrix(dev, local_2.predict(dev))
Solved!
local_2.plot_lr_weights('@')
Other errors?
seq.plot_confusion_matrix(dev, local_2.predict(dev))
Look for errors with high frequency: for example, proper nouns ('^') frequently get mislabelled as common nouns ('N'). What do these errors look like?
util.Carousel(local_2.errors(dev[10:20],
filter_guess=lambda y: y=='N',
filter_gold=lambda y: y=='^'))
Senate | #ArtsGrades | are | in | ! |
^ | N | V | P | , |
N | N | V | P | , |
bias | first_at:False | word:Senate |
1.0 | 1.0 | 1.0 |
-2.93 | 0.30 | 1.67 |
-2.97 | 1.45 | 1.94 |
passed | and | who | made | the | Dirty | Dozen | . | #arts | http://t.co/BAh2iUL |
V | & | O | V | D | ^ | ^ | , | # | U |
N | & | O | V | D | N | N | , | N | N |
bias | first_at:False | word:Dirty |
1.0 | 1.0 | 1.0 |
-2.93 | 0.30 | 0.00 |
-2.97 | 1.45 | 0.00 |
and | who | made | the | Dirty | Dozen | . | #arts | http://t.co/BAh2iUL | via |
& | O | V | D | ^ | ^ | , | # | U | P |
& | O | V | D | N | N | , | N | N | P |
bias | first_at:False | word:Dozen |
1.0 | 1.0 | 1.0 |
-2.93 | 0.30 | 0.00 |
-2.97 | 1.45 | 0.00 |
29p | 11r | Pal | Gasol | went | da | fuck |
N | N | ^ | ^ | V | D | N |
N | N | N | N | V | D | V |
bias | first_at:False | word:Pal |
1.0 | 1.0 | 1.0 |
-2.93 | 0.30 | 0.00 |
-2.97 | 1.45 | 0.00 |
29p | 11r | Pal | Gasol | went | da | fuck | off |
N | N | ^ | ^ | V | D | N | P |
N | N | N | N | V | D | V | T |
bias | first_at:False | word:Gasol |
1.0 | 1.0 | 1.0 |
-2.93 | 0.30 | 0.00 |
-2.97 | 1.45 | 0.00 |
@comicsguy024 | I | don't | use | Chrome | due | to | the | lack |
@ | O | V | V | ^ | P | P | D | N |
@ | O | V | V | N | A | P | D | N |
bias | first_at:False | word:Chrome |
1.0 | 1.0 | 1.0 |
-2.93 | 0.30 | 0.00 |
-2.97 | 1.45 | 0.00 |
So | who's | going | to | the | Ethernet | Expo | next | week | in |
P | L | V | P | D | ^ | ^ | A | N | P |
P | L | V | P | D | N | ^ | A | N | P |
bias | first_at:False | word:Ethernet |
1.0 | 1.0 | 1.0 |
-2.93 | 0.30 | 0.00 |
-2.97 | 1.45 | 0.00 |
Ethernet | Expo | next | week | in | NYC | ? |
^ | ^ | A | N | P | ^ | , |
N | ^ | A | N | P | N | , |
bias | first_at:False | word:NYC |
1.0 | 1.0 | 1.0 |
-2.93 | 0.30 | 0.00 |
-2.97 | 1.45 | 0.00 |
X | ) | RT | @DarCoxaj | : | TETRIS | ! | (: | " |
E | E | ~ | @ | ~ | ^ | , | E | , |
N | , | ~ | @ | ~ | N | , | E | , |
bias | first_at:False | word:TETRIS |
1.0 | 1.0 | 1.0 |
-2.93 | 0.30 | 0.00 |
-2.97 | 1.45 | 0.00 |
tied | up | , | Physed | at | Forest | Green | - | have | a |
V | T | , | N | P | ^ | ^ | , | V | D |
N | T | , | N | P | N | A | , | V | D |
bias | first_at:False | word:Forest |
1.0 | 1.0 | 1.0 |
-2.93 | 0.30 | 0.00 |
-2.97 | 1.45 | 0.00 |
to | go | for | Halloween | on | fri | and | sat | ... | Thinking |
P | V | P | ^ | P | ^ | & | ^ | , | V |
P | V | P | ^ | P | N | & | V | , | N |
bias | first_at:False | word:fri |
1.0 | 1.0 | 1.0 |
-2.93 | 0.30 | 0.00 |
-2.97 | 1.45 | 0.00 |
fri | and | sat | ... | Thinking | pyramid | on | sat | ... |
^ | & | ^ | , | V | ^ | P | ^ | , |
N | & | V | , | N | N | P | V | , |
bias | first_at:False | word:pyramid |
1.0 | 1.0 | 1.0 |
-2.93 | 0.30 | 0.00 |
-2.97 | 1.45 | 0.00 |
Proper nouns tend to be capitalised!
def feat_3(x,i):
    return {
        **feat_2(x,i),
        # capitalisation feature: proper nouns tend not to be lower-case
        'is_lower:' + str(x[i].islower()): 1.0,
        # 'first_char:' + str(x[i][0:1]): 1.0
    }
local_3 = seq.LocalSequenceLabeler(feat_3, train)
seq.accuracy(dev, local_3.predict(dev))
0.771511507360564
This improvement indeed comes from being able to identify proper nouns when they are capitalised:
util.Carousel(local_3.errors(dev[10:20],
filter_guess=lambda y: y=='N',
filter_gold=lambda y: y=='^'))
# seq.find_contexts(train, lambda w: w == 'Senate')
Senate | #ArtsGrades | are | in | ! |
^ | N | V | P | , |
N | ^ | V | P | , |
bias | first_at:False | is_lower:False | word:Senate |
1.0 | 1.0 | 1.0 | 1.0 |
-2.45 | 0.85 | 0.01 | 0.78 |
-2.28 | 1.80 | -1.67 | 2.49 |
to | go | for | Halloween | on | fri | and | sat | ... | Thinking |
P | V | P | ^ | P | ^ | & | ^ | , | V |
P | V | P | ^ | P | N | & | V | , | ^ |
bias | first_at:False | is_lower:True | word:fri |
1.0 | 1.0 | 1.0 | 1.0 |
-2.45 | 0.85 | -2.46 | 0.00 |
-2.28 | 1.80 | -0.61 | 0.00 |
fri | and | sat | ... | Thinking | pyramid | on | sat | ... |
^ | & | ^ | , | V | ^ | P | ^ | , |
N | & | V | , | ^ | N | P | V | , |
bias | first_at:False | is_lower:True | word:pyramid |
1.0 | 1.0 | 1.0 | 1.0 |
-2.45 | 0.85 | -2.46 | 0.00 |
-2.28 | 1.80 | -0.61 | 0.00 |
Find more problems:
seq.plot_confusion_matrix(dev, local_3.predict(dev))
High-frequency error: verbs ('V') mislabelled as nouns ('N'). Inspect examples...
util.Carousel(local_3.errors(dev[:20],
filter_guess=lambda y: y=='N',
filter_gold=lambda y: y=='V'))
the | players | and | his | wife | own | smash | burger |
D | N | & | D | N | V | ^ | ^ |
D | N | & | D | N | N | N | N |
bias | first_at:False | is_lower:True | word:own |
1.0 | 1.0 | 1.0 | 1.0 |
-2.49 | 1.37 | -0.57 | -0.78 |
-2.28 | 1.80 | -0.61 | 2.30 |
RT | @TheRealQuailman | : | Currently | laughing | at | Laker | haters | . |
~ | @ | ~ | R | V | P | ^ | N | , |
~ | @ | ~ | ^ | N | P | ^ | N | , |
bias | first_at:False | is_lower:True | word:laughing |
1.0 | 1.0 | 1.0 | 1.0 |
-2.49 | 1.37 | -0.57 | 0.00 |
-2.28 | 1.80 | -0.61 | 0.00 |
@ShiversTheNinja | forgive | me | for | blowing | up |
@ | V | O | P | V | T |
@ | N | O | P | N | T |
bias | first_at:False | is_lower:True | word:forgive |
1.0 | 1.0 | 1.0 | 1.0 |
-2.49 | 1.37 | -0.57 | 0.00 |
-2.28 | 1.80 | -0.61 | 0.00 |
@ShiversTheNinja | forgive | me | for | blowing | up | your | youtube | comment |
@ | V | O | P | V | T | D | ^ | N |
@ | N | O | P | N | T | D | N | N |
bias | first_at:False | is_lower:True | word:blowing |
1.0 | 1.0 | 1.0 | 1.0 |
-2.49 | 1.37 | -0.57 | 0.00 |
-2.28 | 1.80 | -0.61 | 0.00 |
Question | : | How | CAN | you | mend | a | broken | heart | ? |
N | , | R | V | O | V | D | A | N | , |
^ | ~ | R | V | O | N | D | V | V | , |
bias | first_at:False | is_lower:True | word:mend |
1.0 | 1.0 | 1.0 | 1.0 |
-2.49 | 1.37 | -0.57 | 0.00 |
-2.28 | 1.80 | -0.61 | 0.00 |
last | night | , | but | didn't | bother | calling | Shawn | because | I'd |
A | N | , | & | V | V | V | ^ | P | L |
A | N | , | & | V | N | V | ^ | P | L |
bias | first_at:False | is_lower:True | word:bother |
1.0 | 1.0 | 1.0 | 1.0 |
-2.49 | 1.37 | -0.57 | 0.00 |
-2.28 | 1.80 | -0.61 | 0.00 |
are | in | ! | See | who | passed | and | who | made | the |
V | P | , | V | O | V | & | O | V | D |
V | P | , | V | O | N | & | O | V | D |
bias | first_at:False | is_lower:True | word:passed |
1.0 | 1.0 | 1.0 | 1.0 |
-2.49 | 1.37 | -0.57 | 0.00 |
-2.28 | 1.80 | -0.61 | 0.00 |
and | watch | the | news | and | tune | out | over | some | fresh |
& | V | D | N | & | V | T | P | D | A |
& | V | D | N | & | N | T | P | D | A |
bias | first_at:False | is_lower:True | word:tune |
1.0 | 1.0 | 1.0 | 1.0 |
-2.49 | 1.37 | -0.57 | -0.78 |
-2.28 | 1.80 | -0.61 | 2.30 |
that | , | regretfully | I | was | tied | up | , | Physed | at |
P | , | R | O | V | V | T | , | N | P |
P | , | N | O | V | N | T | , | ^ | P |
bias | first_at:False | is_lower:True | word:tied |
1.0 | 1.0 | 1.0 | 1.0 |
-2.49 | 1.37 | -0.57 | 0.00 |
-2.28 | 1.80 | -0.61 | 0.00 |
This suggests that the word has not appeared (or has not appeared as a verb) in the training set!

However, we can often tell from a word's ending (e.g. "-ing", "-ed") that it may be a verb. Let us incorporate suffixes as features!
def feat_4(x,i):
    return {
        **feat_3(x,i),
        # suffix features: endings such as "-ing" or "-ed" indicate verbs
        'last_3:' + x[i][-3:]: 1.0,
        'last_2:' + x[i][-2:]: 1.0,
    }
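Again we can inspect the features for a single made-up token:

feat_4(["laughing"], 0)
# {'bias': 1.0, 'word:laughing': 1.0, 'first_at:False': 1.0,
#  'is_lower:True': 1.0, 'last_3:ing': 1.0, 'last_2:ng': 1.0}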
local_4 = seq.LocalSequenceLabeler(feat_4, train)
seq.accuracy(dev, local_4.predict(dev))
0.7876840140991085
util.Carousel(local_4.errors(dev[:20],
filter_guess=lambda y: y=='N',
filter_gold=lambda y: y=='V' ))
the | players | and | his | wife | own | smash | burger |
D | N | & | D | N | V | ^ | ^ |
D | N | & | D | N | N | V | N |
bias | first_at:False | is_lower:True | last_2:wn | last_3:own | word:own |
1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
-3.17 | 1.01 | -1.03 | -0.15 | 0.07 | -0.28 |
-2.96 | 1.82 | -1.02 | 1.01 | -0.45 | 2.63 |
Question | : | How | CAN | you | mend | a | broken | heart | ? |
N | , | R | V | O | V | D | A | N | , |
N | ~ | R | V | O | N | D | V | V | , |
bias | first_at:False | is_lower:True | last_2:nd | last_3:end | word:mend |
1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
-3.17 | 1.01 | -1.03 | 0.39 | 2.05 | 0.00 |
-2.96 | 1.82 | -1.02 | 1.02 | 1.57 | 0.00 |
last | night | , | but | didn't | bother | calling | Shawn | because | I'd |
A | N | , | & | V | V | V | ^ | P | L |
A | N | , | & | V | N | V | N | P | L |
bias | first_at:False | is_lower:True | last_2:er | last_3:her | word:bother |
1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
-3.17 | 1.01 | -1.03 | -0.47 | -1.53 | 0.00 |
-2.96 | 1.82 | -1.02 | 2.08 | -1.21 | 0.00 |
and | watch | the | news | and | tune | out | over | some | fresh |
& | V | D | N | & | V | T | P | D | A |
& | V | D | N | & | N | T | P | D | A |
bias | first_at:False | is_lower:True | last_2:ne | last_3:une | word:tune |
1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
-3.17 | 1.01 | -1.03 | 0.17 | -0.27 | -0.27 |
-2.96 | 1.82 | -1.02 | 0.95 | 1.48 | 1.48 |
We have dependencies between consecutive labels: after a non-possessive pronoun ("O") such as "I", a verb ("V") is more likely than a noun ("N"), as the quick check below shows. A fully local model cannot directly capture this.
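We can verify this dependency on the training data by counting label bigrams (a quick check over the (x, y) pairs in train):

from collections import Counter

# count how often each label follows each other label
bigrams = Counter((y[i - 1], y[i]) for _, y in train for i in range(1, len(y)))
# the labels that most often follow a non-possessive pronoun ("O")
sorted(((nxt, c) for (prev, nxt), c in bigrams.items() if prev == 'O'),
       key=lambda p: -p[1])[:3]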
A Maximum Entropy Markov Model (MEMM) addresses this: it is a product of local logistic regression (aka Maximum Entropy) classifiers $\prob_\params(y_i|\x,y_{i-1},i)$, each with access to the previous label.
Log-linear version with access to previous label:
$$ p_\params(y_i|\x,y_{i-1},i) = \frac{1}{Z_{\x,y_{i-1},i}} \exp \langle \repr(\x,y_{i-1},i),\params_{y_i} \rangle $$

where $Z_{\x,y_{i-1},i}=\sum_y \exp \langle \repr(\x,y_{i-1},i),\params_{y} \rangle$ is a local per-token normalisation factor.
Optimising the conditional likelihood

$$ \sum_{(\x,\y) \in \train} \log \prob_\params(\y|\x) $$

decomposes nicely:

$$ \sum_{(\x,\y) \in \train} \sum_{i=1}^{|\x|} \log \prob_\params(y_i|\x,y_{i-1},i) $$

This makes the model easy to train.
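Concretely, because the likelihood decomposes per token, training reduces to fitting a single classifier on one example per token, with the gold previous label available as an input. A sketch of how such training pairs could be assembled (a hypothetical helper, not the seq internals; feat is a feature function with the signature of memm_feat_1 below):

def make_memm_training_data(data, feat, start_label="PAD"):
    # one classification example per token: (features, gold label)
    examples = []
    for x, y in data:
        for i in range(len(x)):
            prev = y[i - 1] if i > 0 else start_label  # gold previous label
            examples.append((feat(x, i, [prev]), y[i]))
    return examples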
Let's specify a MEMM using our best local features together with the previous label:
def memm_feat_1(x,i,hist):
    return {
        **feat_4(x,i),
        'prev_y': hist[0],  # the previous label
    }
memm_1 = seq.MEMMSequenceLabeler(memm_feat_1, train, order=1, C=10)
To predict the best label sequence we find a $\y^*$ with maximal conditional probability:

$$ \y^* =\argmax_\y \prob_\params(\y|\x). $$

We cannot simply choose each label in isolation, because the decisions depend on each other.
A simple alternative is greedy decoding: predict one label at a time, from left to right:
memm_1.predict_next(["the","man"],0,[])
'D'
memm_1.predict_next(["the","man"],1,['D'])
'N'
def memm_greedy_predict(memm: seq.MEMMSequenceLabeler, data, use_gold_history=False):
    """Greedy left-to-right decoding; optionally condition on the gold label history."""
    result = []
    for x, y in data:
        y_guess = []
        for i in range(0, len(x)):
            # condition on our own previous predictions, or on the gold labels
            prediction = memm.predict_next(x, i, y_guess if not use_gold_history else y)
            y_guess.append(prediction)
        result.append(y_guess)
    return result
seq.accuracy(dev,memm_greedy_predict(memm_1, dev))
0.8100767157370931
Some Noun vs Verb errors fixed:
util.Carousel(seq.errors(dev[:20], memm_greedy_predict(memm_1, dev[:20]),
'V', 'N',model=memm_1))
the | players | and | his | wife | own | smash | burger |
D | N | & | D | N | V | ^ | ^ |
D | N | & | D | N | N | V | N |
bias | first_at:False | is_lower:True | last_2:wn | last_3:own | prev_y | word:own |
1.0 | 1.0 | 1.0 | 1.0 | 1.0 | N | 1.0 |
-3.36 | 0.90 | -1.11 | -0.16 | -0.02 | 0.00 | -0.01 |
-3.01 | 1.52 | -1.15 | 1.11 | -0.52 | 0.00 | 1.41 |
and | watch | the | news | and | tune | out | over | some | fresh |
& | V | D | N | & | V | T | P | D | A |
& | V | D | N | & | N | P | P | D | A |
bias | first_at:False | is_lower:True | last_2:ne | last_3:une | prev_y | word:tune |
1.0 | 1.0 | 1.0 | 1.0 | 1.0 | & | 1.0 |
-3.36 | 0.90 | -1.11 | -0.04 | -0.01 | 0.00 | -0.01 |
-3.01 | 1.52 | -1.15 | 0.59 | 0.78 | 0.00 | 0.78 |
For the case of verbs ('V') we observe a high weight for $f_{\text{prev\_y},\text{O}}$:
memm_1.plot_lr_weights('V',feat_filter=lambda s: s.startswith("prev_"))
Greedy decoding may lead to search errors: the returned $\y^*$ is not the highest-scoring global solution.
memm_1.predict_label_scores(["What","better","way"],0,[])[:3]
[('O', -0.21551259764203171), ('D', -2.0962312777396073), ('#', -3.7029947994500017)]
memm_1.predict_label_scores(["What","better","way"],1,['O'])[:3]
[('R', -0.5630160447367506), ('A', -1.0671842955591822), ('V', -2.6373919064817759)]
memm_1.predict_label_scores(["What","better","way"],2,['O','A'])[:3]
[('N', -0.036587380196569194), ('R', -3.8313002827728648), ('A', -5.9863085075420566)]
x = ["What","better","way"]
init_beam = []
for y, score in memm_1.predict_label_scores(x,1,['O']):
init_beam.append((('O',y),score))
beam_size = 3
beam = init_beam[:beam_size]
beam
[(('O', 'R'), -0.5630160447367506), (('O', 'A'), -1.0671842955591822), (('O', 'V'), -2.6373919064817759)]
new_beam = []
for prev_y, prev_s in beam:
for y,s in memm_1.predict_label_scores(x,2,prev_y):
new_beam.append((prev_y + (y,), prev_s + s))
sorted(new_beam, key=lambda p: -p[1])[:3]
[(('O', 'A', 'N'), -1.1037716757557514), (('O', 'R', 'N'), -1.1870333223939724), (('O', 'R', 'R'), -1.9702637950583792)]
def memm_beam_search(memm, x, width=2):
beam = [([],0.)]
history = [beam]
for i in range(0, len(x)):
        # collect all candidate extensions of the beam (a priority queue would be more efficient)
candidates = []
for (prev,score) in beam:
scores = memm.predict_scores(x, i, prev)
for label_index,label_score in enumerate(scores):
candidates.append((prev + [memm.labels()[label_index]], score + label_score))
beam = sorted(candidates, key=lambda x: -x[1])[:width]
history.append(beam)
return beam, history
def batch_predict(data, beam_predictor):
return [beam_predictor(x)[0][0][0] for x,y in data]
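As a quick smoke test on the first development instance (the best beam entry is a (label sequence, cumulative log-probability) pair):

beam, _ = memm_beam_search(memm_1, dev[0][0], width=3)
beam[0]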
Full example, using a beam of width 1 (equivalent to greedy decoding):
example = 56
beam, history = memm_beam_search(memm_1, dev[example][0],1)
seq.render_beam_history(history, dev[example], end=17)
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
0.00 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | -0.10 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | N | -0.73 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | N | N | -1.40 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | N | N | P | -1.41 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | N | N | P | N | -1.76 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | N | N | P | N | , | -1.77 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | N | N | P | N | , | O | -1.93 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | N | N | P | N | , | O | R | -2.49 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | N | N | P | N | , | O | R | N | -3.11 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | N | N | P | N | , | O | R | N | P | -3.12 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | N | N | P | N | , | O | R | N | P | N | -3.73 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | N | N | P | N | , | O | R | N | P | N | P | -3.78 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | N | N | P | N | , | O | R | N | P | N | P | V | -3.88 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | N | N | P | N | , | O | R | N | P | N | P | V | P | -4.07 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | N | N | P | N | , | O | R | N | P | N | P | V | P | P | -4.07 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | N | N | P | N | , | O | R | N | P | N | P | V | P | P | Z | -4.83 |
Does this help?
seq.accuracy(dev, batch_predict(dev, lambda x: memm_beam_search(memm_1, x, 10)))
0.8127721335268505
Beam search is wasteful for first-order models; instead we can use the Viterbi algorithm.
Consider a beam of size 2:
example = 56
beam, history = memm_beam_search(memm_1, dev[example][0],2)
seq.render_beam_history(history, dev[example], end=17)
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
0.00 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | -0.10 |
^ | -3.44 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | N | -0.73 |
A | A | -1.01 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | -1.18 |
A | N | N | -1.40 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | -1.19 |
A | N | N | P | -1.41 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | -1.54 |
A | N | N | P | N | -1.76 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | -1.54 |
A | N | N | P | N | , | -1.77 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | -1.70 |
A | N | N | P | N | , | O | -1.93 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | R | -2.26 |
A | N | N | P | N | , | O | R | -2.49 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | R | N | -2.89 |
A | N | N | P | N | , | O | R | N | -3.11 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | R | N | P | -2.89 |
A | N | N | P | N | , | O | R | N | P | -3.12 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | R | N | P | N | -3.51 |
A | N | N | P | N | , | O | R | N | P | N | -3.73 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | R | N | P | N | P | -3.56 |
A | N | N | P | N | , | O | R | N | P | N | P | -3.78 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | R | N | P | N | P | V | -3.65 |
A | N | N | P | N | , | O | R | N | P | N | P | V | -3.88 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | R | N | P | N | P | V | P | -3.84 |
A | N | N | P | N | , | O | R | N | P | N | P | V | P | -4.07 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | R | N | P | N | P | V | P | P | -3.85 |
A | N | N | P | N | , | O | R | N | P | N | P | V | P | P | -4.07 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | R | N | P | N | P | V | P | P | Z | -4.61 |
A | A | N | P | N | , | O | R | N | P | N | P | V | P | P | L | -4.80 |
Histories differ in early positions, but does all the past matter?
memm_1.predict_label_scores(["What","better","way"],2,['O','R'])[:3]
[('N', -0.62401727765722192), ('R', -1.4072477503216287), ('V', -1.8299397831261677)]
The past only matters up to the previous label.
Viterbi Algorithm = Remember only the best history per last label
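Before implementing it, we can check the first-order property concretely (assuming predict_label_scores accepts any history prefix, as in the cells above): the local scores at position $i$ agree whenever the previous label agrees, regardless of earlier history.

x = ["What", "better", "way"]
# two histories that differ everywhere except in the previous label 'R'
scores_a = memm_1.predict_label_scores(x, 2, ['O', 'R'])
scores_b = memm_1.predict_label_scores(x, 2, ['D', 'R'])
scores_a[:3] == scores_b[:3]  # expected: True for a first-order MEMM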
from collections import defaultdict
import math
def memm_viterbi_search(memm, x, width=2):
labels = memm.labels()
# initialise
alpha = [{}]
beta = [{}]
for label_index, label_score in enumerate(memm.predict_scores_hist(x, 0, ["PAD"])):
label = labels[label_index]
alpha[0][label] = label_score
beta[0][label] = "PAD"
# prune
seq.prune_alpha_beta(alpha[0], beta[0], width)
# recursion
for i in range(1, len(x)):
alpha.append(defaultdict(lambda: -math.inf))
beta.append({})
for p in alpha[i-1].keys():
for label_index, label_score in enumerate(memm.predict_scores_hist(x, i, [p])):
label = labels[label_index]
new_score = alpha[i-1][p] + label_score
if new_score > alpha[i][label]:
alpha[i][label] = new_score
beta[i][label] = p
# prune
seq.prune_alpha_beta(alpha[i], beta[i], width)
# convert to beam history to be used in the same way beam search was used.
history = seq.convert_alpha_beta_to_history(x, alpha, beta)
return history[-1], history
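Note that with the width set to the full label set, the pruning step keeps every label, so the search amounts to exact first-order Viterbi decoding. A quick way to try this (using the helpers defined above):

# exact decoding: keep the best history for every possible previous label
full_width = len(memm_1.labels())
beam, history = memm_viterbi_search(memm_1, dev[example][0], width=full_width)
beam[0]  # best (label sequence, log-probability) pair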
In action:
beam, history = memm_viterbi_search(memm_1, dev[example][0],2)
seq.render_beam_history(history, dev[example], 17)
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
0.00 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | -0.10 |
^ | -3.44 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | N | -0.73 |
A | A | -1.01 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | -1.18 |
A | N | V | -1.98 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | -1.19 |
A | A | N | N | -7.50 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | -1.54 |
A | A | N | P | ^ | -2.57 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | -1.54 |
A | A | N | P | ^ | ^ | -5.17 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | -1.70 |
A | A | N | P | N | , | D | -4.21 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | R | -2.26 |
A | A | N | P | N | , | O | A | -2.77 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | A | N | -2.81 |
A | A | N | P | N | , | O | R | R | -3.67 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | A | N | P | -2.81 |
A | A | N | P | N | , | O | A | N | N | -9.36 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | A | N | P | N | -3.42 |
A | A | N | P | N | , | O | A | N | P | V | -4.07 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | A | N | P | N | P | -3.48 |
A | A | N | P | N | , | O | A | N | P | N | V | -6.93 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | A | N | P | N | P | V | -3.57 |
A | A | N | P | N | , | O | A | N | P | N | P | N | -6.18 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | A | N | P | N | P | V | P | -3.76 |
A | A | N | P | N | , | O | A | N | P | N | P | V | T | -5.41 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | A | N | P | N | P | V | P | P | -3.77 |
A | A | N | P | N | , | O | A | N | P | N | P | V | P | ^ | -10.03 |
Happy | International | Year | of | Biodiversity | ! | What | better | way | to | celebrate | than | tuning | in | to | CropLife's | Biodiversity |
A | A | N | P | N | , | O | A | N | P | V | P | V | T | P | Z | ^ |
A | A | N | P | N | , | O | A | N | P | N | P | V | P | P | Z | -4.52 |
A | A | N | P | N | , | O | A | N | P | N | P | V | P | P | L | -4.71 |
Now, does this help?
seq.accuracy(dev, batch_predict(dev, lambda x: memm_viterbi_search(memm_1, x, 2)))
0.8138088326767572
Check Models on Test Set:
pd.DataFrame([
["word", seq.accuracy(test, local_1.predict(test))],
["+ first @", seq.accuracy(test, local_2.predict(test))],
["+ cap", seq.accuracy(test, local_3.predict(test))],
["+ suffix", seq.accuracy(test, local_4.predict(test))],
["MEMM", seq.accuracy(test, memm_1.predict(test))],
])
 | 0 | 1
---|---|---
0 | word | 0.703440 |
1 | + first @ | 0.757271 |
2 | + cap | 0.776007 |
3 | + suffix | 0.796980 |
4 | MEMM | 0.811102 |