In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/My Drive/finch/python/atis/main')
Mounted at /content/gdrive
In [2]:
!pip install python-crfsuite
Collecting python-crfsuite
  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
     |████████████████████████████████| 747kB 5.3MB/s 
Installing collected packages: python-crfsuite
Successfully installed python-crfsuite-0.9.7
In [3]:
from sklearn.metrics import f1_score, classification_report

import pycrfsuite
import pprint
In [4]:
def word2features(sent, i):
  word = sent[i]
  features = {
    'bias': 1.0,
    'word': word.lower(),
    'word[-4:]': word[-4:],
    'word[-3:]': word[-3:],
    'word[-2:]': word[-2:],
    'word[:4]': word[:4], 
    'word[:3]': word[:3],
    'word[:2]': word[:2],
    'len(word)': len(word),}
  if i > 0:
    word = sent[i - 1]
    features.update({
      '-1:word': word.lower(),
      '-1:word[-4:]': word[-4:],
      '-1:word[-3:]': word[-3:],
      '-1:word[-2:]': word[-2:],
      '-1:word[:4]': word[:4],      
      '-1:word[:3]': word[:3],
      '-1:word[:2]': word[:2],
      '-1:len(word)': len(word),})
  else:
    features['<bos>'] = True
  if i < len(sent) - 1:
    word = sent[i + 1]
    features.update({
      '+1:word': word.lower(),
      '+1:word[-4:]': word[-4:],
      '+1:word[-3:]': word[-3:],
      '+1:word[-2:]': word[-2:],
      '+1:word[:4]': word[:4],
      '+1:word[:3]': word[:3],
      '+1:word[:2]': word[:2],
      '+1:len(word)': len(word),})
  else:
    features['<eos>'] = True
  if i > 1:
    word = sent[i - 2]
    features.update({
      '-2:word': word.lower(),
      '-2:word[-4:]': word[-4:],
      '-2:word[-3:]': word[-3:],
      '-2:word[-2:]': word[-2:],
      '-2:word[:4]': word[:4],
      '-2:word[:3]': word[:3],
      '-2:word[:2]': word[:2],
      '-2:len(word)': len(word),})
  if i < len(sent) - 2:
    word = sent[i + 2]
    features.update({
      '+2:word': word.lower(),
      '+2:word[-4:]': word[-4:],
      '+2:word[-3:]': word[-3:],
      '+2:word[-2:]': word[-2:],
      '+2:word[:4]': word[:4],
      '+2:word[:3]': word[:3],
      '+2:word[:2]': word[:2],
      '+2:len(word)': len(word),})
  return features

def sent2features(sent):
  sent = ['<num>' if word.isdigit() else word for word in sent]
  return [word2features(sent, i) for i in range(len(sent))]
In [5]:
with open('../data/atis.train.w-intent.iob') as f:
  for line in f:
    line = line.rstrip()
    text, slot_intent = line.split('\t')
    words = text.split()[1:-1]
    features = sent2features(words)
    pprint.pprint(features[4])
    break
{'+1:len(word)': 6,
 '+1:word': 'boston',
 '+1:word[-2:]': 'on',
 '+1:word[-3:]': 'ton',
 '+1:word[-4:]': 'ston',
 '+1:word[:2]': 'bo',
 '+1:word[:3]': 'bos',
 '+1:word[:4]': 'bost',
 '+2:len(word)': 2,
 '+2:word': 'at',
 '+2:word[-2:]': 'at',
 '+2:word[-3:]': 'at',
 '+2:word[-4:]': 'at',
 '+2:word[:2]': 'at',
 '+2:word[:3]': 'at',
 '+2:word[:4]': 'at',
 '-1:len(word)': 3,
 '-1:word': 'fly',
 '-1:word[-2:]': 'ly',
 '-1:word[-3:]': 'fly',
 '-1:word[-4:]': 'fly',
 '-1:word[:2]': 'fl',
 '-1:word[:3]': 'fly',
 '-1:word[:4]': 'fly',
 '-2:len(word)': 2,
 '-2:word': 'to',
 '-2:word[-2:]': 'to',
 '-2:word[-3:]': 'to',
 '-2:word[-4:]': 'to',
 '-2:word[:2]': 'to',
 '-2:word[:3]': 'to',
 '-2:word[:4]': 'to',
 'bias': 1.0,
 'len(word)': 4,
 'word': 'from',
 'word[-2:]': 'om',
 'word[-3:]': 'rom',
 'word[-4:]': 'from',
 'word[:2]': 'fr',
 'word[:3]': 'fro',
 'word[:4]': 'from'}
In [6]:
trainer = pycrfsuite.Trainer(verbose=True)

with open('../data/atis.train.w-intent.iob') as f:
  for line in f:
    line = line.rstrip()
    text, slot_intent = line.split('\t')
    words = text.split()[1:-1]
    slot_intent = slot_intent.split()
    slots, intent = slot_intent[1:-1], slot_intent[-1]
    assert len(words) == len(slots)
    features = sent2features(words)
    trainer.append(features, slots)

trainer.set_params({
  'c1': .1,
  'c2': .1,
  'max_iterations': 100,
})

trainer.train('../model/atis.crfsuite')
Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 65662
Seconds required: 0.370

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 185736.274026
Feature norm: 1.000000
Error norm: 206433.745096
Active features: 64570
Line search trials: 1
Line search step: 0.000003
Seconds required for this iteration: 5.173

***** Iteration #2 *****
Loss: 131171.456091
Feature norm: 0.756467
Error norm: 130050.066778
Active features: 60480
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.635

***** Iteration #3 *****
Loss: 115573.176042
Feature norm: 0.730804
Error norm: 110837.007988
Active features: 61325
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 5.210

***** Iteration #4 *****
Loss: 108136.482384
Feature norm: 0.829847
Error norm: 40289.772247
Active features: 64120
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.610

***** Iteration #5 *****
Loss: 103309.590027
Feature norm: 0.988946
Error norm: 29272.672526
Active features: 64092
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.620

***** Iteration #6 *****
Loss: 96558.821468
Feature norm: 1.168982
Error norm: 32512.119676
Active features: 62953
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.612

***** Iteration #7 *****
Loss: 91313.305376
Feature norm: 1.863310
Error norm: 110504.612338
Active features: 63060
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.605

***** Iteration #8 *****
Loss: 81943.667002
Feature norm: 1.872680
Error norm: 19766.760647
Active features: 64561
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.622

***** Iteration #9 *****
Loss: 79032.796371
Feature norm: 2.039394
Error norm: 17708.893633
Active features: 64925
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.638

***** Iteration #10 *****
Loss: 65545.427850
Feature norm: 3.533881
Error norm: 28690.055740
Active features: 62740
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.615

***** Iteration #11 *****
Loss: 61665.953069
Feature norm: 4.756162
Error norm: 70975.481034
Active features: 63378
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.636

***** Iteration #12 *****
Loss: 54059.286749
Feature norm: 5.277837
Error norm: 18958.842484
Active features: 64257
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.599

***** Iteration #13 *****
Loss: 51501.170147
Feature norm: 5.872721
Error norm: 33379.601844
Active features: 64345
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.615

***** Iteration #14 *****
Loss: 47963.720930
Feature norm: 6.374554
Error norm: 11531.366907
Active features: 64688
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.620

***** Iteration #15 *****
Loss: 42980.330826
Feature norm: 7.709764
Error norm: 23148.705513
Active features: 64397
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.658

***** Iteration #16 *****
Loss: 37293.275665
Feature norm: 9.165817
Error norm: 14935.760171
Active features: 64666
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.676

***** Iteration #17 *****
Loss: 31636.405637
Feature norm: 11.514677
Error norm: 9934.798082
Active features: 64782
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.677

***** Iteration #18 *****
Loss: 27353.592054
Feature norm: 13.661067
Error norm: 5971.136100
Active features: 64525
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.676

***** Iteration #19 *****
Loss: 23450.606307
Feature norm: 16.067221
Error norm: 4863.426692
Active features: 64175
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.675

***** Iteration #20 *****
Loss: 19776.519818
Feature norm: 19.769283
Error norm: 12264.772008
Active features: 62416
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.683

***** Iteration #21 *****
Loss: 16837.487284
Feature norm: 21.605196
Error norm: 3437.941596
Active features: 62775
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.659

***** Iteration #22 *****
Loss: 14978.290367
Feature norm: 23.670825
Error norm: 2730.032026
Active features: 62858
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.683

***** Iteration #23 *****
Loss: 13498.334030
Feature norm: 27.332743
Error norm: 12657.068676
Active features: 62182
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 5.247

***** Iteration #24 *****
Loss: 11411.588720
Feature norm: 30.807147
Error norm: 6978.134639
Active features: 62496
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.607

***** Iteration #25 *****
Loss: 10614.382555
Feature norm: 31.660017
Error norm: 2605.342162
Active features: 62778
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.641

***** Iteration #26 *****
Loss: 9372.200089
Feature norm: 34.172812
Error norm: 3636.627237
Active features: 62281
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.607

***** Iteration #27 *****
Loss: 8178.624312
Feature norm: 36.599230
Error norm: 1251.806030
Active features: 61799
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.620

***** Iteration #28 *****
Loss: 7068.828489
Feature norm: 39.715568
Error norm: 1969.402512
Active features: 61295
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.630

***** Iteration #29 *****
Loss: 6936.098419
Feature norm: 41.901050
Error norm: 5963.394053
Active features: 60870
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.626

***** Iteration #30 *****
Loss: 5885.115197
Feature norm: 42.676395
Error norm: 1043.731880
Active features: 61046
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.635

***** Iteration #31 *****
Loss: 5486.718045
Feature norm: 44.009875
Error norm: 871.664327
Active features: 60754
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.619

***** Iteration #32 *****
Loss: 4936.610615
Feature norm: 47.847836
Error norm: 3572.524107
Active features: 59158
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.624

***** Iteration #33 *****
Loss: 4479.542904
Feature norm: 49.359737
Error norm: 2010.070665
Active features: 59378
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.636

***** Iteration #34 *****
Loss: 4217.497522
Feature norm: 50.352963
Error norm: 715.262798
Active features: 59380
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.611

***** Iteration #35 *****
Loss: 3894.015859
Feature norm: 51.922772
Error norm: 1002.128128
Active features: 58527
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.622

***** Iteration #36 *****
Loss: 3529.674701
Feature norm: 54.298559
Error norm: 1655.420253
Active features: 54996
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.608

***** Iteration #37 *****
Loss: 3234.986831
Feature norm: 55.552233
Error norm: 1047.888161
Active features: 54482
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.641

***** Iteration #38 *****
Loss: 3022.732574
Feature norm: 56.564263
Error norm: 646.172215
Active features: 53658
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.631

***** Iteration #39 *****
Loss: 2817.228259
Feature norm: 57.817378
Error norm: 782.562383
Active features: 52795
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.628

***** Iteration #40 *****
Loss: 2614.659135
Feature norm: 58.583852
Error norm: 696.991731
Active features: 52066
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.630

***** Iteration #41 *****
Loss: 2471.699193
Feature norm: 59.183941
Error norm: 472.816242
Active features: 51166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.627

***** Iteration #42 *****
Loss: 2330.916458
Feature norm: 60.079306
Error norm: 467.467855
Active features: 50291
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.622

***** Iteration #43 *****
Loss: 2235.267485
Feature norm: 60.565976
Error norm: 626.291451
Active features: 49645
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.606

***** Iteration #44 *****
Loss: 2172.045869
Feature norm: 61.199016
Error norm: 860.155041
Active features: 48652
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.626

***** Iteration #45 *****
Loss: 2113.872824
Feature norm: 61.413120
Error norm: 277.332926
Active features: 48231
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.631

***** Iteration #46 *****
Loss: 2068.356451
Feature norm: 61.639075
Error norm: 157.454327
Active features: 47284
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.616

***** Iteration #47 *****
Loss: 2020.275625
Feature norm: 62.206709
Error norm: 564.886046
Active features: 45709
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.611

***** Iteration #48 *****
Loss: 1977.840978
Feature norm: 62.455233
Error norm: 412.962357
Active features: 44453
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.623

***** Iteration #49 *****
Loss: 1954.779390
Feature norm: 62.647485
Error norm: 176.817114
Active features: 43629
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.634

***** Iteration #50 *****
Loss: 1924.383348
Feature norm: 63.010855
Error norm: 190.234647
Active features: 42561
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.614

***** Iteration #51 *****
Loss: 1905.593527
Feature norm: 63.149261
Error norm: 158.652723
Active features: 42013
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.623

***** Iteration #52 *****
Loss: 1890.624404
Feature norm: 63.394740
Error norm: 549.980041
Active features: 40618
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.615

***** Iteration #53 *****
Loss: 1872.227020
Feature norm: 63.644931
Error norm: 210.487072
Active features: 40343
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.623

***** Iteration #54 *****
Loss: 1860.517539
Feature norm: 63.746226
Error norm: 83.610337
Active features: 39965
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.630

***** Iteration #55 *****
Loss: 1842.308939
Feature norm: 64.008030
Error norm: 143.038212
Active features: 39060
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.629

***** Iteration #56 *****
Loss: 1834.245416
Feature norm: 64.160519
Error norm: 359.008008
Active features: 38606
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 5.237

***** Iteration #57 *****
Loss: 1822.182127
Feature norm: 64.300127
Error norm: 152.837942
Active features: 38207
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.644

***** Iteration #58 *****
Loss: 1812.443617
Feature norm: 64.326579
Error norm: 130.211799
Active features: 37580
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.607

***** Iteration #59 *****
Loss: 1802.253347
Feature norm: 64.454553
Error norm: 169.714384
Active features: 37006
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.607

***** Iteration #60 *****
Loss: 1793.773289
Feature norm: 64.457917
Error norm: 172.194560
Active features: 36383
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.613

***** Iteration #61 *****
Loss: 1785.516927
Feature norm: 64.531971
Error norm: 101.333736
Active features: 36015
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.616

***** Iteration #62 *****
Loss: 1776.883679
Feature norm: 64.518177
Error norm: 126.496353
Active features: 35544
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.630

***** Iteration #63 *****
Loss: 1768.396253
Feature norm: 64.582392
Error norm: 143.631108
Active features: 34763
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.621

***** Iteration #64 *****
Loss: 1761.922584
Feature norm: 64.559302
Error norm: 137.920913
Active features: 34409
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.613

***** Iteration #65 *****
Loss: 1756.264576
Feature norm: 64.562353
Error norm: 126.259016
Active features: 34126
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.614

***** Iteration #66 *****
Loss: 1749.896613
Feature norm: 64.504392
Error norm: 106.563555
Active features: 33635
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.607

***** Iteration #67 *****
Loss: 1744.584120
Feature norm: 64.488889
Error norm: 167.452059
Active features: 33258
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.627

***** Iteration #68 *****
Loss: 1740.233615
Feature norm: 64.449542
Error norm: 165.132579
Active features: 32911
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.618

***** Iteration #69 *****
Loss: 1735.829250
Feature norm: 64.454792
Error norm: 114.448544
Active features: 32680
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.638

***** Iteration #70 *****
Loss: 1731.657796
Feature norm: 64.442910
Error norm: 117.505592
Active features: 32346
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.623

***** Iteration #71 *****
Loss: 1727.443045
Feature norm: 64.435281
Error norm: 133.227032
Active features: 31958
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.612

***** Iteration #72 *****
Loss: 1724.511824
Feature norm: 64.420806
Error norm: 178.652208
Active features: 31614
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.623

***** Iteration #73 *****
Loss: 1721.304449
Feature norm: 64.457984
Error norm: 131.009293
Active features: 31598
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.635

***** Iteration #74 *****
Loss: 1718.527980
Feature norm: 64.452546
Error norm: 74.251851
Active features: 31432
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.608

***** Iteration #75 *****
Loss: 1715.814569
Feature norm: 64.454188
Error norm: 79.972179
Active features: 31229
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.626

***** Iteration #76 *****
Loss: 1712.892105
Feature norm: 64.442983
Error norm: 154.141611
Active features: 30935
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.615

***** Iteration #77 *****
Loss: 1710.962699
Feature norm: 64.484770
Error norm: 192.807065
Active features: 30889
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.720

***** Iteration #78 *****
Loss: 1707.967984
Feature norm: 64.467845
Error norm: 71.543618
Active features: 30779
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.679

***** Iteration #79 *****
Loss: 1705.976561
Feature norm: 64.487253
Error norm: 57.935540
Active features: 30770
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.632

***** Iteration #80 *****
Loss: 1703.606772
Feature norm: 64.477342
Error norm: 109.514030
Active features: 30469
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.625

***** Iteration #81 *****
Loss: 1702.380571
Feature norm: 64.502957
Error norm: 212.484757
Active features: 30248
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.641

***** Iteration #82 *****
Loss: 1699.477500
Feature norm: 64.499846
Error norm: 81.049846
Active features: 30284
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.618

***** Iteration #83 *****
Loss: 1698.000305
Feature norm: 64.505433
Error norm: 81.690540
Active features: 30168
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.617

***** Iteration #84 *****
Loss: 1696.441907
Feature norm: 64.494066
Error norm: 69.622809
Active features: 30057
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.615

***** Iteration #85 *****
Loss: 1695.133272
Feature norm: 64.499473
Error norm: 131.636793
Active features: 29913
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.659

***** Iteration #86 *****
Loss: 1693.587710
Feature norm: 64.477501
Error norm: 105.102776
Active features: 29734
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.626

***** Iteration #87 *****
Loss: 1692.320854
Feature norm: 64.479593
Error norm: 96.178583
Active features: 29765
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.613

***** Iteration #88 *****
Loss: 1690.982476
Feature norm: 64.477127
Error norm: 62.722836
Active features: 29692
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.606

***** Iteration #89 *****
Loss: 1689.714269
Feature norm: 64.469734
Error norm: 74.287571
Active features: 29611
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.639

***** Iteration #90 *****
Loss: 1688.455642
Feature norm: 64.455096
Error norm: 107.178134
Active features: 29457
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.628

***** Iteration #91 *****
Loss: 1687.504963
Feature norm: 64.448489
Error norm: 135.785170
Active features: 29402
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.628

***** Iteration #92 *****
Loss: 1685.990767
Feature norm: 64.453799
Error norm: 79.243168
Active features: 29307
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.643

***** Iteration #93 *****
Loss: 1684.972958
Feature norm: 64.452554
Error norm: 72.752670
Active features: 29278
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.628

***** Iteration #94 *****
Loss: 1683.929792
Feature norm: 64.450216
Error norm: 57.800750
Active features: 29207
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.626

***** Iteration #95 *****
Loss: 1682.801877
Feature norm: 64.451794
Error norm: 88.713007
Active features: 29097
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.619

***** Iteration #96 *****
Loss: 1682.041215
Feature norm: 64.452782
Error norm: 140.091684
Active features: 28878
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.628

***** Iteration #97 *****
Loss: 1681.081273
Feature norm: 64.462959
Error norm: 132.808391
Active features: 28851
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.605

***** Iteration #98 *****
Loss: 1679.805638
Feature norm: 64.460346
Error norm: 71.776996
Active features: 28800
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.634

***** Iteration #99 *****
Loss: 1679.021402
Feature norm: 64.461403
Error norm: 62.074582
Active features: 28764
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.616

***** Iteration #100 *****
Loss: 1678.243239
Feature norm: 64.454918
Error norm: 75.541765
Active features: 28635
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 2.602

L-BFGS terminated with the maximum number of iterations
Total seconds required for training: 273.175

Storing the model
Number of active features: 28635 (65662)
Number of active attributes: 7876 (15166)
Number of active labels: 121 (121)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 0.018

In [7]:
tagger = pycrfsuite.Tagger()
tagger.open('../model/atis.crfsuite')

with open('../data/atis.test.w-intent.iob') as f:
  for line in f:
    line = line.rstrip()
    text, slot_intent = line.split('\t')
    words = text.split()[1:-1]
    slot_intent = slot_intent.split()
    slots, intent = slot_intent[1:-1], slot_intent[-1]
    assert len(words) == len(slots)
    print('expected:', slots)
    print('predicted:', tagger.tag(sent2features(words)))
    break
expected: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name', 'O', 'O', 'O', 'O', 'O', 'B-stoploc.city_name', 'I-stoploc.city_name']
predicted: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name', 'O', 'O', 'O', 'O', 'O', 'B-stoploc.city_name', 'I-stoploc.city_name']
In [8]:
slot_true = []
slot_pred = []

with open('../data/atis.test.w-intent.iob') as f:
  for line in f:
    line = line.rstrip()
    text, slot_intent = line.split('\t')
    words = text.split()[1:-1]
    slot_intent = slot_intent.split()
    slots, intent = slot_intent[1:-1], slot_intent[-1]
    assert len(words) == len(slots)
    slot_pred += tagger.tag(sent2features(words))
    slot_true += slots

mask = [0 if s == 'O' else 1 for s in slot_true]

f1_slots = f1_score(y_true = slot_true,
                    y_pred = slot_pred,
                    sample_weight = mask,
                    average = 'micro',)

print('\n'+classification_report(y_true = slot_true,
                                 y_pred = slot_pred,
                                 sample_weight = mask,
                                 digits = 3))

print('micro avg: {:.3f}'.format(f1_slots))
                              precision    recall  f1-score   support

             B-aircraft_code      1.000     0.515     0.680      33.0
              B-airline_code      0.971     0.971     0.971      34.0
              B-airline_name      1.000     0.980     0.990     101.0
              B-airport_code      0.800     0.444     0.571       9.0
              B-airport_name      0.778     0.333     0.467      21.0
 B-arrive_date.date_relative      0.500     0.500     0.500       2.0
      B-arrive_date.day_name      0.417     0.455     0.435      11.0
    B-arrive_date.day_number      0.500     0.167     0.250       6.0
    B-arrive_date.month_name      0.500     0.167     0.250       6.0
      B-arrive_time.end_time      0.700     0.875     0.778       8.0
 B-arrive_time.period_of_day      0.000     0.000     0.000       6.0
    B-arrive_time.start_time      0.750     0.750     0.750       8.0
          B-arrive_time.time      0.875     0.824     0.848      34.0
 B-arrive_time.time_relative      0.931     0.871     0.900      31.0
             B-booking_class      0.000     0.000     0.000       1.0
                 B-city_name      0.875     0.491     0.629      57.0
                B-class_type      0.960     1.000     0.980      24.0
               B-compartment      0.000     0.000     0.000       1.0
                   B-connect      1.000     1.000     1.000       6.0
             B-cost_relative      1.000     0.973     0.986      37.0
                  B-day_name      0.000     0.000     0.000       2.0
                 B-days_code      0.000     0.000     0.000       1.0
 B-depart_date.date_relative      0.944     1.000     0.971      17.0
      B-depart_date.day_name      0.958     0.972     0.965     212.0
    B-depart_date.day_number      0.915     0.982     0.947      55.0
    B-depart_date.month_name      0.915     0.964     0.939      56.0
B-depart_date.today_relative      1.000     1.000     1.000       9.0
          B-depart_date.year      1.000     1.000     1.000       3.0
      B-depart_time.end_time      0.000     0.000     0.000       3.0
    B-depart_time.period_mod      0.833     1.000     0.909       5.0
 B-depart_time.period_of_day      0.930     0.915     0.922     130.0
    B-depart_time.start_time      0.500     0.333     0.400       3.0
          B-depart_time.time      0.791     0.930     0.855      57.0
 B-depart_time.time_relative      0.923     0.923     0.923      65.0
                   B-economy      1.000     1.000     1.000       6.0
               B-fare_amount      1.000     1.000     1.000       2.0
           B-fare_basis_code      0.882     0.882     0.882      17.0
                    B-flight      0.000     0.000     0.000       1.0
               B-flight_days      1.000     1.000     1.000      10.0
                B-flight_mod      1.000     1.000     1.000      24.0
             B-flight_number      0.917     1.000     0.957      11.0
               B-flight_stop      1.000     1.000     1.000      21.0
               B-flight_time      1.000     1.000     1.000       1.0
      B-fromloc.airport_code      0.833     1.000     0.909       5.0
      B-fromloc.airport_name      0.478     0.917     0.629      12.0
         B-fromloc.city_name      0.979     0.997     0.988     704.0
        B-fromloc.state_code      1.000     1.000     1.000      23.0
        B-fromloc.state_name      1.000     0.765     0.867      17.0
                      B-meal      1.000     1.000     1.000      16.0
                 B-meal_code      0.000     0.000     0.000       1.0
          B-meal_description      1.000     0.700     0.824      10.0
                       B-mod      0.000     0.000     0.000       2.0
                        B-or      1.000     1.000     1.000       3.0
             B-period_of_day      0.000     0.000     0.000       4.0
          B-restriction_code      1.000     1.000     1.000       4.0
 B-return_date.date_relative      1.000     0.667     0.800       3.0
      B-return_date.day_name      0.000     0.000     0.000       2.0
                B-round_trip      1.000     0.986     0.993      73.0
                B-state_code      1.000     1.000     1.000       1.0
                B-state_name      0.000     0.000     0.000       9.0
      B-stoploc.airport_code      0.000     0.000     0.000       1.0
         B-stoploc.city_name      0.947     0.900     0.923      20.0
        B-toloc.airport_code      1.000     0.250     0.400       4.0
        B-toloc.airport_name      1.000     1.000     1.000       3.0
           B-toloc.city_name      0.962     0.990     0.976     716.0
        B-toloc.country_name      0.000     0.000     0.000       1.0
          B-toloc.state_code      1.000     1.000     1.000      18.0
          B-toloc.state_name      0.862     0.893     0.877      28.0
            B-transport_type      1.000     0.800     0.889      10.0
              I-airline_name      0.955     0.969     0.962      65.0
              I-airport_name      0.909     0.345     0.500      29.0
      I-arrive_time.end_time      0.700     0.875     0.778       8.0
    I-arrive_time.start_time      0.000     0.000     0.000       1.0
          I-arrive_time.time      0.879     0.829     0.853      35.0
 I-arrive_time.time_relative      1.000     1.000     1.000       4.0
                 I-city_name      0.818     0.300     0.439      30.0
                I-class_type      1.000     1.000     1.000      17.0
             I-cost_relative      1.000     0.667     0.800       3.0
    I-depart_date.day_number      1.000     1.000     1.000      15.0
      I-depart_time.end_time      0.000     0.000     0.000       3.0
 I-depart_time.period_of_day      1.000     1.000     1.000       1.0
    I-depart_time.start_time      0.500     1.000     0.667       1.0
          I-depart_time.time      0.889     0.923     0.906      52.0
 I-depart_time.time_relative      0.000     0.000     0.000       1.0
               I-fare_amount      1.000     1.000     1.000       2.0
                I-flight_mod      1.000     0.167     0.286       6.0
             I-flight_number      0.000     0.000     0.000       1.0
               I-flight_time      1.000     1.000     1.000       1.0
      I-fromloc.airport_name      0.441     1.000     0.612      15.0
         I-fromloc.city_name      0.927     1.000     0.962     177.0
        I-fromloc.state_name      1.000     1.000     1.000       1.0
          I-restriction_code      1.000     1.000     1.000       3.0
 I-return_date.date_relative      1.000     0.667     0.800       3.0
                I-round_trip      1.000     1.000     1.000      71.0
                I-state_name      0.000     0.000     0.000       1.0
         I-stoploc.city_name      1.000     0.800     0.889      10.0
        I-toloc.airport_name      1.000     1.000     1.000       3.0
           I-toloc.city_name      0.952     0.974     0.963     265.0
          I-toloc.state_name      1.000     1.000     1.000       1.0
            I-transport_type      0.000     0.000     0.000       1.0
                           O      0.000     0.000     0.000       0.0

                    accuracy                          0.926    3663.0
                   macro avg      0.714     0.669     0.675    3663.0
                weighted avg      0.933     0.926     0.923    3663.0

micro avg: 0.926
/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))