import pyvw

# Part-of-speech tag ids used as vw action labels (search actions are 1-based).
DET  = 1
NOUN = 2
VERB = 3
ADJ  = 4

# Toy POS-tagging corpus: each sentence is a list of (tag, word) pairs.
my_dataset = [[(DET,  'the'), (NOUN, 'monster'), (VERB, 'ate'),
               (DET,  'a'), (ADJ,  'big'), (NOUN, 'sandwich')],
              [(DET,  'the'), (NOUN, 'sandwich'), (VERB, 'was'), (ADJ, 'tasty')],
              [(NOUN, 'it'), (VERB, 'ate'), (NOUN, 'it'), (ADJ, 'all')]]
print(my_dataset[2])


class SequenceLabeler(pyvw.SearchTask):
    """Left-to-right sequence labeler built on vw's learning-to-search hook task."""

    def __init__(self, vw, sch, num_actions):
        # you must must must initialize the parent class
        # this will automatically store self.sch <- sch, self.vw <- vw
        pyvw.SearchTask.__init__(self, vw, sch, num_actions)
        # AUTO_HAMMING_LOSS: the framework accumulates per-position loss for us;
        # AUTO_CONDITION_FEATURES: the framework derives features from the
        # conditioned-on previous predictions
        sch.set_options(sch.AUTO_HAMMING_LOSS | sch.AUTO_CONDITION_FEATURES)

    def _run(self, sentence):
        # it's called _run to remind you that you shouldn't call it directly!
        output = []
        for n in range(len(sentence)):
            pos, word = sentence[n]
            # use "with...as..." to guarantee that the example is finished properly
            with self.vw.example({'w': [word]}) as ex:
                pred = self.sch.predict(examples=ex,
                                        my_tag=n + 1,
                                        oracle=pos,
                                        condition=[(n, 'p'), (n - 1, 'q')])
                output.append(pred)
        return output


vw = pyvw.vw("--search 4 --audit --quiet --search_task hook --ring_size 1024")
sequenceLabeler = vw.init_search_task(SequenceLabeler)
for i in range(10):  # ten passes over the training data
    sequenceLabeler.learn(my_dataset)

# tags in a test example are ignored at prediction time, so any value (0) works
test_example = [(0, w) for w in "the sandwich ate a monster".split()]
print(test_example)
out = sequenceLabeler.predict(test_example)
print(out)


class SequenceLabeler2(pyvw.SearchTask):
    """Same tagger, but reports Hamming loss manually and feeds the previous
    prediction in as an explicit feature instead of using AUTO_HAMMING_LOSS."""

    def __init__(self, vw, sch, num_actions):
        pyvw.SearchTask.__init__(self, vw, sch, num_actions)

    def _run(self, sentence):
        output = []
        loss = 0.
        for n in range(len(sentence)):
            pos, word = sentence[n]
            prevPred = output[n - 1] if n > 0 else ''
            with self.vw.example({'w': [word], 'p': [prevPred]}) as ex:
                pred = self.sch.predict(examples=ex,
                                        my_tag=n + 1,
                                        oracle=pos,
                                        condition=(n, 'p'))
                output.append(pred)
                if pred != pos:
                    loss += 1.
        self.sch.loss(loss)  # hand the accumulated Hamming loss to the framework
        return output


sequenceLabeler2 = vw.init_search_task(SequenceLabeler2)
sequenceLabeler2.learn(my_dataset)
print(sequenceLabeler2.predict(
    [(0, w) for w in "the sandwich ate a monster".split()]))

# the label for each word is its parent, or -1 for root
my_dataset = [[("the", 1),        # 0
               ("monster", 2),    # 1
               ("ate", -1),       # 2
               ("a", 5),          # 3
               ("big", 5),        # 4
               ("sandwich", 2)],  # 5
              [("the", 1),        # 0
               ("sandwich", 2),   # 1
               ("is", -1),        # 2
               ("tasty", 2)],     # 3
              [("a", 1),          # 0
               ("sandwich", 2),   # 1
               ("ate", -1),       # 2
               ("itself", 2)]]    # 3


class CovingtonDepParser(pyvw.SearchTask):
    """Covington-style dependency parser: for every word n, ask of each
    candidate m (including the artificial root, m == -1) whether m is n's
    parent; prediction 2 means yes, 1 means no."""

    def __init__(self, vw, sch, num_actions):
        pyvw.SearchTask.__init__(self, vw, sch, num_actions)
        sch.set_options(sch.AUTO_HAMMING_LOSS | sch.AUTO_CONDITION_FEATURES)

    def _run(self, sentence):
        N = len(sentence)
        # initialize our output so everything is a root
        output = [-1 for i in range(N)]
        for n in range(N):
            wordN, parN = sentence[n]
            for m in range(-1, N):  # m == -1 stands for the artificial root
                if m == n:
                    continue
                # bug fix: this previously tested `m > 0`, which wrongly mapped
                # the first word (m == 0) to "*root*"; the LDF variant below
                # uses the correct `m >= 0`
                wordM = sentence[m][0] if m >= 0 else "*root*"
                # ask the question: is m the parent of n?
                isParent = 2 if m == parN else 1
                # construct an example
                direction = 'l' if m < n else 'r'
                # NOTE(review): namespace 'b' pairs wordM with
                # direction + '_' + wordN; possibly wordM was intended in the
                # second slot -- kept as in the original
                with self.vw.example(
                        {'a': [wordN, direction + '_' + wordN],
                         'b': [wordM, direction + '_' + wordN],
                         'p': [wordN + '_' + wordM,
                               direction + '_' + wordN + '_' + wordM],
                         'd': [str(m - n <= d) + '<=' + str(d)
                               for d in [-8, -4, -2, -1, 1, 2, 4, 8]] +
                              [str(m - n >= d) + '>=' + str(d)
                               for d in [-8, -4, -2, -1, 1, 2, 4, 8]]
                         }) as ex:
                    pred = self.sch.predict(
                        examples=ex,
                        my_tag=(m + 1) * N + n + 1,
                        oracle=isParent,
                        condition=[(max(0, (m) * N + n + 1), 'p'),
                                   (max(0, (m + 1) * N + n), 'q')])
                    if pred == 2:
                        output[n] = m
                        break
        return output


vw = pyvw.vw("--search 2 --quiet --search_task hook --ring_size 1024")
task = vw.init_search_task(CovingtonDepParser)
for p in range(10):  # do ten passes over the training data
    task.learn(my_dataset)
print('testing')
print(task.predict([(w, -1) for w in "the monster ate a sandwich".split()]))
print('should have printed [ 1 2 -1 4 2 ]')


class CovingtonDepParserLDF(pyvw.SearchTask):
    """LDF (label-dependent features) variant of the Covington parser: for
    each word n, build one cost-sensitive example per candidate parent and
    let search pick among them in a single prediction."""

    def __init__(self, vw, sch, num_actions):
        pyvw.SearchTask.__init__(self, vw, sch, num_actions)
        sch.set_options(sch.AUTO_HAMMING_LOSS | sch.IS_LDF |
                        sch.AUTO_CONDITION_FEATURES)

    def makeExample(self, sentence, n, m):
        """Build one cost-sensitive example for candidate parent m of word n
        (m == -1 stands for the artificial root)."""
        wordN = sentence[n][0]
        wordM = sentence[m][0] if m >= 0 else '*ROOT*'
        direction = 'l' if m < n else 'r'
        # NOTE(review): as above, 'b' reuses wordN where wordM may have been
        # intended -- kept as in the original
        ex = self.vw.example(
            {'a': [wordN, direction + '_' + wordN],
             'b': [wordM, direction + '_' + wordN],
             'p': [wordN + '_' + wordM,
                   direction + '_' + wordN + '_' + wordM],
             'd': [str(m - n <= d) + '<=' + str(d)
                   for d in [-8, -4, -2, -1, 1, 2, 4, 8]] +
                  [str(m - n >= d) + '>=' + str(d)
                   for d in [-8, -4, -2, -1, 1, 2, 4, 8]]
             },
            labelType=self.vw.lCostSensitive)
        ex.set_label_string(str(m + 2) + ":0")
        return ex

    def _run(self, sentence):
        N = len(sentence)
        # initialize our output so everything is a root
        output = [-1 for i in range(N)]
        for n in range(N):
            # make LDF examples for every candidate parent, root included
            examples = [self.makeExample(sentence, n, m)
                        for m in range(-1, N) if n != m]
            # truth: index of the gold parent in `examples` (root -1 sits at
            # index 0; candidates past n shift down because n == m is excluded)
            parN = sentence[n][1]
            oracle = parN + 1 if parN < n else parN
            # make a prediction
            pred = self.sch.predict(examples=examples,
                                    my_tag=n + 1,
                                    oracle=oracle,
                                    condition=[(n, 'p'), (n - 1, 'q')])
            # invert the index shift introduced by excluding n == m
            output[n] = pred - 1 if pred < n else pred
            for ex in examples:
                ex.finish()  # clean up
        return output


vw = pyvw.vw("--search 0 --csoaa_ldf m --search_task hook --ring_size 1024 --quiet")
task = vw.init_search_task(CovingtonDepParserLDF)
for p in range(2):  # do two passes over the training data
    task.learn(my_dataset)
print(task.predict([(w, -1) for w in "the monster ate a sandwich".split()]))

# each entry: (English words, gold French alignments per English word, French words)
my_dataset = [("the blue house".split(), ([0], [2], [1]), "la maison bleue".split()),
              ("the house".split(), ([0], [1]), "la maison".split()),
              ("the flower".split(), ([0], [1]), "la fleur".split())]


def alignmentError(true, sys):
    """Jaccard-style alignment error: 1 - |true & sys| / |true | sys|,
    defined as 0. when both alignments are empty."""
    t = set(true)
    s = set(sys)
    if len(t | s) == 0:
        return 0.
    return 1. - float(len(t & s)) / float(len(t | s))


class WordAligner(pyvw.SearchTask):
    """Phrase-based word aligner: each English word picks a (possibly empty)
    French span of length <= 3, and a French word is never covered twice."""

    def __init__(self, vw, sch, num_actions):
        pyvw.SearchTask.__init__(self, vw, sch, num_actions)
        sch.set_options(sch.AUTO_HAMMING_LOSS | sch.IS_LDF |
                        sch.AUTO_CONDITION_FEATURES)

    def makeExample(self, E, F, i, j0, l):
        """Cost-sensitive example for aligning E[i] to F[j0 .. j0+l]
        (the empty alignment when j0 is None)."""
        f = 'Null' if j0 is None else [F[j0 + k] for k in range(l + 1)]
        # NOTE(review): when f == 'Null', '_'.join(f) yields 'N_u_l_l';
        # possibly unintended, kept as in the original
        ex = self.vw.example(
            {'e': E[i],
             'f': f,
             'p': '_'.join(f),
             'l': str(l),
             'o': [str(i - j0), str(i - j0 - l)] if j0 is not None else []},
            labelType=self.vw.lCostSensitive)
        lab = 'Null' if j0 is None else str(j0 + l)
        ex.set_label_string(lab + ':0')
        return ex

    def _run(self, alignedSentence):
        E, A, F = alignedSentence
        # for each E word, we pick a F span
        covered = set()  # indices of F words covered so far
        output = []
        for i in range(len(E)):
            examples = []  # contains vw examples
            spans = []     # triples (alignment error, index in examples, [span])
            # empty span:
            examples.append(self.makeExample(E, F, i, None, None))
            spans.append((alignmentError(A[i], []), 0, []))
            # non-empty spans
            for j0 in range(len(F)):
                for l in range(3):  # max phrase length of 3
                    if j0 + l >= len(F):
                        break
                    if j0 + l in covered:
                        break
                    idx = len(examples)
                    examples.append(self.makeExample(E, F, i, j0, l))
                    spans.append((alignmentError(A[i], list(range(j0, j0 + l + 1))),
                                  idx, list(range(j0, j0 + l + 1))))
            # oracle = every candidate tied for the lowest alignment error
            sortedSpans = sorted(spans)
            oracle = []
            for err, idx, _ in sortedSpans:
                if err > sortedSpans[0][0]:
                    break
                oracle.append(idx)
            pred = self.sch.predict(examples=examples,
                                    my_tag=i + 1,
                                    oracle=oracle,
                                    condition=[(i, 'p'), (i - 1, 'q')])
            for ex in examples:
                ex.finish()
            output.append(spans[pred][2])
            for j in spans[pred][2]:
                covered.add(j)
        return output


vw = pyvw.vw("--search 0 --csoaa_ldf m --search_task hook --ring_size 1024 --quiet -q ef -q ep")
task = vw.init_search_task(WordAligner)
for p in range(10):
    task.learn(my_dataset)
print(task.predict(
    ("the blue flower".split(), ([], [], []), "la fleur bleue".split())))