# load the meters
def load_yaml(filename):
import yaml
stream = file(filename)
return yaml.load(stream)
meter_file = 'settings/gh-meters.yaml'
meters_with_feet = load_yaml(meter_file)
meters_with_feet.keys()
['==-/-=-=/-==', '--=-/=-==/--=-/=-==', '-=-=/--==/-=-=/==-', '-=-=/--==/-=-=/--==-', '=-=/-===/=-=/-===-', '--==/--==/==', '--==/--==/==-', '=-==/-=-=/--=-', '=-==/-=-=/==', '=-=/-===-/=-=/-===', '=-==/=-==/=-=-', '==-/=-=-/-==-/=-=-', '==-/-=-=/-==-', '--==/-=-=/==-', '--==/--==/--==/==', '=-=/-===-/=-=/-===-', '--==/--==/--=-', '--==/-=-=/--=-', '==-/=-==//==-/=-==-', '==-/-===//==-/-===-', '==-/-==-/-==-/-==', '=-==/=-==/=-==/=-=-', '==-/-===-//==-/-===', '-=-=/--==/-=-=/==', '=--=/-=-=-//=--=/-=-=', '-===/-===/-===/-===-', '=--=/=-=-/=--=/=', '==-/=-==//==-/=-==', '=-==/=-==/=-==/=-=', '-==/-==/-==/-==', '=-==/-=-=/--=', '-=-=/--==/-=-=/--==', '=-==/--==/==-', '=-=/-===/=-=/-===', '--==/-=-=/--=', '=-==/--==/--==/--=', '=--=/-=-=-//=--=/-=-=-', '--==/--==/--==/--=-', '-===/-===/-==', '=--=/=-=-/=--=/=-', '--==/--==/--==/--=', '===/=-=/-==', '-==/-==/-==/-==-', '==-/=-==-//==-/=-==', '=-==/--==/--=', '--==/--==/--==/==-', '=--=/-=-=//=--=/-=-=', '--=-/=-==/--=-/=-==-', '==-/-===//==-/-===', '==-/-==-/-==-/-==-', '--==/-=-=/==', '=-==/--==/--==/==', '=-==/--==/==', '=-==/-=-=/==-', '==-/-===-//==-/-===-', '-===/-===/-==-', '==-/=-=-/-==-/=-=', '=--=/-=-=//=--=/-=-=-', '==-/=-==-//==-/=-==-', '===/=-=/-==-', '=-==/--==/--=-', '--==/--==/--=', '=-==/--==/--==/--=-', '-=-=/--==/-=-=/--=', '-===/-===/-===/-===', '-=-=/--==/-=-=/--=-', '=-==/=-==/=-=', '=-==/--==/--==/==-']
# grab bad combos and arrange by long-short combos into bad_types
#
# e.g. bad_types[('-','-')] = unacceptable combinations between two shorts
import csv
bad_combos_in = []
with open('settings/bad_combos.csv', 'rb') as csvfile:
reader = csv.reader(csvfile, delimiter=',', quotechar="'")
for row in reader:
assert len(row) == 2
bad_combos_in.append(tuple(row))
bad_combos = bad_combos_in
bad_types = {}
for x in (('-','-'),('-','='),('=','-'),('=','=')):
bad_types[x]=[]
def type_of_bad_combo(s):
assert s in ['l','s']
if s=='l':
return '='
elif s=='s':
return '-'
for bad in bad_combos:
a = bad[0][0]
b = bad[1][0]
assert a,b in ['l','s']
a_type = type_of_bad_combo(a)
b_type = type_of_bad_combo(b)
bad_types[(a_type,b_type)].append(bad)
# conver to tuple
for k in bad_types.keys():
bad_types[k] = tuple(bad_types[k])
for combo in bad_types.keys():
print combo,bad_types[combo]
bad_types
('=', '=') (('l_bcsc', 'l_v'), ('l_bsc', 'l_v'), ('l_bcsc', 'l_z'), ('l_csc', 'l_z'), ('l_csc', 'l_v')) ('=', '-') (('l_bcsc', 's_v<b>'), ('l_bsc', 's_v<b>'), ('l_bcsc', 's_z'), ('l_csc', 's_z'), ('l_csc', 's_v<b>')) ('-', '-') (('s_c', 's_v<b>'), ('s_c', 's_c<h+wb>'), ('s_c', 's_z'), ('s_bcs', 's_c'), ('s_bcs', 's_c<h+wb>'), ('s_cs', 's_c<h+wb>'), ('s_bs', 's_c'), ('s_cs', 's_c')) ('-', '=') (('s_c', 'l_v'), ('s_c', 'l_v<ii+z>z'), ('s_c', 'l_z'))
{('-', '-'): (('s_c', 's_v<b>'), ('s_c', 's_c<h+wb>'), ('s_c', 's_z'), ('s_bcs', 's_c'), ('s_bcs', 's_c<h+wb>'), ('s_cs', 's_c<h+wb>'), ('s_bs', 's_c'), ('s_cs', 's_c')), ('-', '='): (('s_c', 'l_v'), ('s_c', 'l_v<ii+z>z'), ('s_c', 'l_z')), ('=', '-'): (('l_bcsc', 's_v<b>'), ('l_bsc', 's_v<b>'), ('l_bcsc', 's_z'), ('l_csc', 's_z'), ('l_csc', 's_v<b>')), ('=', '='): (('l_bcsc', 'l_v'), ('l_bsc', 'l_v'), ('l_bcsc', 'l_z'), ('l_csc', 'l_z'), ('l_csc', 'l_v'))}
# create digraph of meters (DG) using networkx
# each node has a type (=,-, and 0 [for the first one])
# the node at the end of each meter has a meter_type and meter_full_description attribute
import networkx as nx
DG=nx.DiGraph()
DG.add_node(0,type='0') # this is the start
for meter in meters_with_feet:
meter_type = meters_with_feet[meter]
meter_full_description = meter
meter = meter.replace('/','') # ignore feet denominator for now
node_id = 0
curr_node = 0
for i,c in enumerate(meter):
found_it = False
for n in DG.successors(curr_node):
node = DG.node[n]
if node['type']==c:
curr_node = n
found_it = True
break
if found_it==False:
new_node = len(DG.nodes())
DG.add_node(new_node, type=c)
DG.add_edge(curr_node,new_node)
# now add restraints to edge, this is a 'bad_combo' attribute
# print DG.node[curr_node]['type']
old_c = DG.node[curr_node]['type']
if (old_c,c) in bad_types:
DG[curr_node][new_node]['bad_combos'] = bad_types[(old_c, c)]
# print 'yes found ',old_c,c, bad_types[(old_c,c)]
curr_node = new_node
if i==len(meter)-1: # store some metrical data at the last node
DG.node[curr_node]['meter_type'] = meter_type
DG.node[curr_node]['meter_full_description'] = meter_full_description
# test to make sure all meters match
# note: there are multiple metrical matches with the same meter id
for test_meter in meters_with_feet.keys():
orig_meter = test_meter # contains feet
test_meter = test_meter.replace('/','')
node_id = 0
for ch in test_meter:
found=False
for s in DG.successors_iter(node_id):
if DG.node[s]['type']==ch:
found = True
#found=True
node_id = s
assert found==True
final_node = DG.node[node_id]
print final_node['meter_type'],final_node['meter_full_description']
assert final_node['meter_type']==meters_with_feet[orig_meter]
assert final_node['meter_full_description']==orig_meter
G19 ==-/-=-=/-== G6 --=-/=-==/--=-/=-== G9 -=-=/--==/-=-=/==- G16 -=-=/--==/-=-=/--==- G4 =-=/-===/=-=/-===- G11 --==/--==/== G11 --==/--==/==- G8 =-==/-=-=/--=- G8 =-==/-=-=/== G4 =-=/-===-/=-=/-=== G14 =-==/=-==/=-=- G3 ==-/=-=-/-==-/=-=- G19 ==-/-=-=/-==- G8 --==/-=-=/==- G5 --==/--==/--==/== G4 =-=/-===-/=-=/-===- G11 --==/--==/--=- G8 --==/-=-=/--=- G10 ==-/=-==//==-/=-==- G18 ==-/-===//==-/-===- G13 ==-/-==-/-==-/-== G1 =-==/=-==/=-==/=-=- G18 ==-/-===-//==-/-=== G9 -=-=/--==/-=-=/== G15 =--=/-=-=-//=--=/-=-= G2 -===/-===/-===/-===- G17 =--=/=-=-/=--=/= G10 ==-/=-==//==-/=-== G1 =-==/=-==/=-==/=-= G12 -==/-==/-==/-== G8 =-==/-=-=/--= G16 -=-=/--==/-=-=/--== G11 =-==/--==/==- G4 =-=/-===/=-=/-=== G8 --==/-=-=/--= G5 =-==/--==/--==/--= G15 =--=/-=-=-//=--=/-=-=- G5 --==/--==/--==/--=- G7 -===/-===/-== G17 =--=/=-=-/=--=/=- G5 --==/--==/--==/--= G19 ===/=-=/-== G12 -==/-==/-==/-==- G10 ==-/=-==-//==-/=-== G11 =-==/--==/--= G5 --==/--==/--==/==- G15 =--=/-=-=//=--=/-=-= G6 --=-/=-==/--=-/=-==- G18 ==-/-===//==-/-=== G13 ==-/-==-/-==-/-==- G8 --==/-=-=/== G5 =-==/--==/--==/== G11 =-==/--==/== G8 =-==/-=-=/==- G18 ==-/-===-//==-/-===- G7 -===/-===/-==- G3 ==-/=-=-/-==-/=-= G15 =--=/-=-=//=--=/-=-=- G10 ==-/=-==-//==-/=-==- G19 ===/=-=/-==- G11 =-==/--==/--=- G11 --==/--==/--= G5 =-==/--==/--==/--=- G9 -=-=/--==/-=-=/--= G2 -===/-===/-===/-=== G9 -=-=/--==/-=-=/--=- G14 =-==/=-==/=-= G5 =-==/--==/--==/==-
# pretty picture of the graph.
# note: Haven't figured out how to best display this. A moveable SVG would be nice.
def draw_meter_graph(g):
pos=nx.spring_layout(g)
import matplotlib.pyplot as plt
plt.figure(figsize=(15,15))
labels=dict((n,d['type']) for n,d in g.nodes(data=True)) # need to change labels for 0,1,etc.
nx.draw(g,labels=labels,node_color='#A0CBE2',node_size=200)
draw_meter_graph(DG) # T-Shirts?
#nx.write_graphml(DG, 'myxml.xml')#
# some data on how many nodes have "successors"
# shows how much recursion is being avoided
successor_count = {}
successor_count[0] = 0
successor_count[1] = 0
successor_count[2] = 0
for n in DG.nodes():
num_successors = len(DG.successors(n))
# print n, num_successors
assert (num_successors<3)
successor_count[num_successors] +=1
print "number of nodes with # of successors"
successor_count
number of nodes with # of successors
{0: 34, 1: 205, 2: 33}
# for now, loading in the old Scanner to get the different parsers
# note: turning the parsers into graphs will be killer, based on a character level
# note: will be adding constraints to the EDGES so that scanner won't descend if conditions are not met
from scanner import Scanner
scanner = Scanner()
# this does the basic scan of the input text and classifys as consonant(c),long vowel(v),etc
pp = scanner.pp # this does the basic scan of text
pp.parse(' shaan shaan')
'bcvcbcvc'
# this is the parser for the short syllables
sp = scanner.sp # parser for shorts (-)
sp.parse('c')
's_c'
# this is the parser for the long syllables
lp = scanner.lp # parser for longs (=)
lp.parse('cv')
'l_cv'
# this is the tokenizer, which is used in the graph_scan
# TODO: check the that the long parser tokenizer is exactly the same as the short tokenizer
lp.tokenize('cv')
['c', 'v']
sp.tokenize('cv') # need to check this is the same as lp
['c', 'v']
from graph_parser import GraphParser
new_sp = GraphParser('settings/short.yaml')
tkns = new_sp.tokenize('cv')
new_sp.match_first_at(tkns, 0)
passed edge rule
ParserRule(production='s_cv<b>', prev_class=None, prev_tokens=None, tokens=['c', 'v'], next_tokens=None, next_class='b')
new_lp = GraphParser('settings/long.yaml')
# just checking the lp vs. sp parser here. Can do this programmatically later.
s = ' naqsh faryaadii'
scan = pp.parse(s)
print scan
print 'long tokenizer: ',new_lp.tokenize(scan)
print 'short tokenizer:',new_sp.tokenize(scan)
bcsccbcsccvcv long tokenizer: ['b', 'c', 's', 'c', 'c', 'b', 'c', 's', 'c', 'c', 'v', 'c', 'v'] short tokenizer: ['b', 'c', 's', 'c', 'c', 'b', 'c', 's', 'c', 'c', 'v', 'c', 'v']
lp = new_lp
sp = new_sp
# this is the graph scan.
from collections import namedtuple
ScanResult = namedtuple('ScanResult',["scan","matches"]) # used for completed scans
NodeMatch = namedtuple('NodeMatch', ["node_type", # = or -
"matched_tokens", # tokens matched at node
"node_id", # id of node in graph
"orig_tokens", # original tokens that were matched
"found", # production of parser
"token_i"]) # used for matches at nodes in graph
def graph_scan(in_string):
completed_scans = [] # holds complete scans
#from collections import namedtuple
# this is the scan of the input string to bcv, etc.
scan = pp.parse(in_string)
pd = pp.parse_details # details on the matched tokens, rules, etc.
# this generates scan_tokens from the scan of the input string, e.g. ['b','c','v'], using the long parser (lp)
scan_tokens = lp.tokenize(scan)
# This is a check to see that the short and long parsers match
# TODO: remove later
import collections
assert collections.Counter(scan_tokens) == collections.Counter(lp.tokenize(scan))
# this function descends into node (node_id), passing current token_i, matches, and a string represent
def descend_node(node_id, token_i, matches, matched_so_far):
for successor_id in DG.successors(node_id):
node_type = DG.node[successor_id]['type']
assert node_type in ('=','-')
if node_type=='=':
parser = lp
else:
parser = sp
# for each match m at token_i of scan_tokens
# m contains ['tokens', 'start', 'rule_id', 'rule']
# m['rule'] contain ['tokens', 'production']
# TODO: declunkify.
for m in parser.match_all_at(scan_tokens, token_i):
#print 'found', m['tokens']
# next, check to make sure that this is not a bad combination
# do so by looking for constraints on the edge
# note: this could be added as a constraint to match_all_at() as not_starting_with ...
if len(matches)>0: # if already matched something
a = matches[-1].found # details of previous match
b = m.production#**['rule']['production'] # details of current match
if 'bad_combos' in DG[node_id][successor_id]: # if 'bad_combos' in the a,b's edge
if (a,b) in DG[node_id][successor_id]['bad_combos']: # if it's bad
# print 'aborting! found ',a,b
continue # abort! bad combination
# determine orig_tokens
# meaning, what is matched from original input and parsed to b,c,s, etc.
orig_tokens =[]
for i in range(token_i, token_i+len(m.tokens)):#['tokens'])):
orig_tokens.append(pd[i]['rule']['tokens']) ## parser details here
# this will break if 'rule' is None
# except TypeError:
# print 'error','i=',i
# print 'pd[]i]',pd[i]
# print 'error',m['tokens'], 'i',i
# rule','tokens',"\n",m
# advance token index based on length of match tokens
# generate match data
matched_tokens = m.tokens
match_data = NodeMatch(node_type=node_type,
matched_tokens = matched_tokens,
node_id=node_id,
orig_tokens=orig_tokens,
found=m.production,
token_i=token_i)
# advance token_i
new_token_i = token_i + len(matched_tokens)
matches.append(match_data)
so_far=matched_so_far + node_type
#print so_far, matched_tokens, orig_tokens
# if we're at the end
if new_token_i == len(scan_tokens) and 'meter_type' in DG.node[successor_id]:
#and 'meter_type' in DG.node[s]:# and len(DG.successors(s))==0:
#print 'made it!', successor_id, DG.successors(successor_id),DG.node[successor_id], so_far
completed_scans.append(ScanResult(scan=so_far, matches=matches))
#,"matches"]) # used for completed scans
# for x in matches:
# print x
match_node = successor_id
#print DG.node(match_node,data=True)#[match_node]
else:
descend_node(successor_id, new_token_i,matches,so_far)
# start descent into node 0 of the graph, at token_i 0, with no matches
descend_node(0, 0, [], '')
return completed_scans
print graph_scan(" naqsh faryaadii hai kis kii sho;xii-e ta;hriir kaa")#letaa huu;n maktab-e ;gam-e dil me;n sabaq hanuuz")#" naqsh faryaadii hai kis kii sho;xii-e ta;hriir kaa")#faa((ilaatun faa((ilaatun faa((ilaatun faa((ilun")#naqsh faryaadii hai kis kii sho;xii-e tahriir kaa")
passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule passed edge rule [ScanResult(scan='=-===-===-===-=', matches=[NodeMatch(node_type='=', matched_tokens=['b', 'c', 's', 'c'], node_id=0, orig_tokens=[' ', ['n'], ['a'], ['q']], found='l_bcsc', token_i=0), NodeMatch(node_type='-', matched_tokens=['c'], node_id=1, orig_tokens=[['sh']], found='s_c', token_i=4), NodeMatch(node_type='-', matched_tokens=['b', 'c', 's'], node_id=46, orig_tokens=[' ', ['f'], ['a']], found='s_bcs', token_i=5), NodeMatch(node_type='=', matched_tokens=['b', 'c', 's', 'c'], node_id=46, orig_tokens=[' ', ['f'], ['a'], ['r']], found='l_bcsc', token_i=5), NodeMatch(node_type='-', matched_tokens=['c'], node_id=47, orig_tokens=[['y']], found='s_c', token_i=9), NodeMatch(node_type='=', matched_tokens=['c', 'v'], node_id=47, orig_tokens=[['y'], ['aa']], found='l_cv', token_i=9), NodeMatch(node_type='-', matched_tokens=['c', 'v'], node_id=68, orig_tokens=[['d'], ['ii']], found='s_cv<b>', token_i=11), NodeMatch(node_type='-', matched_tokens=['b', 'c', 'v'], node_id=69, orig_tokens=[' ', ['h'], ['ai']], found='s_bcv<b>', token_i=13), NodeMatch(node_type='=', matched_tokens=['b', 'c', 's', 'c'], node_id=214, orig_tokens=[' ', ['k'], ['i'], ['s']], found='l_bcsc', token_i=16), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=215, orig_tokens=[' ', ['k'], ['ii']], found='l_bcv', token_i=20), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=216, orig_tokens=[' ', ['sh'], ['o']], found='l_bcv', token_i=23), NodeMatch(node_type='=', matched_tokens=['c', 'v<ii+z>'], node_id=217, orig_tokens=[[';x'], ['ii']], found='l_cv<ii+z>', token_i=26), NodeMatch(node_type='-', matched_tokens=['z'], node_id=218, orig_tokens=[['-e']], found='s_z', token_i=28), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=69, orig_tokens=[' ', ['h'], ['ai']], found='l_bcv', token_i=13), NodeMatch(node_type='-', matched_tokens=['b', 'c', 's'], node_id=70, orig_tokens=[' ', ['k'], ['i']], found='s_bcs', token_i=16), NodeMatch(node_type='-', matched_tokens=['c'], node_id=68, orig_tokens=[['d']], found='s_c', token_i=11), NodeMatch(node_type='=', matched_tokens=['c', 'v'], node_id=68, orig_tokens=[['d'], ['ii']], found='l_cv', token_i=11), NodeMatch(node_type='-', matched_tokens=['b', 'c', 'v'], node_id=87, orig_tokens=[' ', ['h'], ['ai']], found='s_bcv<b>', token_i=13), NodeMatch(node_type='=', matched_tokens=['b', 'c', 's', 'c'], node_id=88, orig_tokens=[' ', ['k'], ['i'], ['s']], found='l_bcsc', token_i=16), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=89, orig_tokens=[' ', ['k'], ['ii']], found='l_bcv', token_i=20), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=90, orig_tokens=[' ', ['sh'], ['o']], found='l_bcv', token_i=23), NodeMatch(node_type='-', matched_tokens=['c', 'v<ii+z>'], node_id=91, orig_tokens=[[';x'], ['ii']], found='s_cv<ii+z>', token_i=26), NodeMatch(node_type='=', matched_tokens=['z'], node_id=92, orig_tokens=[['-e']], found='l_z', token_i=28), NodeMatch(node_type='=', matched_tokens=['b', 'c', 's', 'c'], node_id=93, orig_tokens=[' ', ['t'], ['a'], [';h']], found='l_bcsc', token_i=29), NodeMatch(node_type='=', matched_tokens=['c', 'v'], node_id=153, orig_tokens=[['r'], ['ii']], found='l_cv', token_i=33), NodeMatch(node_type='-', matched_tokens=['c'], node_id=154, orig_tokens=[['r']], found='s_c', token_i=35), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=155, orig_tokens=[' ', ['k'], ['aa']], found='l_bcv', token_i=36), NodeMatch(node_type='-', matched_tokens=['b', 'c', 's'], node_id=93, orig_tokens=[' ', ['t'], ['a']], found='s_bcs', token_i=29), NodeMatch(node_type='-', matched_tokens=['c'], node_id=91, orig_tokens=[[';x']], found='s_c', token_i=26), NodeMatch(node_type='-', matched_tokens=['b', 'c', 's'], node_id=0, orig_tokens=[' ', ['n'], ['a']], found='s_bcs', token_i=0)])]
data = {}
import csv
verses = {}
multiple_matches = []
with open('data/verses.csv', 'rb') as csvfile:
versereader = csv.reader(csvfile, delimiter=',', quotechar='|')
count = 0
multiple_match_count = 0
for row in versereader:
(verse_id, input_string, real_scan) = row
data[verse_id] = {'input_string': input_string, 'real_scan': real_scan}
s= data['001.02.1']['input_string']
graph_scan(s)
[ScanResult(scan='=-===-===-===-=', matches=[NodeMatch(node_type='=', matched_tokens=['b', 'c', 's', 'c'], node_id=0, orig_tokens=[' ', ['.s'], ['u'], ['b']], found='l_bcsc', token_i=0), NodeMatch(node_type='-', matched_tokens=['c'], node_id=1, orig_tokens=[[';h']], found='s_c', token_i=4), NodeMatch(node_type='-', matched_tokens=['b', 'c', 's'], node_id=46, orig_tokens=[' ', ['k'], ['a']], found='s_bcs', token_i=5), NodeMatch(node_type='=', matched_tokens=['b', 'c', 's', 'c'], node_id=46, orig_tokens=[' ', ['k'], ['a'], ['r']], found='l_bcsc', token_i=5), NodeMatch(node_type='-', matched_tokens=['c', 'v'], node_id=47, orig_tokens=[['n'], ['aa']], found='s_cv<b>', token_i=9), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=48, orig_tokens=[' ', ['sh'], ['aa']], found='l_bcv', token_i=11), NodeMatch(node_type='-', matched_tokens=['c'], node_id=47, orig_tokens=[['n']], found='s_c', token_i=9), NodeMatch(node_type='=', matched_tokens=['c', 'v'], node_id=47, orig_tokens=[['n'], ['aa']], found='l_cv', token_i=9), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=68, orig_tokens=[' ', ['sh'], ['aa']], found='l_bcv', token_i=11), NodeMatch(node_type='-', matched_tokens=['c'], node_id=87, orig_tokens=[['m']], found='s_c', token_i=14), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=88, orig_tokens=[' ', ['k'], ['aa']], found='l_bcv', token_i=15), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=89, orig_tokens=[' ', ['l'], ['aa']], found='l_bcv', token_i=18), NodeMatch(node_type='=', matched_tokens=['c', 'v'], node_id=90, orig_tokens=[['n'], ['aa']], found='l_cv', token_i=21), NodeMatch(node_type='-', matched_tokens=['b', 'c', 'v'], node_id=91, orig_tokens=[' ', ['h'], ['ai']], found='s_bcv<b>', token_i=23), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=92, orig_tokens=[' ', ['j'], ['uu']], found='l_bcv', token_i=26), NodeMatch(node_type='=', matched_tokens=['z'], node_id=93, orig_tokens=[['-e']], found='l_z', token_i=29), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=153, orig_tokens=[' ', ['sh'], ['ii']], found='l_bcv', token_i=30), NodeMatch(node_type='-', matched_tokens=['c'], node_id=154, orig_tokens=[['r']], found='s_c', token_i=33), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=155, orig_tokens=[' ', ['k'], ['aa']], found='l_bcv', token_i=34), NodeMatch(node_type='-', matched_tokens=['z'], node_id=93, orig_tokens=[['-e']], found='s_z', token_i=29), NodeMatch(node_type='-', matched_tokens=['b', 'c', 's'], node_id=0, orig_tokens=[' ', ['.s'], ['u']], found='s_bcs', token_i=0)])]
#print data['004.10x.0']
#graph_scan(data['004.10x.0']['input_string'])
def graph_scan_all():
for k in sorted(data.keys()):
#print k
graph_scan(data[k]['input_string'])
def trad_scan_all():
for k in sorted(data.keys()):
scanner.scan(data[k]['input_string'], known_only = True)
import cProfile
cProfile.run('trad_scan_all()')
157263342 function calls (156164871 primitive calls) in 152.962 seconds Ordered by: standard name ncalls tottime percall cumtime percall filename:lineno(function) 1 0.024 0.024 152.962 152.962 <ipython-input-108-c2a5a03a3c0a>:7(trad_scan_all) 1 0.000 0.000 152.962 152.962 <string>:1(<module>) 3314 0.004 0.000 0.064 0.000 parser.py:154(tokenize) 3314 24.024 0.007 36.788 0.011 parser.py:157(parse) 15426462 5.292 0.000 5.292 0.000 parser.py:181(<genexpr>) 279914 51.097 0.000 82.610 0.000 parser.py:227(match_all_at) 39695932 14.124 0.000 14.124 0.000 parser.py:250(<genexpr>) 320319 0.378 0.000 1.374 0.000 re.py:134(match) 210724 0.251 0.000 28.198 0.000 re.py:139(search) 531043 1.274 0.000 24.059 0.000 re.py:228(_compile) 210724 0.249 0.000 28.446 0.000 scanner.py:43(meter_ok) 259310 0.585 0.000 0.585 0.000 scanner.py:46(bad_combo) 3314 2.820 0.001 152.936 0.046 scanner.py:66(scan) 401482/101899 3.146 0.000 4.418 0.000 sre_compile.py:32(_compile) 101899 1.328 0.000 4.974 0.000 sre_compile.py:361(_compile_info) 203798 0.165 0.000 0.197 0.000 sre_compile.py:474(isstring) 101899 0.268 0.000 9.676 0.000 sre_compile.py:480(_code) 101899 0.609 0.000 22.269 0.000 sre_compile.py:495(compile) 599166 0.296 0.000 0.375 0.000 sre_parse.py:126(__len__) 1198332 0.872 0.000 1.371 0.000 sre_parse.py:130(__getitem__) 1198708 0.665 0.000 0.970 0.000 sre_parse.py:138(append) 401482/101899 2.414 0.000 2.895 0.000 sre_parse.py:140(getwidth) 101899 0.116 0.000 0.322 0.000 sre_parse.py:178(__init__) 1803988 2.363 0.000 2.765 0.000 sre_parse.py:182(__next) 800926 0.454 0.000 1.177 0.000 sre_parse.py:195(match) 1302645 0.812 0.000 2.648 0.000 sre_parse.py:201(get) 99861 0.271 0.000 0.347 0.000 sre_parse.py:257(_escape) 201760/101899 1.667 0.000 10.631 0.000 sre_parse.py:301(_parse_sub) 301621/101899 2.886 0.000 10.277 0.000 sre_parse.py:379(_parse) 101899 0.526 0.000 11.702 0.000 sre_parse.py:663(parse) 101899 0.084 0.000 0.084 0.000 sre_parse.py:67(__init__) 401482 0.255 0.000 0.255 0.000 sre_parse.py:90(__init__) 101899 0.172 0.000 0.172 0.000 {_sre.compile} 23913376 10.187 0.000 23.674 0.000 {all} 1504029 0.599 0.000 0.599 0.000 {isinstance} 31885364/31685642 4.102 0.000 4.200 0.000 {len} 199722 0.066 0.000 0.066 0.000 {max} 5857358 1.115 0.000 1.115 0.000 {method 'append' of 'list' objects} 1019 0.047 0.000 0.047 0.000 {method 'clear' of 'dict' objects} 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} 4076 0.002 0.000 0.002 0.000 {method 'extend' of 'list' objects} 6628 0.128 0.000 0.128 0.000 {method 'findall' of '_sre.SRE_Pattern' objects} 730765 0.311 0.000 0.311 0.000 {method 'get' of 'dict' objects} 101899 0.028 0.000 0.028 0.000 {method 'items' of 'dict' objects} 1 0.000 0.000 0.000 0.000 {method 'keys' of 'dict' objects} 320319 0.294 0.000 0.294 0.000 {method 'match' of '_sre.SRE_Pattern' objects} 139957 0.072 0.000 0.072 0.000 {method 'pop' of 'list' objects} 210724 4.589 0.000 4.589 0.000 {method 'search' of '_sre.SRE_Pattern' objects} 1002686 0.415 0.000 0.415 0.000 {min} 899125 0.135 0.000 0.135 0.000 {ord} 23913376 11.379 0.000 11.379 0.000 {range} 1 0.002 0.002 0.002 0.002 {sorted}
cProfile.run('graph_scan_all()')
#t.timeit()
100477739 function calls (100337067 primitive calls) in 97.243 seconds Ordered by: standard name ncalls tottime percall cumtime percall filename:lineno(function) 3314 0.070 0.000 97.359 0.029 <ipython-input-105-ddef7d259286>:12(graph_scan) 143986/3314 2.551 0.000 59.205 0.018 <ipython-input-105-ddef7d259286>:32(descend_node) 1 0.008 0.008 97.369 97.369 <ipython-input-108-c2a5a03a3c0a>:3(graph_scan_all) 1 0.000 0.000 97.369 97.369 <string>:1(<module>) 4039 0.004 0.000 0.006 0.000 <string>:8(__new__) 13256 0.017 0.000 0.017 0.000 _weakrefset.py:68(__contains__) 6628 0.027 0.000 0.047 0.000 abc.py:128(__instancecheck__) 6628 0.025 0.000 0.290 0.000 collections.py:406(__init__) 6628 0.171 0.000 0.264 0.000 collections.py:469(update) 143986 0.120 0.000 0.163 0.000 digraph.py:659(successors_iter) 143986 0.470 0.000 0.633 0.000 digraph.py:676(successors) 322363 0.119 0.000 0.119 0.000 graph.py:293(__getitem__) 6628 0.007 0.000 0.120 0.000 parser.py:154(tokenize) 3314 24.696 0.007 37.675 0.011 parser.py:157(parse) 15426462 5.332 0.000 5.332 0.000 parser.py:181(<genexpr>) 179881 34.347 0.000 55.374 0.000 parser.py:227(match_all_at) 25914452 9.357 0.000 9.357 0.000 parser.py:250(<genexpr>) 18305985 7.697 0.000 17.846 0.000 {all} 148750 0.105 0.000 0.105 0.000 {built-in method __new__ of type object at 0x1560c0} 6628 0.003 0.000 0.003 0.000 {getattr} 6628 0.011 0.000 0.058 0.000 {isinstance} 143986 0.043 0.000 0.043 0.000 {iter} 19994994 2.536 0.000 2.536 0.000 {len} 824216 0.168 0.000 0.168 0.000 {method 'append' of 'list' objects} 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} 9942 0.184 0.000 0.184 0.000 {method 'findall' of '_sre.SRE_Pattern' objects} 260358 0.035 0.000 0.035 0.000 {method 'get' of 'dict' objects} 1 0.000 0.000 0.000 0.000 {method 'keys' of 'dict' objects} 18450696 9.136 0.000 9.136 0.000 {range} 1 0.002 0.002 0.002 0.002 {sorted}