Notebook

In [1]:

# load the meters

def load_yaml(filename):
    import yaml
    stream = file(filename)
    return yaml.load(stream)

meter_file = 'settings/gh-meters.yaml'

meters_with_feet = load_yaml(meter_file)
meters_with_feet.keys()

Out[1]:

['==-/-=-=/-==',
 '--=-/=-==/--=-/=-==',
 '-=-=/--==/-=-=/==-',
 '-=-=/--==/-=-=/--==-',
 '=-=/-===/=-=/-===-',
 '--==/--==/==',
 '--==/--==/==-',
 '=-==/-=-=/--=-',
 '=-==/-=-=/==',
 '=-=/-===-/=-=/-===',
 '=-==/=-==/=-=-',
 '==-/=-=-/-==-/=-=-',
 '==-/-=-=/-==-',
 '--==/-=-=/==-',
 '--==/--==/--==/==',
 '=-=/-===-/=-=/-===-',
 '--==/--==/--=-',
 '--==/-=-=/--=-',
 '==-/=-==//==-/=-==-',
 '==-/-===//==-/-===-',
 '==-/-==-/-==-/-==',
 '=-==/=-==/=-==/=-=-',
 '==-/-===-//==-/-===',
 '-=-=/--==/-=-=/==',
 '=--=/-=-=-//=--=/-=-=',
 '-===/-===/-===/-===-',
 '=--=/=-=-/=--=/=',
 '==-/=-==//==-/=-==',
 '=-==/=-==/=-==/=-=',
 '-==/-==/-==/-==',
 '=-==/-=-=/--=',
 '-=-=/--==/-=-=/--==',
 '=-==/--==/==-',
 '=-=/-===/=-=/-===',
 '--==/-=-=/--=',
 '=-==/--==/--==/--=',
 '=--=/-=-=-//=--=/-=-=-',
 '--==/--==/--==/--=-',
 '-===/-===/-==',
 '=--=/=-=-/=--=/=-',
 '--==/--==/--==/--=',
 '===/=-=/-==',
 '-==/-==/-==/-==-',
 '==-/=-==-//==-/=-==',
 '=-==/--==/--=',
 '--==/--==/--==/==-',
 '=--=/-=-=//=--=/-=-=',
 '--=-/=-==/--=-/=-==-',
 '==-/-===//==-/-===',
 '==-/-==-/-==-/-==-',
 '--==/-=-=/==',
 '=-==/--==/--==/==',
 '=-==/--==/==',
 '=-==/-=-=/==-',
 '==-/-===-//==-/-===-',
 '-===/-===/-==-',
 '==-/=-=-/-==-/=-=',
 '=--=/-=-=//=--=/-=-=-',
 '==-/=-==-//==-/=-==-',
 '===/=-=/-==-',
 '=-==/--==/--=-',
 '--==/--==/--=',
 '=-==/--==/--==/--=-',
 '-=-=/--==/-=-=/--=',
 '-===/-===/-===/-===',
 '-=-=/--==/-=-=/--=-',
 '=-==/=-==/=-=',
 '=-==/--==/--==/==-']

In [1]:

In [2]:

# grab bad combos and arrange by long-short combos into bad_types
#
# e.g. bad_types[('-','-')] = unacceptable combinations between two shorts
import csv
bad_combos_in = []
with open('settings/bad_combos.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar="'")
    for row in reader:
        assert len(row) == 2
        bad_combos_in.append(tuple(row))
    bad_combos = bad_combos_in

bad_types = {}
for x in (('-','-'),('-','='),('=','-'),('=','=')):
    bad_types[x]=[]

def type_of_bad_combo(s):
    assert s in ['l','s']
    if s=='l':
        return '='
    elif s=='s':
        return '-'
        
for bad in bad_combos:
    
    a  = bad[0][0]
    b = bad[1][0]
    assert a,b in ['l','s']
    a_type = type_of_bad_combo(a)
    b_type = type_of_bad_combo(b)
    bad_types[(a_type,b_type)].append(bad)

# conver to tuple
for k in bad_types.keys():
    bad_types[k] = tuple(bad_types[k])
    
for combo in bad_types.keys():
    print combo,bad_types[combo]
bad_types

('=', '=') (('l_bcsc', 'l_v'), ('l_bsc', 'l_v'), ('l_bcsc', 'l_z'), ('l_csc', 'l_z'), ('l_csc', 'l_v'))
('=', '-') (('l_bcsc', 's_v<b>'), ('l_bsc', 's_v<b>'), ('l_bcsc', 's_z'), ('l_csc', 's_z'), ('l_csc', 's_v<b>'))
('-', '-') (('s_c', 's_v<b>'), ('s_c', 's_c<h+wb>'), ('s_c', 's_z'), ('s_bcs', 's_c'), ('s_bcs', 's_c<h+wb>'), ('s_cs', 's_c<h+wb>'), ('s_bs', 's_c'), ('s_cs', 's_c'))
('-', '=') (('s_c', 'l_v'), ('s_c', 'l_v<ii+z>z'), ('s_c', 'l_z'))

Out[2]:

{('-', '-'): (('s_c', 's_v<b>'),
  ('s_c', 's_c<h+wb>'),
  ('s_c', 's_z'),
  ('s_bcs', 's_c'),
  ('s_bcs', 's_c<h+wb>'),
  ('s_cs', 's_c<h+wb>'),
  ('s_bs', 's_c'),
  ('s_cs', 's_c')),
 ('-', '='): (('s_c', 'l_v'), ('s_c', 'l_v<ii+z>z'), ('s_c', 'l_z')),
 ('=', '-'): (('l_bcsc', 's_v<b>'),
  ('l_bsc', 's_v<b>'),
  ('l_bcsc', 's_z'),
  ('l_csc', 's_z'),
  ('l_csc', 's_v<b>')),
 ('=', '='): (('l_bcsc', 'l_v'),
  ('l_bsc', 'l_v'),
  ('l_bcsc', 'l_z'),
  ('l_csc', 'l_z'),
  ('l_csc', 'l_v'))}

In [3]:

# create digraph of meters (DG) using networkx
# each node has a type (=,-, and 0 [for the first one])
# the node at the end of each meter has a meter_type and meter_full_description attribute

import networkx as nx

DG=nx.DiGraph()

DG.add_node(0,type='0') # this is the start

for meter in meters_with_feet:
    
    meter_type = meters_with_feet[meter]
    meter_full_description = meter
    
    meter = meter.replace('/','') # ignore feet denominator for now
    
    node_id = 0
    
    curr_node = 0
    
    
    for i,c in enumerate(meter):

        found_it = False

        for n in DG.successors(curr_node):

            node = DG.node[n]

            if node['type']==c:
                curr_node = n
                found_it = True
                break

        if found_it==False:
            new_node = len(DG.nodes())            
            DG.add_node(new_node, type=c)            
            DG.add_edge(curr_node,new_node)
            
            # now add restraints to edge, this is a 'bad_combo' attribute 
          #  print DG.node[curr_node]['type']
            old_c = DG.node[curr_node]['type']
            if (old_c,c) in bad_types:
                DG[curr_node][new_node]['bad_combos'] = bad_types[(old_c, c)]
       #         print 'yes found ',old_c,c, bad_types[(old_c,c)]
            curr_node = new_node
        if i==len(meter)-1: # store some metrical data at the last node
           
            DG.node[curr_node]['meter_type'] = meter_type
            DG.node[curr_node]['meter_full_description'] = meter_full_description
    

In [4]:

# test to make sure all meters match
# note: there are multiple metrical matches with the same meter id

for test_meter in meters_with_feet.keys():
    orig_meter = test_meter # contains feet
    test_meter = test_meter.replace('/','')
    node_id = 0
    for ch in test_meter:
        found=False
        for s in DG.successors_iter(node_id):
            if DG.node[s]['type']==ch:
                found = True
                #found=True
                node_id = s
        assert found==True
    final_node = DG.node[node_id]
    print final_node['meter_type'],final_node['meter_full_description']

    assert final_node['meter_type']==meters_with_feet[orig_meter]
    assert final_node['meter_full_description']==orig_meter

G19 ==-/-=-=/-==
G6 --=-/=-==/--=-/=-==
G9 -=-=/--==/-=-=/==-
G16 -=-=/--==/-=-=/--==-
G4 =-=/-===/=-=/-===-
G11 --==/--==/==
G11 --==/--==/==-
G8 =-==/-=-=/--=-
G8 =-==/-=-=/==
G4 =-=/-===-/=-=/-===
G14 =-==/=-==/=-=-
G3 ==-/=-=-/-==-/=-=-
G19 ==-/-=-=/-==-
G8 --==/-=-=/==-
G5 --==/--==/--==/==
G4 =-=/-===-/=-=/-===-
G11 --==/--==/--=-
G8 --==/-=-=/--=-
G10 ==-/=-==//==-/=-==-
G18 ==-/-===//==-/-===-
G13 ==-/-==-/-==-/-==
G1 =-==/=-==/=-==/=-=-
G18 ==-/-===-//==-/-===
G9 -=-=/--==/-=-=/==
G15 =--=/-=-=-//=--=/-=-=
G2 -===/-===/-===/-===-
G17 =--=/=-=-/=--=/=
G10 ==-/=-==//==-/=-==
G1 =-==/=-==/=-==/=-=
G12 -==/-==/-==/-==
G8 =-==/-=-=/--=
G16 -=-=/--==/-=-=/--==
G11 =-==/--==/==-
G4 =-=/-===/=-=/-===
G8 --==/-=-=/--=
G5 =-==/--==/--==/--=
G15 =--=/-=-=-//=--=/-=-=-
G5 --==/--==/--==/--=-
G7 -===/-===/-==
G17 =--=/=-=-/=--=/=-
G5 --==/--==/--==/--=
G19 ===/=-=/-==
G12 -==/-==/-==/-==-
G10 ==-/=-==-//==-/=-==
G11 =-==/--==/--=
G5 --==/--==/--==/==-
G15 =--=/-=-=//=--=/-=-=
G6 --=-/=-==/--=-/=-==-
G18 ==-/-===//==-/-===
G13 ==-/-==-/-==-/-==-
G8 --==/-=-=/==
G5 =-==/--==/--==/==
G11 =-==/--==/==
G8 =-==/-=-=/==-
G18 ==-/-===-//==-/-===-
G7 -===/-===/-==-
G3 ==-/=-=-/-==-/=-=
G15 =--=/-=-=//=--=/-=-=-
G10 ==-/=-==-//==-/=-==-
G19 ===/=-=/-==-
G11 =-==/--==/--=-
G11 --==/--==/--=
G5 =-==/--==/--==/--=-
G9 -=-=/--==/-=-=/--=
G2 -===/-===/-===/-===
G9 -=-=/--==/-=-=/--=-
G14 =-==/=-==/=-=
G5 =-==/--==/--==/==-

In [5]:

# pretty picture of the graph. 
# note: Haven't figured out how to best display this. A moveable SVG would be nice.

def draw_meter_graph(g):
    pos=nx.spring_layout(g)
    import matplotlib.pyplot as plt 
    
    plt.figure(figsize=(15,15))

    labels=dict((n,d['type']) for n,d in g.nodes(data=True)) # need to change labels for 0,1,etc.

    nx.draw(g,labels=labels,node_color='#A0CBE2',node_size=200)

draw_meter_graph(DG) # T-Shirts?

In [6]:

#nx.write_graphml(DG, 'myxml.xml')#

In [96]:

# some data on how many nodes have "successors"
# shows how much recursion is being avoided

successor_count = {}

successor_count[0] = 0
successor_count[1] = 0
successor_count[2] = 0

for n in DG.nodes():
    num_successors = len(DG.successors(n))
 #   print n, num_successors
    assert (num_successors<3)
    successor_count[num_successors] +=1

print "number of nodes with # of successors"
successor_count

number of nodes with # of successors

Out[96]:

{0: 34, 1: 205, 2: 33}

In [9]:

# for now, loading in the old Scanner to get the different parsers
# note: turning the parsers into graphs will be killer, based on a character level
# note: will be adding constraints to the EDGES so that scanner won't descend if conditions are not met
from scanner import Scanner
scanner = Scanner()

In [10]:

# this does the basic scan of the input text and classifys as consonant(c),long vowel(v),etc
pp = scanner.pp # this does the basic scan of text

In [11]:

pp.parse(' shaan shaan')

Out[11]:

'bcvcbcvc'

In [12]:

# this is the parser for the short syllables
sp = scanner.sp # parser for shorts (-)
sp.parse('c')

Out[12]:

's_c'

In [13]:

# this is the parser for the long syllables
lp = scanner.lp # parser for longs (=)
lp.parse('cv')

Out[13]:

'l_cv'

In [14]:

# this is the tokenizer, which is used in the graph_scan
# TODO: check the that the long parser tokenizer is exactly the same as the short tokenizer
lp.tokenize('cv')

Out[14]:

['c', 'v']

In [15]:

sp.tokenize('cv') # need to check this is the same as lp

Out[15]:

['c', 'v']

In [7]:

from graph_parser import GraphParser
new_sp = GraphParser('settings/short.yaml')
tkns = new_sp.tokenize('cv')
new_sp.match_first_at(tkns, 0)

passed edge rule

Out[7]:

ParserRule(production='s_cv<b>', prev_class=None, prev_tokens=None, tokens=['c', 'v'], next_tokens=None, next_class='b')

In [16]:

new_lp = GraphParser('settings/long.yaml')

In [17]:

# just checking the lp vs. sp parser here. Can do this programmatically later.
s = ' naqsh faryaadii'
scan = pp.parse(s)
print scan
print 'long tokenizer: ',new_lp.tokenize(scan)
print 'short tokenizer:',new_sp.tokenize(scan)

bcsccbcsccvcv
long tokenizer:  ['b', 'c', 's', 'c', 'c', 'b', 'c', 's', 'c', 'c', 'v', 'c', 'v']
short tokenizer: ['b', 'c', 's', 'c', 'c', 'b', 'c', 's', 'c', 'c', 'v', 'c', 'v']

In [18]:

lp = new_lp
sp = new_sp

In [21]:

# this is the graph scan.
from collections import namedtuple

ScanResult = namedtuple('ScanResult',["scan","matches"]) # used for completed scans
NodeMatch  = namedtuple('NodeMatch', ["node_type",       # = or -
                                      "matched_tokens",  # tokens matched at node
                                      "node_id",         # id of node in graph
                                      "orig_tokens",     # original tokens that were matched
                                      "found",      # production of parser 
                                      "token_i"])        # used for matches at nodes in graph
      
def graph_scan(in_string): 

    completed_scans = [] # holds complete scans
    
    #from collections import namedtuple
    
    # this is the scan of the input string to bcv, etc.
    scan = pp.parse(in_string)
    pd = pp.parse_details # details on the matched tokens, rules, etc.
    
    # this generates scan_tokens from the scan of the input string, e.g. ['b','c','v'], using the long parser (lp)
    scan_tokens = lp.tokenize(scan)

    # This is a check to see that the short and long parsers match
    # TODO: remove later
    
    import collections
    assert collections.Counter(scan_tokens) == collections.Counter(lp.tokenize(scan))
    
    # this function descends into node (node_id), passing current token_i, matches, and a string represent
    def descend_node(node_id, token_i, matches, matched_so_far):
        for successor_id in DG.successors(node_id):
            
            node_type = DG.node[successor_id]['type']
            assert node_type in ('=','-')
            
            if node_type=='=': 
                parser = lp
            else:
                parser = sp
                
            # for each match m at token_i of scan_tokens 
            # m contains ['tokens', 'start', 'rule_id', 'rule']
            # m['rule'] contain ['tokens', 'production']
            # TODO: declunkify.
            
            for m in parser.match_all_at(scan_tokens, token_i):
                #print 'found', m['tokens']
                # next, check to make sure that this is not a bad combination
                # do so by looking for constraints on the edge
                # note: this could be added as a constraint to match_all_at() as not_starting_with ...

                if len(matches)>0: # if already matched something
                    a = matches[-1].found # details of previous match
                    b = m.production#**['rule']['production']   # details of current match 
                    if 'bad_combos' in DG[node_id][successor_id]: # if 'bad_combos' in the a,b's edge
                        if (a,b) in DG[node_id][successor_id]['bad_combos']: # if it's bad
       #                    print 'aborting! found ',a,b
                            continue # abort! bad combination

                # determine orig_tokens
                # meaning, what is matched from original input and parsed to b,c,s, etc.
    
                orig_tokens =[]
                for i in range(token_i, token_i+len(m.tokens)):#['tokens'])):
                    orig_tokens.append(pd[i]['rule']['tokens'])  ## parser details here
                    # this will break if 'rule' is None
                    
#                    except TypeError:
#                        print 'error','i=',i
#                        print 'pd[]i]',pd[i]
#                        print 'error',m['tokens'], 'i',i
#                        rule','tokens',"\n",m
 
                # advance token index based on length of match tokens
                
                

                
                
                
                # generate match data
                
                matched_tokens = m.tokens
                
                match_data = NodeMatch(node_type=node_type,
                                       matched_tokens = matched_tokens,
                                       node_id=node_id,
                                       orig_tokens=orig_tokens,
                                       found=m.production,
                                       token_i=token_i)
                # advance token_i 
                
                new_token_i = token_i + len(matched_tokens)
    
                matches.append(match_data)
                
                so_far=matched_so_far + node_type
                
                #print so_far, matched_tokens, orig_tokens
 
                # if we're at the end
                if new_token_i == len(scan_tokens) and 'meter_type' in DG.node[successor_id]:
                    #and 'meter_type' in DG.node[s]:# and len(DG.successors(s))==0:
                    #print 'made it!', successor_id, DG.successors(successor_id),DG.node[successor_id], so_far
                    completed_scans.append(ScanResult(scan=so_far, matches=matches))
                    #,"matches"]) # used for completed scans

               #     for x in matches:
               #         print x

                    match_node = successor_id
                    #print DG.node(match_node,data=True)#[match_node]
                else:                  
                    descend_node(successor_id, new_token_i,matches,so_far)
    # start descent into node 0 of the graph, at token_i 0, with no matches       
    descend_node(0, 0, [], '')

    return completed_scans
print graph_scan(" naqsh faryaadii hai kis kii sho;xii-e ta;hriir kaa")#letaa huu;n maktab-e ;gam-e dil me;n sabaq hanuuz")#" naqsh faryaadii hai kis kii sho;xii-e ta;hriir kaa")#faa((ilaatun faa((ilaatun faa((ilaatun faa((ilun")#naqsh faryaadii hai kis kii sho;xii-e tahriir kaa")

passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
passed edge rule
[ScanResult(scan='=-===-===-===-=', matches=[NodeMatch(node_type='=', matched_tokens=['b', 'c', 's', 'c'], node_id=0, orig_tokens=[' ', ['n'], ['a'], ['q']], found='l_bcsc', token_i=0), NodeMatch(node_type='-', matched_tokens=['c'], node_id=1, orig_tokens=[['sh']], found='s_c', token_i=4), NodeMatch(node_type='-', matched_tokens=['b', 'c', 's'], node_id=46, orig_tokens=[' ', ['f'], ['a']], found='s_bcs', token_i=5), NodeMatch(node_type='=', matched_tokens=['b', 'c', 's', 'c'], node_id=46, orig_tokens=[' ', ['f'], ['a'], ['r']], found='l_bcsc', token_i=5), NodeMatch(node_type='-', matched_tokens=['c'], node_id=47, orig_tokens=[['y']], found='s_c', token_i=9), NodeMatch(node_type='=', matched_tokens=['c', 'v'], node_id=47, orig_tokens=[['y'], ['aa']], found='l_cv', token_i=9), NodeMatch(node_type='-', matched_tokens=['c', 'v'], node_id=68, orig_tokens=[['d'], ['ii']], found='s_cv<b>', token_i=11), NodeMatch(node_type='-', matched_tokens=['b', 'c', 'v'], node_id=69, orig_tokens=[' ', ['h'], ['ai']], found='s_bcv<b>', token_i=13), NodeMatch(node_type='=', matched_tokens=['b', 'c', 's', 'c'], node_id=214, orig_tokens=[' ', ['k'], ['i'], ['s']], found='l_bcsc', token_i=16), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=215, orig_tokens=[' ', ['k'], ['ii']], found='l_bcv', token_i=20), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=216, orig_tokens=[' ', ['sh'], ['o']], found='l_bcv', token_i=23), NodeMatch(node_type='=', matched_tokens=['c', 'v<ii+z>'], node_id=217, orig_tokens=[[';x'], ['ii']], found='l_cv<ii+z>', token_i=26), NodeMatch(node_type='-', matched_tokens=['z'], node_id=218, orig_tokens=[['-e']], found='s_z', token_i=28), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=69, orig_tokens=[' ', ['h'], ['ai']], found='l_bcv', token_i=13), NodeMatch(node_type='-', matched_tokens=['b', 'c', 's'], node_id=70, orig_tokens=[' ', ['k'], ['i']], found='s_bcs', token_i=16), NodeMatch(node_type='-', matched_tokens=['c'], node_id=68, orig_tokens=[['d']], found='s_c', token_i=11), NodeMatch(node_type='=', matched_tokens=['c', 'v'], node_id=68, orig_tokens=[['d'], ['ii']], found='l_cv', token_i=11), NodeMatch(node_type='-', matched_tokens=['b', 'c', 'v'], node_id=87, orig_tokens=[' ', ['h'], ['ai']], found='s_bcv<b>', token_i=13), NodeMatch(node_type='=', matched_tokens=['b', 'c', 's', 'c'], node_id=88, orig_tokens=[' ', ['k'], ['i'], ['s']], found='l_bcsc', token_i=16), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=89, orig_tokens=[' ', ['k'], ['ii']], found='l_bcv', token_i=20), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=90, orig_tokens=[' ', ['sh'], ['o']], found='l_bcv', token_i=23), NodeMatch(node_type='-', matched_tokens=['c', 'v<ii+z>'], node_id=91, orig_tokens=[[';x'], ['ii']], found='s_cv<ii+z>', token_i=26), NodeMatch(node_type='=', matched_tokens=['z'], node_id=92, orig_tokens=[['-e']], found='l_z', token_i=28), NodeMatch(node_type='=', matched_tokens=['b', 'c', 's', 'c'], node_id=93, orig_tokens=[' ', ['t'], ['a'], [';h']], found='l_bcsc', token_i=29), NodeMatch(node_type='=', matched_tokens=['c', 'v'], node_id=153, orig_tokens=[['r'], ['ii']], found='l_cv', token_i=33), NodeMatch(node_type='-', matched_tokens=['c'], node_id=154, orig_tokens=[['r']], found='s_c', token_i=35), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=155, orig_tokens=[' ', ['k'], ['aa']], found='l_bcv', token_i=36), NodeMatch(node_type='-', matched_tokens=['b', 'c', 's'], node_id=93, orig_tokens=[' ', ['t'], ['a']], found='s_bcs', token_i=29), NodeMatch(node_type='-', matched_tokens=['c'], node_id=91, orig_tokens=[[';x']], found='s_c', token_i=26), NodeMatch(node_type='-', matched_tokens=['b', 'c', 's'], node_id=0, orig_tokens=[' ', ['n'], ['a']], found='s_bcs', token_i=0)])]

In [106]:

data = {}
import csv
verses = {}
multiple_matches = []
with open('data/verses.csv', 'rb') as csvfile:
    versereader = csv.reader(csvfile, delimiter=',', quotechar='|')
    count = 0
    multiple_match_count = 0
    for row in versereader:
        (verse_id, input_string, real_scan) = row
        data[verse_id] = {'input_string': input_string, 'real_scan': real_scan}

In [107]:

s= data['001.02.1']['input_string']
graph_scan(s)

Out[107]:

[ScanResult(scan='=-===-===-===-=', matches=[NodeMatch(node_type='=', matched_tokens=['b', 'c', 's', 'c'], node_id=0, orig_tokens=[' ', ['.s'], ['u'], ['b']], found='l_bcsc', token_i=0), NodeMatch(node_type='-', matched_tokens=['c'], node_id=1, orig_tokens=[[';h']], found='s_c', token_i=4), NodeMatch(node_type='-', matched_tokens=['b', 'c', 's'], node_id=46, orig_tokens=[' ', ['k'], ['a']], found='s_bcs', token_i=5), NodeMatch(node_type='=', matched_tokens=['b', 'c', 's', 'c'], node_id=46, orig_tokens=[' ', ['k'], ['a'], ['r']], found='l_bcsc', token_i=5), NodeMatch(node_type='-', matched_tokens=['c', 'v'], node_id=47, orig_tokens=[['n'], ['aa']], found='s_cv<b>', token_i=9), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=48, orig_tokens=[' ', ['sh'], ['aa']], found='l_bcv', token_i=11), NodeMatch(node_type='-', matched_tokens=['c'], node_id=47, orig_tokens=[['n']], found='s_c', token_i=9), NodeMatch(node_type='=', matched_tokens=['c', 'v'], node_id=47, orig_tokens=[['n'], ['aa']], found='l_cv', token_i=9), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=68, orig_tokens=[' ', ['sh'], ['aa']], found='l_bcv', token_i=11), NodeMatch(node_type='-', matched_tokens=['c'], node_id=87, orig_tokens=[['m']], found='s_c', token_i=14), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=88, orig_tokens=[' ', ['k'], ['aa']], found='l_bcv', token_i=15), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=89, orig_tokens=[' ', ['l'], ['aa']], found='l_bcv', token_i=18), NodeMatch(node_type='=', matched_tokens=['c', 'v'], node_id=90, orig_tokens=[['n'], ['aa']], found='l_cv', token_i=21), NodeMatch(node_type='-', matched_tokens=['b', 'c', 'v'], node_id=91, orig_tokens=[' ', ['h'], ['ai']], found='s_bcv<b>', token_i=23), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=92, orig_tokens=[' ', ['j'], ['uu']], found='l_bcv', token_i=26), NodeMatch(node_type='=', matched_tokens=['z'], node_id=93, orig_tokens=[['-e']], found='l_z', token_i=29), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=153, orig_tokens=[' ', ['sh'], ['ii']], found='l_bcv', token_i=30), NodeMatch(node_type='-', matched_tokens=['c'], node_id=154, orig_tokens=[['r']], found='s_c', token_i=33), NodeMatch(node_type='=', matched_tokens=['b', 'c', 'v'], node_id=155, orig_tokens=[' ', ['k'], ['aa']], found='l_bcv', token_i=34), NodeMatch(node_type='-', matched_tokens=['z'], node_id=93, orig_tokens=[['-e']], found='s_z', token_i=29), NodeMatch(node_type='-', matched_tokens=['b', 'c', 's'], node_id=0, orig_tokens=[' ', ['.s'], ['u']], found='s_bcs', token_i=0)])]

In [108]:

#print data['004.10x.0']
#graph_scan(data['004.10x.0']['input_string'])
def graph_scan_all():
    for k in sorted(data.keys()):
        #print k
        graph_scan(data[k]['input_string'])
def trad_scan_all():
    for k in sorted(data.keys()):
        scanner.scan(data[k]['input_string'], known_only = True)

In [111]:

import cProfile
cProfile.run('trad_scan_all()')

         157263342 function calls (156164871 primitive calls) in 152.962 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.024    0.024  152.962  152.962 <ipython-input-108-c2a5a03a3c0a>:7(trad_scan_all)
        1    0.000    0.000  152.962  152.962 <string>:1(<module>)
     3314    0.004    0.000    0.064    0.000 parser.py:154(tokenize)
     3314   24.024    0.007   36.788    0.011 parser.py:157(parse)
 15426462    5.292    0.000    5.292    0.000 parser.py:181(<genexpr>)
   279914   51.097    0.000   82.610    0.000 parser.py:227(match_all_at)
 39695932   14.124    0.000   14.124    0.000 parser.py:250(<genexpr>)
   320319    0.378    0.000    1.374    0.000 re.py:134(match)
   210724    0.251    0.000   28.198    0.000 re.py:139(search)
   531043    1.274    0.000   24.059    0.000 re.py:228(_compile)
   210724    0.249    0.000   28.446    0.000 scanner.py:43(meter_ok)
   259310    0.585    0.000    0.585    0.000 scanner.py:46(bad_combo)
     3314    2.820    0.001  152.936    0.046 scanner.py:66(scan)
401482/101899    3.146    0.000    4.418    0.000 sre_compile.py:32(_compile)
   101899    1.328    0.000    4.974    0.000 sre_compile.py:361(_compile_info)
   203798    0.165    0.000    0.197    0.000 sre_compile.py:474(isstring)
   101899    0.268    0.000    9.676    0.000 sre_compile.py:480(_code)
   101899    0.609    0.000   22.269    0.000 sre_compile.py:495(compile)
   599166    0.296    0.000    0.375    0.000 sre_parse.py:126(__len__)
  1198332    0.872    0.000    1.371    0.000 sre_parse.py:130(__getitem__)
  1198708    0.665    0.000    0.970    0.000 sre_parse.py:138(append)
401482/101899    2.414    0.000    2.895    0.000 sre_parse.py:140(getwidth)
   101899    0.116    0.000    0.322    0.000 sre_parse.py:178(__init__)
  1803988    2.363    0.000    2.765    0.000 sre_parse.py:182(__next)
   800926    0.454    0.000    1.177    0.000 sre_parse.py:195(match)
  1302645    0.812    0.000    2.648    0.000 sre_parse.py:201(get)
    99861    0.271    0.000    0.347    0.000 sre_parse.py:257(_escape)
201760/101899    1.667    0.000   10.631    0.000 sre_parse.py:301(_parse_sub)
301621/101899    2.886    0.000   10.277    0.000 sre_parse.py:379(_parse)
   101899    0.526    0.000   11.702    0.000 sre_parse.py:663(parse)
   101899    0.084    0.000    0.084    0.000 sre_parse.py:67(__init__)
   401482    0.255    0.000    0.255    0.000 sre_parse.py:90(__init__)
   101899    0.172    0.000    0.172    0.000 {_sre.compile}
 23913376   10.187    0.000   23.674    0.000 {all}
  1504029    0.599    0.000    0.599    0.000 {isinstance}
31885364/31685642    4.102    0.000    4.200    0.000 {len}
   199722    0.066    0.000    0.066    0.000 {max}
  5857358    1.115    0.000    1.115    0.000 {method 'append' of 'list' objects}
     1019    0.047    0.000    0.047    0.000 {method 'clear' of 'dict' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
     4076    0.002    0.000    0.002    0.000 {method 'extend' of 'list' objects}
     6628    0.128    0.000    0.128    0.000 {method 'findall' of '_sre.SRE_Pattern' objects}
   730765    0.311    0.000    0.311    0.000 {method 'get' of 'dict' objects}
   101899    0.028    0.000    0.028    0.000 {method 'items' of 'dict' objects}
        1    0.000    0.000    0.000    0.000 {method 'keys' of 'dict' objects}
   320319    0.294    0.000    0.294    0.000 {method 'match' of '_sre.SRE_Pattern' objects}
   139957    0.072    0.000    0.072    0.000 {method 'pop' of 'list' objects}
   210724    4.589    0.000    4.589    0.000 {method 'search' of '_sre.SRE_Pattern' objects}
  1002686    0.415    0.000    0.415    0.000 {min}
   899125    0.135    0.000    0.135    0.000 {ord}
 23913376   11.379    0.000   11.379    0.000 {range}
        1    0.002    0.002    0.002    0.002 {sorted}

In [112]:

cProfile.run('graph_scan_all()')
#t.timeit()

         100477739 function calls (100337067 primitive calls) in 97.243 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     3314    0.070    0.000   97.359    0.029 <ipython-input-105-ddef7d259286>:12(graph_scan)
143986/3314    2.551    0.000   59.205    0.018 <ipython-input-105-ddef7d259286>:32(descend_node)
        1    0.008    0.008   97.369   97.369 <ipython-input-108-c2a5a03a3c0a>:3(graph_scan_all)
        1    0.000    0.000   97.369   97.369 <string>:1(<module>)
     4039    0.004    0.000    0.006    0.000 <string>:8(__new__)
    13256    0.017    0.000    0.017    0.000 _weakrefset.py:68(__contains__)
     6628    0.027    0.000    0.047    0.000 abc.py:128(__instancecheck__)
     6628    0.025    0.000    0.290    0.000 collections.py:406(__init__)
     6628    0.171    0.000    0.264    0.000 collections.py:469(update)
   143986    0.120    0.000    0.163    0.000 digraph.py:659(successors_iter)
   143986    0.470    0.000    0.633    0.000 digraph.py:676(successors)
   322363    0.119    0.000    0.119    0.000 graph.py:293(__getitem__)
     6628    0.007    0.000    0.120    0.000 parser.py:154(tokenize)
     3314   24.696    0.007   37.675    0.011 parser.py:157(parse)
 15426462    5.332    0.000    5.332    0.000 parser.py:181(<genexpr>)
   179881   34.347    0.000   55.374    0.000 parser.py:227(match_all_at)
 25914452    9.357    0.000    9.357    0.000 parser.py:250(<genexpr>)
 18305985    7.697    0.000   17.846    0.000 {all}
   148750    0.105    0.000    0.105    0.000 {built-in method __new__ of type object at 0x1560c0}
     6628    0.003    0.000    0.003    0.000 {getattr}
     6628    0.011    0.000    0.058    0.000 {isinstance}
   143986    0.043    0.000    0.043    0.000 {iter}
 19994994    2.536    0.000    2.536    0.000 {len}
   824216    0.168    0.000    0.168    0.000 {method 'append' of 'list' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
     9942    0.184    0.000    0.184    0.000 {method 'findall' of '_sre.SRE_Pattern' objects}
   260358    0.035    0.000    0.035    0.000 {method 'get' of 'dict' objects}
        1    0.000    0.000    0.000    0.000 {method 'keys' of 'dict' objects}
 18450696    9.136    0.000    9.136    0.000 {range}
        1    0.002    0.002    0.002    0.002 {sorted}

In [ ]: