#!/usr/bin/env python
# coding: utf-8

# # PAMPAC: Complex Annotation/Text Pattern Matching
#
# PAMPAC stands for "PAttern Matching with PArser Combinators" and provides an easy but
# powerful way to describe complex annotation and text patterns via simple Python building
# blocks.
#
# PAMPAC can match both the document text and annotations (with their types and features)
# and can run arbitrary Python code for any of the matches it finds.
#
# NOTE: the examples in this document only cover the most important features and components
# of PAMPAC. To see the full range of features, consult the PAMPAC reference and the Python
# API documentation for the `gatenlp.pam.pampac` module.

# In[1]:


import os

from gatenlp import Document
from gatenlp.processing.tokenizer import NLTKTokenizer
from gatenlp.pam.pampac import *

import stanza
from gatenlp.lib_stanza import AnnStanza


# In[2]:


# The following document will be used for many of the examples.
text = """Barack Obama was the 44th president of the US and he followed George W. Bush and was followed by Donald Trump. Before Bush, Bill Clinton was president. Also, lets include a sentence about South Korea which is called 대한민국 in Korean. And a sentence with the full name of Iran in Farsi: جمهوری اسلامی ایران and also with just the word "Iran" in Farsi: ایران Also barack obama in all lower case and SOUTH KOREA in all upper case """

doc = Document(text)

# Create some annotations in the default set.
ann_stanza = AnnStanza(lang="en")
doc = ann_stanza(doc)
doc


# After annotating with the AnnStanza annotator, the document now has the document text
# (a sequence of characters) and a sequence of Token, Sentence, PERSON and other
# annotations. The Token annotations have a number of features, among others the `upos`
# feature, which contains the universal dependencies part of speech tag.
#
# PAMPAC can now be used to find patterns in those annotations.
# ## Using PAMPAC
#
# PAMPAC allows you to create complex patterns for matching annotations or text based on
# basic patterns (match an annotation, match some text) and means to combine them (match a
# sequence of something, match a repetition of something, match alternatives etc.).
# For any match found, some action can be performed.
#
# In order to do this, the following steps are needed:
# * create a _pattern_ (also called _parser_) which describes this sequence
# * create a _rule_ for finding the pattern and performing an _action_ if something has
#   been found
# * create the `Pampac` _matcher_ from the rules and configure how it should apply the
#   rules to a document
# * create the `PampacAnnotator` _annotator_ which will actually run everything on a
#   document

# In[3]:


from gatenlp.pam.pampac import PampacAnnotator, Pampac, Rule
from gatenlp.pam.pampac import Ann, AnnAt, Or, And, Filter, Find, Lookahead, N, Seq, Text
from gatenlp.pam.pampac import AddAnn, UpdateAnnFeatures
from gatenlp.pam.pampac import GetAnn, GetEnd, GetFeature, GetFeatures, GetRegexGroup, GetStart, GetText, GetType
from gatenlp.pam.matcher import isIn, IfNot, Nocase


# In[ ]:


# ## Example 1: Finding Annotations
#
# To find annotations the `Ann` parser is used. The parameters of the `Ann` parser specify
# which conditions have to be satisfied to match an annotation.
#
# Let us create a parser to find all annotations which have type "Token" and a feature
# "upos" with the value "NOUN".

# In[4]:


pat1 = Ann(type="Token", features={"upos": "NOUN"})


# Next, create an action which adds a new annotation of type "PATTERN1".

# In[5]:


action1 = AddAnn(type="PATTERN1")


# Combine the parser and the action into a rule:

# In[6]:


rule1 = Rule(pat1, action1)


# Once we have one or more rules, a Pampac matcher can be built. The matcher can be
# configured to influence how matching rules should get chosen to perform an action
# (e.g. only apply the first matching rule) and how to continue matching after a match has
# been found: try to match at the next position or after the longest match that has been
# found.

# In[7]:


pampac1 = Pampac(rule1, skip="longest", select="first")


# Now we can create a Pampac annotator from the matcher and define which input annotations
# to use and in which set to create any new annotations. Input annotations get specified as
# a list of tuples, where the first element of each tuple is the annotation set name and
# the second element is either a single type or a list of types. That way, the mix of
# annotations to use can be defined very flexibly.

# In[8]:


annt1 = PampacAnnotator(
    pampac1,
    annspec=[("", "Token")],
    outset_name="example1",
)


# Now we can run the annotator on the document and inspect the result.

# In[9]:


tmpdoc = doc.clone()
annt1(tmpdoc)
tmpdoc


# ## Example 2: Annotation constraints
#
# In the previous example the `Ann` parser was configured with two constraints:
# `type="Token"` and `features=dict(upos="NOUN")`. It is possible to specify additional
# constraints and use special constraint helpers to create more complex constraints.
#
# For example, let's assume we want to find all Token annotations where the upos feature is
# one of the values "NOUN" or "DET". This can be achieved with the `isIn` helper:

# In[10]:


pat2 = Ann(type="Token", features={"upos": isIn("NOUN", "DET")})
action2 = AddAnn(type="PATTERN1")
rule2 = Rule(pat2, action2)
pampac2 = Pampac(rule2, skip="longest", select="first")
annt2 = PampacAnnotator(
    pampac2,
    annspec=[("", "Token")],
    outset_name="example2",
)
tmpdoc = doc.clone()
annt2(tmpdoc)
tmpdoc


# Another way to use more complex constraints when using `Ann` is to use a regular
# expression in place of a string. This works with the annotation type parameter and with
# the feature values in the `features` and `features_eq` parameters.
#
# BTW, the `features` parameter will check that whatever is specified occurs in the
# features of an annotation, but the annotation can contain other, additional features.
# The `features_eq` parameter instead checks that what is specified exactly matches the
# features, and that there are no additional other features.
#
# Here is a pattern that will match any annotation where the "text" feature contains an
# upper or lower case a anywhere.

# In[11]:


import re

PAT2b = re.compile(r'.*[aA].*')
pat2b = Ann(type="Token", features={"text": PAT2b})
action2b = AddAnn(type="PATTERN1")
rule2b = Rule(pat2b, action2b)
pampac2b = Pampac(rule2b, skip="longest", select="first")
annt2b = PampacAnnotator(
    pampac2b,
    annspec=[("", "Token")],
    outset_name="example2b",
)
tmpdoc = doc.clone()
annt2b(tmpdoc)
tmpdoc


# It is also possible to use one's own function for the type or feature value parameters:
# if the function returns True for the type name or feature value, it is considered a
# match.
#
# Let us use a function to check whether the text feature of a Token annotation has a
# length that is 1 or 2:

# In[12]:


pat2c = Ann(type="Token", features={"text": lambda x: len(x) in (1, 2)})
action2c = AddAnn(type="PATTERN1")
rule2c = Rule(pat2c, action2c)
pampac2c = Pampac(rule2c, skip="longest", select="first")
annt2c = PampacAnnotator(
    pampac2c,
    annspec=[("", "Token")],
    outset_name="example2c",
)
tmpdoc = doc.clone()
annt2c(tmpdoc)
tmpdoc


# ## Example 3: Matching Text
#
# It is also possible to match text with the `Text` parser. The `Text` parser can take
# either some literal text to find or a compiled regular expression. If a literal text is
# specified, the parameter `matchcase=False` can be used to enable case-insensitive
# matching.
#
# In this example we use the `Text` parser to directly match any sequence of characters
# that starts and ends with an a, but does not contain whitespace:

# In[13]:


PAT3a = re.compile(r'[aA][^\s]*[aA]')
pat3a = Text(text=PAT3a)
action3a = AddAnn(type="PATTERN3a")
rule3a = Rule(pat3a, action3a)
pampac3a = Pampac(rule3a, skip="longest", select="first")
annt3a = PampacAnnotator(
    pampac3a,
    annspec=[("", "Token")],
    outset_name="example3a",
)
tmpdoc = doc.clone()
annt3a(tmpdoc)
tmpdoc


# ## Example 4: Repetitions of annotations
#
# `Ann` and `Text` are the most "basic" patterns to match. PAMPAC offers a number of ways
# to build more complex patterns from those basic patterns. One is the parser `N`, which
# can be used to find a sequence of m to n repetitions of the same sub pattern.
#
# For this example, let's find any repetition of 2 to 4 Tokens with the `upos` feature
# equal to "PROPN".
# The parser `N` allows to specify the minimum and maximum number of occurrences using the
# `min` and `max` parameters.
# Note that not specifying a max parameter does NOT mean an unlimited number of repetitions
# but sets the max parameter to the default value 1.

# In[14]:


pat4a = N(
    Ann("Token", features={"upos": "PROPN"}),
    min=2,
    max=4,
)
action4a = AddAnn(type="PATTERN4a")
rule4a = Rule(pat4a, action4a)
pampac4a = Pampac(rule4a, skip="longest", select="first")
annt4a = PampacAnnotator(
    pampac4a,
    annspec=[("", "Token")],
    outset_name="example4a",
)
tmpdoc = doc.clone()
annt4a(tmpdoc)
tmpdoc


# ## Example 5: Sequence of annotations
#
# Often, we want to find a sequence of different annotations or a sequence of patterns,
# where each pattern in turn is something made up of sub-patterns.
#
# For example, let us find all occurrences of 2 to 4 Tokens with the upos feature "PROPN"
# followed by a token with the lemma "be". So we need to combine the pattern with something
# that indicates that another token with some specific feature value should follow. This
# can be done with the `Seq` parser.
#
# We could create a pattern like this:

# In[15]:


# Spelled-out version of the sequence. `max=4` keeps the repetition identical to the
# 2-to-4 PROPN pattern `pat4a`, which the next cell substitutes for it.
pat5a = Seq(
    N(
        Ann("Token", features={"upos": "PROPN"}),
        min=2,
        max=4,
    ),
    Ann("Token", features={"lemma": "be"}),
)


# Note, however, that the pattern for the 2 to 4 PROPN tokens has already been defined and
# assigned to the variable `pat4a`, so we can simply re-use it here:

# In[16]:


pat5a = Seq(
    pat4a,
    Ann("Token", features={"lemma": "be"}),
)
action5a = AddAnn(type="PATTERN5a")
rule5a = Rule(pat5a, action5a)
pampac5a = Pampac(rule5a, skip="longest", select="first")
annt5a = PampacAnnotator(
    pampac5a,
    annspec=[("", "Token")],
    outset_name="example5a",
)
tmpdoc = doc.clone()
annt5a(tmpdoc)
tmpdoc


# ### Match bindings
#
# As can be seen in the examples above, the action (in our case, adding a new annotation)
# will be carried out for the span and match data for the whole match, e.g. the whole
# sequence as in the previous example.
#
# Sometimes, one would rather want to use just a specific sub-match for the action, or
# perform several actions, each for a different sub-part. This is possible in PAMPAC by
# binding sub matches to a name and then referring to that name in the action.
#
# To test this, let's perform the same pattern matching as above, but perform the action
# only for the match of the final token that matches the lemma "be":

# In[17]:


pat5b = Seq(
    pat4a,
    # Bind the final token's match to a name so the action can refer to it.
    Ann("Token", features={"lemma": "be"}, name="lemma-be"),
)
# The action refers to the binding by the same name instead of using the whole match.
action5b = AddAnn(type="PATTERN5b", name="lemma-be")
rule5b = Rule(pat5b, action5b)
pampac5b = Pampac(rule5b, skip="longest", select="first")
annt5b = PampacAnnotator(
    pampac5b,
    annspec=[("", "Token")],
    outset_name="example5b",
)
tmpdoc = doc.clone()
annt5b(tmpdoc)
tmpdoc


# ## Example 6: Alternatives
#
# Another powerful way to combine sub patterns is by specifying that one of several
# patterns should be tried to get matched. This is done with the `Or` parser, which will
# try each sub pattern in turn and return the first successful match.
#
# To illustrate this, let us try to match either 2 to 4 Tokens with the "upos" feature
# equal to "PROPN" or 1 to 2 Tokens with an "upos" feature that has a value starting
# with "A".

# In[18]:


pat6a = Or(
    pat4a,
    N(
        Ann(type="Token", features={"upos": re.compile(r"^[aA]")}),
        min=1,
        max=2,
    ),
)
action6a = AddAnn(type="PATTERN6a")
rule6a = Rule(pat6a, action6a)
pampac6a = Pampac(rule6a, skip="longest", select="first")
annt6a = PampacAnnotator(
    pampac6a,
    annspec=[("", "Token")],
    outset_name="example6a",
)
tmpdoc = doc.clone()
annt6a(tmpdoc)
tmpdoc


# ## Example 7: Matching next annotation at offset
#
# The `Ann` parser always tries to match the next annotation in the sequence of
# annotations described by the `annspec` parameter. In the examples above, there was a
# single annotation type and annotations occurred one after the other in the document.
#
# In the general case however, there may be different annotation types and there may be
# several annotations with different or identical types and/or features starting at the
# same position. `gatenlp` always imposes a standard order on those annotations: they are
# sorted by start offset, then by annotation id (order of addition to the set).
#
# When there are several annotations at the same offset, we sometimes want to match any of
# these annotations, as long as they satisfy some constraints (e.g. have a specific type
# or specific feature values). This would not be possible with the `Ann` parser, because
# that parser always tries to match the next annotation in the annotation sequence.
#
# The `AnnAt` parser instead looks at the offset of the next annotation in sequence and
# then tries to match any of the annotations at that offset.
#
# In the following example we try to match any Token, followed by either a PERSON
# annotation, or by a upos "NOUN" Token and create a new annotation for that second Token.
# In[19]:


pat7a = Seq(
    Ann("Token"),
    Or(
        AnnAt("PERSON"),
        AnnAt("Token", features={"upos": "NOUN"}),
    ),
)
# NOTE(review): no `name` binding is used here, so the action is applied to the match of
# the whole `Seq` rather than to a named sub-match -- confirm this is the intended span.
action7a = AddAnn(type="PATTERN7a")
rule7a = Rule(pat7a, action7a)
pampac7a = Pampac(rule7a, skip="longest", select="first")
# Both annotation types must be part of the input so that `AnnAt("PERSON")` can match.
annt7a = PampacAnnotator(
    pampac7a,
    annspec=[("", ["Token", "PERSON"])],
    outset_name="example7a",
)
tmpdoc = doc.clone()
annt7a(tmpdoc)
tmpdoc


# ## Example 8: More than one pattern must match
#
# The `And` parser can be used to find locations where more than one pattern matches at
# the same time.
#
# To illustrate this, let's create a pattern which checks that at some location, there are
# 2 to 4 Tokens which have upos equal to "PROPN" and there are 1 or 2 Tokens where the
# "text" feature has a value that is all upper case.

# In[20]:


pat8a = And(
    pat4a,
    N(
        Ann(type="Token", features={"text": re.compile(r"^[A-Z]+$")}),
        min=1,
        max=2,
    ),
)
# Use this example's own annotation type; it previously reused "PATTERN6a" from Example 6.
action8a = AddAnn(type="PATTERN8a")
rule8a = Rule(pat8a, action8a)
pampac8a = Pampac(rule8a, skip="longest", select="first")
annt8a = PampacAnnotator(
    pampac8a,
    annspec=[("", "Token")],
    outset_name="example8a",
)
tmpdoc = doc.clone()
annt8a(tmpdoc)
tmpdoc


# ## Alternate Syntax
#
# For some PAMPAC constructs, it is possible to use an alternate and more concise syntax,
# where Python operators are used instead of the full names.
#
# * Instead of `Or(A, B, C)` it is possible to write `A | B | C`
# * Instead of `Seq(A, B, C, D)` it is possible to write `A >> B >> C >> D`
# * Instead of `And(A, B)` it is possible to write `A & B`
# * Instead of `N(A, min=i, max=i)` it is possible to write `A * i`
# * Instead of `N(A, min=i, max=j)` it is possible to write `A.repeat(i,j)`

# ## Example 9: Parser modifiers
#
# Each of the parsers above can be modified to limit matching by one of the following
# methods:
#
# * `where(predicate)`: the parser only matches if the predicate returns True on at least
#   one of the match results
# * `within(...)`: the parser only matches if the match is within an annotation with the
#   given constraints
# * `notwithin(...)`: the parser only matches if the match is not within an annotation
#   with the given constraints
#
# Similar for:
# * `coextensive(...)`/`notcoextensive(...)`
# * `overlapping(...)`/`notoverlapping(...)`
# * `covering(...)`/`notcovering(...)`
# * `at(...)`/`notat(...)`
# * `before(...)`/`notbefore(...)`
#
# To illustrate this, let us again match 2 to 4 Tokens with an "upos" feature "PROPN", but
# only if the match does not overlap with an annotation of type "PERSON". Note that for
# this to work, the annotations to check for overlapping must be in the input annotation
# set for PAMPAC, so we need to add that type to the `annspec` parameter.

# In[21]:


pat9a = pat4a.notoverlapping(type="PERSON")
action9a = AddAnn(type="PATTERN9a")
rule9a = Rule(pat9a, action9a)
pampac9a = Pampac(rule9a, skip="longest", select="first")
annt9a = PampacAnnotator(
    pampac9a,
    annspec=[("", ["Token", "PERSON"])],
    outset_name="example9a",
)
tmpdoc = doc.clone()
annt9a(tmpdoc)
tmpdoc


# ### Notebook last updated

# In[22]:


import gatenlp

print("NB last updated with gatenlp version", gatenlp.__version__)