import codecs import unicodedata with codecs.open("faust.txt","r","utf-8") as stream: text = stream.read() # !sudo locale-gen de_DE.UTF-8 import locale locale.setlocale(locale.LC_ALL,'de_DE.utf8') # C, en_US.utf8, ... import re re.search('cheese','the cheese and the bread') re.search('butter','the cheese and the bread') re.match('cheese','the cheese and the bread') re.match('the','the cheese and the bread') re.search('THE','the cheese and the bread') re.search('THE','the cheese and the bread',re.I) re.search('THE(?i)','the cheese and the bread') re.sub('cheese','butter','bread and cheese') re.subn('cheese','butter','bread and cheese') re.findall('spam','spam, spam, ham, and spam') re.split(' ','the quick brown fox') re.findall(r'THE','the cat in the hat',re.I) re.findall(r'THE(?i)','the cat in the hat') g = re.search('cheese','the cheese and the bread') g g.group(0) g.start(0),g.end(0) obj = re.compile('cheese') obj obj.search('bread and cheese') obj.match('bread and cheese') obj.sub('butter','bread and cheese') print 'a\bc' print r'a\bc' print "a\"b\"c" print r"""a\"b\"c""" print ur"""a\"b\"c""" re.search(r'\w+','the bread and the cheese').group(0) re.search(ur'\w+',u'Brot und Käse').group(0) re.search(ur'Käse',u'Der Käse und das Brot.') re.search('Käse',u'Der Käse und das Brot.') re.search(ur'Käse','Der Käse und das Brot.') re.search('Käse','Der Käse und das Brot.') s = unicodedata.normalize('NFD',u'Käse') print "(%s)"%s re.search(s,'Der Käse und das Brot') def normalizing_search(regex,s): regex = unicodedata.normalize('NFC',regex) s = unicodedata.normalize('NFC',s) return re.search(regex,s) normalizing_search(s,u'Der Käse und das Brot') re.findall('c.t','the cat on the cot') re.findall('we*t','wet cowtippers tweet frequently') re.findall('we+t','wet cowtippers tweet frequently') re.findall('we?t','wet cowtippers tweet frequently') re.findall('[ew]t','wet cowtippers tweet frequently') print re.findall(r'\^\.\^','this ^.^ is a Japanese smiley, ^_^') print re.findall(r'\^.\^','this ^.^ is a Japanese smiley, ^_^') print re.findall(r'w','wet cowtippers tweet frequently') print re.findall(r'^w','wet cowtippers tweet frequently') print re.findall(r'(tweet|twit)','wet cowtippers tweet frequently, but are twits') print re.findall(r'ab+','xyz abbbbbbc def') print re.findall(r'ab+?','xyz abbbbbbc def') print re.search(r'ab+?','xyz abbbbbbc abc def').start(0) print re.findall(r'the ([^ ]*)','the cat in the hat') print re.findall(r'(a|the) ([^ ]*)','a cat in the hat') g = re.search(r'(a|the) ([^ ]*)','a cat in the hat') g.group(0) g.group(1) g.group(2) print g.start(2),g.end(2),g.span(2) print re.findall(r'(?:a|the) ([^ ]*)','a cat in the hat') print re.search(r'(the|a) [^ ]+ near \1 [^ ]+','the cat near the cat') print re.search(r'(the|a) [^ ]+ near \1 [^ ]+','a cat near a cat') print re.search(r'(the|a) [^ ]+ near \1 [^ ]+','the cat near a cat') print re.split(r'([,;]?\s+|\W+$)','The quick, brown fox jumps; over lazy dogs!') print re.findall(r'(.)\1','aa bc dd ef') print re.findall(r'(?P.)(?P=id)','aa bc dd ef') print re.search(r'(?Pb.)','aa bc dd ef').group("id") q = r'^(<)?[^<>]+(?(1)>|)$' print re.search(q,'abc') print re.search(q,'') print re.search(q,']+(?(1)>|)$' qx = r"""(?x) ^(<)? # match optional beginning "<" [^<>]* # match any non-bracket character (?(1)>|)$ # match a ">" at the end if we did so at the beginning """ print re.search(q,'') print re.search(qx,'') re.findall(r'\w+',"The quick brown fox... jumped over the la$y dogz.") numbers = re.compile(r'((?:\d+\.\d*|\d*\.\d+)(?:e[+-]\d+)?)',re.I) numbers.findall("The fine structure constant is 7.2973525698e-3, and pi is about 3.14159.") re.findall(r"[abc](?=z)","ax by cz") re.findall(r"[abc](?!z)","ax by cz") re.findall(r"(?<=a)[xyz]","ax by cz") re.findall(r"(?)(?:s|es|ed|ing)?\b(?i)",words=words) allwords.findall("The quick brown fox jumps over the lazy dogs.") fuzzywords = regex.compile(r"\b(\L){e<=2}(?:s|es|ed|ing)?\b(?i)",words=words) print fuzzywords.findall("The quock briwn fox jxmps over the lazy dogs.") fuzzywords = regex.compile(r"\b(?=\w)(\L){e<=2}(?:s|es|ed|ing)?\b(?i)",words=words) print fuzzywords.findall("The quock briwn fox jxmps over the lazy dogs.") regex.findall(ur'\S+',u'the quick рыжая лиса') regex.findall(ur'\w+',u'the quick рыжая лиса') regex.findall(ur'\p{Script=Latin}+',u'the quick рыжая лиса') regex.findall(ur'\p{Script=Cyrillic}+',u'the quick рыжая лиса') s = u"Käse" t = unicodedata.normalize('NFD',s) print repr(s) print repr(t) re.findall(ur"\w",s),re.findall(ur"\w",t) re.findall(ur"\w(?u)",s),re.findall(ur"\w(?u)",t) regex.findall(ur"\w",s),regex.findall(ur"\w",t) regex.findall(ur".",s),regex.findall(ur".",t) regex.findall(ur"\X",s),regex.findall(ur"\X",t) import pyparsing pyparsing.nestedExpr().parseString("(a (b c) d)").asList() import string from pyparsing import oneOf,Literal,Word,Optional,StringEnd greeting = oneOf("Hi Yo") + Optional(Literal(",")) + Word(string.uppercase,string.lowercase) + Optional(oneOf(". !")) + StringEnd() greeting.parseString("Hi, Peter!") greeting.parseString("Yo, DogZ.")