In [1]:
import os
In [2]:
# API credentials, read from the environment (never hardcoded).
# NOTE(review): twitter's value is a nested tuple — presumably
# (consumer key, consumer secret, (access token, access secret));
# confirm against the pattern.web Twitter docs.
creds = {
    'twitter': (os.environ['TW_API_KEY'], os.environ['TW_API_SEC'], (os.environ['TW_ACC_KEY'], os.environ['TW_ACC_SEC'])),
    'facebook': os.environ['FB_API_KEY'],
    'google': os.environ['GG_API_KEY']
}

Pattern

In [3]:
import pattern

What's in it?

  • pattern.web: data mining fanciness
  • pattern.db: "ORM"-ish
    • simpler interface, tabular data
    • csv handling
  • pattern.{en, es, de, fr, it, nl}: languages!
    • PoS-tagger
    • "sentiment analysis"
    • verb conjugation, pluralisation, etc.
  • pattern.search: search by syntax / semantics
  • pattern.vector: clustering, classification, etc.
  • pattern.graph: graph analysis!
  • pattern.metrics: grab-bag of useful tools for dealing with language

Is it any good?

Kinda. But it's not perfect.

Individual modules are obviously less flexible than dedicated single-purpose libraries

Nice collection of useful tools if you're dealing with anything related to linguistic processing

pattern.web

In [4]:
from pattern.web import Twitter, Facebook, DuckDuckGo, Google, Bing, Wikipedia, Wikia, Newsfeed
from pattern.web import SEARCH, NEWS, IMAGE, SPARQL, COMMENTS, LIKES
In [5]:
# One client object per search engine / service.
tw = Twitter(license=creds['twitter'])
fb = Facebook(license=creds['facebook'])
ggl = Google(license=creds['google'])
# None uses credentials shared across ALL pattern users
ddg = DuckDuckGo(license=None)
bing = Bing(license=None)
wp = Wikipedia(license=None)
wa = Wikia(license=None)
nf = Newsfeed(license=None) # RSS / atom

Twitter

In [6]:
tw.trends()
Out[6]:
[u'#HaftasonuRandevusuzOyVermekistiyorumYSK',
 u'#muratgogebakan',
 u'#ThankYouBestfriend',
 u'#amnesiaEP',
 u'#BamOfficialChartThisSunday',
 u'One Direction #BestFandom2014 Directioners',
 u'Justin Bieber #BestFandom2014 Beliebers',
 u'RIP Sir Bobby Robson',
 u'Anabel',
 u'Feliz Jueves']
In [7]:
twitter_result = tw.search('ananas', count=100)
twitter_result[0]
Out[7]:
Result({u'profile': u'http://pbs.twimg.com/profile_images/486028208689905664/r-vzT-qS_normal.jpeg', u'language': u'ru', u'author': u'shikari_ananas', u'url': u'https://twitter.com/shikari_ananas/status/494802120152207360', u'text': u'@nnnoda \u0432\u0441\u0435 \u0444\u0430\u043d\u044e\u0447\u043a\u0438 \u043d\u043e\u0434\u044b \u0442\u0430\u043a\u0438\u0435', u'date': u'Thu Jul 31 11:09:53 +0000 2014', u'id': u'494802120152207360'})
In [8]:
from collections import Counter

# One language code per tweet (as reported by Twitter), tallied below.
languages = [tweet.language for tweet in twitter_result]
Counter(languages)
Out[8]:
Counter({u'ru': 60, u'fr': 12, u'en': 7, u'pt': 6, u'tr': 4, u'de': 3, u'it': 3, u'nl': 1, u'vi': 1, u'da': 1, u'tl': 1, u'in': 1})
In [9]:
from pattern.web.locale import geocode

# NOTE(review): geocode() appears to return a tuple whose first two
# items are latitude/longitude (hence the [:2] slice passed as geo=);
# confirm in the pattern.web.locale docs.
in_result = tw.search('#EngvInd', count=50, geo=geocode('New Delhi')[:2])
en_result = tw.search('#EngvInd', count=50, geo=geocode('London')[:2])
In [10]:
from pattern.en import sentiment
In [11]:
reduce(lambda x, y: x+y, [sentiment(t.text)[0] for t in in_result]) / len(in_result)
Out[11]:
0.14291278166278154
In [12]:
reduce(lambda x, y: x+y, [sentiment(t.text)[0] for t in en_result]) / len(en_result)
Out[12]:
0.12380667249417247

Facebook

In [13]:
fb_result = fb.search('dragon age inquisition', type=SEARCH, count=100)
fb_result[0]
Out[13]:
Result(id=u'449022588487492_753024791420602')
In [14]:
# Find the first post with both likes and comments, then print who
# liked it and what the comments say.
for post in fb_result:

    if post.likes > 0 and post.comments > 0:
        # fb.search(post.id, type=LIKES/COMMENTS) fetches the likers /
        # comments of that post. NOTE(review): author looks like a
        # (id, name) pair — index 1 presumably the display name; confirm.
        print('\n\n'.join( [l.author[1] for l in fb.search(post.id, type=LIKES)] ))
        print('-' * 10)
        print('\n\n'.join( [c.text for c in fb.search(post.id, type=COMMENTS)] ))
        break
Ÿøüčēf Fōüīnÿ

Sawako Heiwajima

Rex Larbi Poochie

Issâm Phoneix

مول الاير ماكس
----------
ET GTA V sur PC ?? c'est pour quand ? :(

Google, Bing, DuckDuckGo

In [15]:
google_result = ggl.search('dragon age inquisition', count=10)
google_result[0]
Out[15]:
Result({u'url': u'http://www.dragonage.com/', u'text': u'Beautiful vistas and incredible new possibilities await you in the latest game in <br>\nthe epic role-playing series from BioWare \u2013 <b>Dragon Age</b>: <b>Inquisition</b>.', u'date': u'', u'language': u'', u'title': u'Dragon Age: Inquisition'})

Google Translate

In [16]:
lang_id = [ggl.identify(res.text) for res in fb_result[:10]]
lang_id
Out[16]:
[(u'en', 0.9998242),
 (u'en', 0.120250024),
 (u'en', 0.6363636),
 (u'en', 0.033820875),
 (u'en', 0.2889652),
 (u'en', 0.6282051)]
In [17]:
ggl.translate(fb_result[0].text, input=lang_id[0][0], output='de')
Out[17]:
u'Dragon Age: Inquisition Kampf Anh\xe4nger burninates dem Land einen neuen Trailer zu Dragon Age: Inquisition nicht bieten neue Informationen \xfcber das Spiel der Kampf, aber es ist ermutigend zu BioWare Fokus auf der Ebene der Strategie, die von der Echtzeit-mit-Pause zu sehen System. - http://www.rheena.com/reviews-games-chat-hardware-sony/212236-dragon-age-inquisition-combat-trailer-burninates-countryside.html'

Finding emails & links

In [18]:
from pattern.web import find_urls, find_email

# Sample text: one URL + one email, plus a second copy of the URL with
# a trailing full stop ("decoy") — the next cell shows unique=True
# collapses the duplicates.
s = '''
Find out more at the PUGS website (http://pugs.org.sg), or email us at [email protected]!
This is a decoy URL http://pugs.org.sg.
'''
In [19]:
find_urls(s, unique=True), \
find_email(s, unique=True)
Out[19]:
([u'http://pugs.org.sg'], [u'[email protected]'])

Crowdsourced "useful" sorting

In [20]:
from pattern.web import sort, GOOGLE

# Nationalities (plus 'politician' as a control) to rank by how often
# Google finds each one next to the word 'dangerous'.
terms = [
    'french',
    'german',
    'japanese',
    'chinese',
    'persian',
    'hun',
    'american',
    'russian',
    'swede',
    'polish',
    'singaporean',
    'politician',
]
# prefix=True puts the context word BEFORE each term ("dangerous french");
# weights come from Google result counts, hence the google license.
sort_result = sort(terms=terms, context='dangerous', prefix=True, service=GOOGLE, license=creds['google'])
In [21]:
for weight, term in sort_result:
    print "%.2f" % (weight * 100) + '%', term
28.50% "dangerous american"
21.18% "dangerous russian"
11.85% "dangerous japanese"
11.22% "dangerous chinese"
10.10% "dangerous politician"
8.99% "dangerous german"
6.00% "dangerous french"
1.12% "dangerous polish"
0.53% "dangerous hun"
0.33% "dangerous persian"
0.17% "dangerous swede"
0.01% "dangerous singaporean"

Other miscellaneous functionality

  • DOM parser
  • HTML -> plaintext
  • PDF -> plaintext
  • Crawler
  • IMAP (!!!)

pattern.en

Manipulation

In [22]:
from pattern.en import article, referenced

article('harbour'), \
referenced('umbrella')
Out[22]:
('a', 'an umbrella')
In [23]:
from pattern.en import (pluralize as pluralise, singularize as singularise)
In [24]:
pluralise('octopus')
Out[24]:
'octopodes'
In [25]:
pluralise('octopus', classical=False)
Out[25]:
'octopuses'
In [26]:
pluralise('I'), \
pluralise('my'), \
pluralise('her')
Out[26]:
('we', 'our', 'their')
In [27]:
singularise('bacteria')
Out[27]:
'bacterium'
In [28]:
pluralise('virus')
Out[28]:
'viruss'
In [29]:
singularise('viruses'), \
singularise('virii'), \
singularise('virus')
Out[29]:
('viruse', 'virius', 'viru')
In [30]:
singularise('viri')
Out[30]:
'virus'
In [31]:
from pattern.en import comparative, superlative
In [32]:
'python is %s than ruby!' % comparative('good')
Out[32]:
'python is better than ruby!'
In [33]:
'iPython is %s python shell' % (
referenced(superlative('ideal'), article=pattern.en.DEFINITE))
Out[33]:
'iPython is the most ideal python shell'
In [34]:
from pattern.en import conjugate, lemma, lexeme, tenses
In [35]:
lemma('are')
Out[35]:
u'be'
In [36]:
tenses('be'), \
tenses('were')
Out[36]:
([('infinitive', None, None, None, None),
  ('present', None, 'plural', 'indicative', 'imperfective'),
  ('present', 1, 'plural', 'indicative', 'imperfective'),
  ('present', 1, 'singular', 'indicative', 'imperfective'),
  ('present', 2, 'plural', 'indicative', 'imperfective'),
  ('present', 2, 'singular', 'indicative', 'imperfective'),
  ('present', 3, 'plural', 'indicative', 'imperfective'),
  ('present', 3, 'singular', 'indicative', 'imperfective')],
 [('past', None, None, 'indicative', 'imperfective'),
  ('past', None, 'plural', 'indicative', 'imperfective'),
  ('past', 1, 'plural', 'indicative', 'imperfective'),
  ('past', 1, 'singular', 'indicative', 'imperfective'),
  ('past', 2, 'plural', 'indicative', 'imperfective'),
  ('past', 2, 'singular', 'indicative', 'imperfective'),
  ('past', 3, 'plural', 'indicative', 'imperfective'),
  ('past', 3, 'singular', 'indicative', 'imperfective')])
In [37]:
lexeme('be')
Out[37]:
[u'be',
 u'am',
 u'are',
 u'is',
 u'being',
 u'was',
 u'were',
 u'been',
 u'am not',
 u"aren't",
 u"isn't",
 u"wasn't",
 u"weren't"]
In [38]:
conjugate('nibble', '1sgp'), \
conjugate('nibble', '3sg')
Out[38]:
(u'nibbled', u'nibbles')
In [39]:
conjugate('google', tense=pattern.en.PARTICIPLE, parse=False), \
conjugate('google', tense=pattern.en.PARTICIPLE, parse=True)
Out[39]:
(None, 'googling')

Numbers!

In [40]:
from pattern.en import number

number('five thousand six hundred and eighty nine')
Out[40]:
5689
In [41]:
from pattern.en import numerals

numerals('42.128', round=2)
Out[41]:
'forty-two point thirteen'
In [42]:
# Sample data for quantify(): a repetitive flock and a word count dict.
animals = ['goose'] * 2 + ['duck'] + ['chicken'] * 3
orangebirds = dict(carrot=100, parrot=5, orange=20)
In [43]:
from pattern.en import quantify

quantify(animals), \
quantify(orangebirds)
Out[43]:
('several chickens, a pair of geese and a duck',
 'dozens of carrots, a score of oranges and several parrots')

Autocorrect… ish

In [44]:
from pattern.en import suggest

suggest('psuh')
Out[44]:
[('push', 1.0)]
In [45]:
suggest('carot')
Out[45]:
[('cart', 0.9032258064516129),
 ('cabot', 0.04838709677419355),
 ('carrot', 0.03225806451612903),
 ('caret', 0.016129032258064516)]

n-grams

In [46]:
from pattern.en import ngrams
In [47]:
ngrams('This is a sentence', n=2)
Out[47]:
[('This', 'is'), ('is', 'a'), ('a', 'sentence')]
In [48]:
ngrams('This is a sentence', n=3)
Out[48]:
[('This', 'is', 'a'), ('is', 'a', 'sentence')]

Parsing

In [49]:
# Two test sentences: a short one for quick tagging demos, and a longer
# one with quotes and a colon to give the parser something harder.
short_s = 'When I saw the prices for some of those apartments, I was startled.'
long_s = 'Personally, I think the only unassailable definition is the one often attributed to the great editor John W Campbell: "Science fiction is what I say it is."'
In [50]:
from pattern.en import tag

# Print only the nouns: Penn Treebank noun tags all start with 'NN'
# (NN, NNS, NNP, NNPS).
for token, postag in tag(short_s):
    if postag.startswith('NN'):
        print(token)
prices
apartments
In [51]:
from pattern.en import parse

parsed = parse(short_s, tokenize=True, tags=True, chunks=True, relations=True, lemmata=True)
parsed
Out[51]:
u'When/WRB/O/O/O/when I/PRP/B-NP/O/NP-SBJ-1/i saw/VBD/B-VP/O/VP-1/see the/DT/B-NP/O/NP-OBJ-1/the prices/NNS/I-NP/O/NP-OBJ-1/price for/IN/B-PP/O/O/for some/DT/O/O/O/some of/IN/B-PP/B-PNP/O/of those/DT/B-NP/I-PNP/O/those apartments/NNS/I-NP/I-PNP/O/apartment ,/,/O/O/O/, I/PRP/B-NP/O/NP-SBJ-2/i was/VBD/B-VP/O/VP-2/be startled/VBN/I-VP/O/VP-2/startle ././O/O/O/.'
In [52]:
from pattern.en import pprint

pprint(parsed)
          WORD   TAG    CHUNK   ROLE   ID     PNP    LEMMA       
                                                                 
          When   WRB    -       -      -      -      when        
             I   PRP    NP      SBJ    1      -      i           
           saw   VBD    VP      -      1      -      see         
           the   DT     NP      OBJ    1      -      the         
        prices   NNS    NP ^    OBJ    1      -      price       
           for   IN     PP      -      -      -      for         
          some   DT     -       -      -      -      some        
            of   IN     PP      -      -      PNP    of          
         those   DT     NP      -      -      PNP    those       
    apartments   NNS    NP ^    -      -      PNP    apartment   
             ,   ,      -       -      -      -      ,           
             I   PRP    NP      SBJ    2      -      i           
           was   VBD    VP      -      2      -      be          
      startled   VBN    VP ^    -      2      -      startle     
             .   .      -       -      -      -      .           
In [53]:
from pattern.en import parsetree

parsetree(short_s, tokenize=True, tags=True, chunks=True, relations=True, lemmata=True)
Out[53]:
[Sentence('When/WRB/O/O/O/when I/PRP/B-NP/O/NP-SBJ-1/i saw/VBD/B-VP/O/VP-1/see the/DT/B-NP/O/NP-OBJ-1/the prices/NNS/I-NP/O/NP-OBJ-1/price for/IN/B-PP/O/O/for some/DT/O/O/O/some of/IN/B-PP/B-PNP/O/of those/DT/B-NP/I-PNP/O/those apartments/NNS/I-NP/I-PNP/O/apartment ,/,/O/O/O/, I/PRP/B-NP/O/NP-SBJ-2/i was/VBD/B-VP/O/VP-2/be startled/VBN/I-VP/O/VP-2/startle ././O/O/O/.')]
In [54]:
tr = parsetree(long_s, tokenize=True, tags=True, chunks=True, relations=True, lemmata=True)

print(type(tr))
<class 'pattern.text.tree.Text'>
In [55]:
[type(item) for item in tr]
Out[55]:
[pattern.text.tree.Sentence, pattern.text.tree.Sentence]
In [56]:
type(tr[0][0])
Out[56]:
pattern.text.tree.Word
In [57]:
# Walk every sentence of the tree and dump each chunk's type together
# with its (word, PoS-tag) pairs.
for sent in tr:
    for ch in sent.chunks:
        print(ch.type, [(w.string, w.type) for w in ch.words])
(u'ADVP', [(u'Personally', u'RB')])
(u'NP', [(u'I', u'PRP')])
(u'VP', [(u'think', u'VBP')])
(u'NP', [(u'the', u'DT'), (u'only', u'JJ'), (u'unassailable', u'JJ'), (u'definition', u'NN')])
(u'VP', [(u'is', u'VBZ')])
(u'VP', [(u'often', u'RB'), (u'attributed', u'VBN')])
(u'NP', [(u'the', u'DT'), (u'great', u'JJ'), (u'editor', u'NN'), (u'John', u'NNP'), (u'W', u'NNP'), (u'Campbell', u'NNP')])
(u'NP', [(u'Science', u'NN'), (u'fiction', u'NN')])
(u'VP', [(u'is', u'VBZ')])
(u'NP', [(u'I', u'PRP')])
(u'VP', [(u'say', u'VBP')])
(u'NP', [(u'it', u'PRP')])
(u'VP', [(u'is', u'VBZ')])

"Sentiment analysis"

In [58]:
from pattern.en import sentiment
In [59]:
sg_result = tw.search('causeway', count=50, geo=geocode('Singapore')[:2])
my_result = tw.search('causeway', count=50, geo=geocode('Kuala Lumpur')[:2])
In [60]:
# Per-tweet polarity (sentiment()[0], as in the earlier cells), sorted
# so the next cell plots a monotone curve per city.
sg_sentiment = sorted(sentiment(tweet.text)[0] for tweet in sg_result)
my_sentiment = sorted(sentiment(tweet.text)[0] for tweet in my_result)

# Mean polarity per city.
sg_avg = sum(sg_sentiment) / len(sg_sentiment)
my_avg = sum(my_sentiment) / len(my_sentiment)

sg_avg, my_avg
Out[60]:
(0.079808080808080814, -0.24100000000000008)
In [61]:
# NOTE(review): linspace/plot/grid/legend (and Python 2 xrange) rely on
# a %pylab-style star import not visible in this transcript — confirm
# the kernel was started with pylab, otherwise these names are undefined.
x = linspace(0, 50)
plot(sg_sentiment, 'r-', label='sg')
# Dashed horizontal lines mark each city's mean polarity.
plot(x, [sg_avg for i in xrange(0, 50)], 'r--', label='sg_avg')
plot(my_sentiment, 'g-', label='my')
plot(x, [my_avg for i in xrange(0, 50)], 'g--', label='my_avg')
grid(b=True, which='both')
legend(loc='best')
Out[61]:
<matplotlib.legend.Legend at 0x7f0cedfa2650>

WordNet

In [62]:
from pattern.en import wordnet
In [63]:
birds = wordnet.synsets('bird')
birds
Out[63]:
[Synset(u'bird'),
 Synset(u'bird'),
 Synset(u'dame'),
 Synset(u'boo'),
 Synset(u'shuttlecock')]
In [64]:
bird = birds[0]

'Definition', bird.gloss
Out[64]:
('Definition',
 u'warm-blooded egg-laying vertebrates characterized by feathers and forelimbs modified as wings')
In [65]:
'- Synonyms', bird.synonyms
Out[65]:
('- Synonyms', [u'bird'])
In [66]:
'^ Hypernyms', bird.hypernyms()
Out[66]:
('^ Hypernyms', [Synset(u'vertebrate')])
In [67]:
'v Hyponyms', bird.hyponyms()
Out[67]:
('v Hyponyms',
 [Synset(u'dickeybird'),
  Synset(u'cock'),
  Synset(u'hen'),
  Synset(u'nester'),
  Synset(u'night bird'),
  Synset(u'bird of passage'),
  Synset(u'protoavis'),
  Synset(u'archaeopteryx'),
  Synset(u'Sinornis'),
  Synset(u'Ibero-mesornis'),
  Synset(u'archaeornis'),
  Synset(u'ratite'),
  Synset(u'carinate'),
  Synset(u'passerine'),
  Synset(u'nonpasserine bird'),
  Synset(u'bird of prey'),
  Synset(u'gallinaceous bird'),
  Synset(u'parrot'),
  Synset(u'cuculiform bird'),
  Synset(u'coraciiform bird'),
  Synset(u'apodiform bird'),
  Synset(u'caprimulgiform bird'),
  Synset(u'piciform bird'),
  Synset(u'trogon'),
  Synset(u'aquatic bird'),
  Synset(u'twitterer')])
In [68]:
'^ Holonyms', bird.holonyms()
Out[68]:
('^ Holonyms', [Synset(u'Aves'), Synset(u'flock')])
In [69]:
'v Meronyms', bird.meronyms()
Out[69]:
('v Meronyms',
 [Synset(u'beak'),
  Synset(u'furcula'),
  Synset(u'feather'),
  Synset(u'wing'),
  Synset(u'pennon'),
  Synset(u"bird's foot"),
  Synset(u'uropygium'),
  Synset(u'hindquarters'),
  Synset(u'air sac'),
  Synset(u'uropygial gland'),
  Synset(u'syrinx'),
  Synset(u'bird')])
In [70]:
wordnet.synsets('owl')[0].holonyms(), wordnet.synsets('amoeba')[0].holonyms()
Out[70]:
([Synset(u'Strigiformes')], [Synset(u'Amoebida')])
In [71]:
kitty = wordnet.synsets('kitten')[0]
pup = wordnet.synsets('puppy')[0]

# Lowest common ancestor of the two synsets in the hypernym hierarchy.
wordnet.ancestor(kitty, pup)
Out[71]:
Synset(u'young mammal')
In [72]:
human = wordnet.synsets('human')[0]
cuy = wordnet.synsets('guinea pig')[0]

# NOTE(review): presumably a taxonomy-distance score where higher means
# semantically closer — confirm the metric in the pattern wordnet docs.
wordnet.similarity(human, cuy), \
wordnet.similarity(human, kitty)
Out[72]:
(0.26171115255795596, 0.5710216026393958)

pattern.search

In [73]:
s = 'the fluffy brown bunnies hopped across the wet grass with much gusto.'
In [74]:
from pattern.search import search

search('NP', parsetree(s))
Out[74]:
[Match(words=[Word(u'the/DT'), Word(u'fluffy/JJ'), Word(u'brown/JJ'), Word(u'bunnies/NNS')]),
 Match(words=[Word(u'the/DT'), Word(u'wet/JJ'), Word(u'grass/NN')]),
 Match(words=[Word(u'much/JJ'), Word(u'gusto/NN')])]
In [75]:
from pattern.search import taxonomy

# Register a custom 'animal' category; search() can then match the
# taxonomy name (written in uppercase) instead of a PoS/chunk tag.
for creature in ('bunny', 'dog', 'cat', 'banana'):
    taxonomy.append(creature, type='animal')

search('ANIMAL', parsetree(s, lemmata=True))
Out[75]:
[Match(words=[Word(u'the/DT'), Word(u'fluffy/JJ'), Word(u'brown/JJ'), Word(u'bunnies/NNS')])]
In [76]:
from pattern.search import Pattern

# Group 0 is the whole match; groups 1-3 are the three {...} captures.
pat = Pattern.fromstring('{JJ} {ANIMAL} {VP}')
match = pat.match(parsetree(s, lemmata=True))
for group_idx in range(4):
    print(match.group(group_idx))
[Word(u'fluffy/JJ'), Word(u'brown/JJ'), Word(u'bunnies/NNS'), Word(u'hopped/VBD')]
[Word(u'fluffy/JJ')]
[Word(u'brown/JJ'), Word(u'bunnies/NNS')]
[Word(u'hopped/VBD')]

fin

more?