In [1]:
import os
In [2]:
# API credentials, read from the environment (never hardcoded).
# NOTE(review): twitter's value is a nested tuple — presumably
# (consumer key, consumer secret, (access token, access secret));
# confirm against the pattern.web Twitter docs.
creds = {
    'twitter': (os.environ['TW_API_KEY'], os.environ['TW_API_SEC'], (os.environ['TW_ACC_KEY'], os.environ['TW_ACC_SEC'])),
    'facebook': os.environ['FB_API_KEY'],
    'google': os.environ['GG_API_KEY']
}

Pattern

In [3]:
import pattern

What's in it?

  • pattern.web: data mining fanciness
  • pattern.db: "ORM"-ish
    • simpler interface, tabular data
    • csv handling
  • pattern.{en, es, de, fr, it, nl}: languages!
    • PoS-tagger
    • "sentiment analysis"
    • verb conjugation, pluralisation, etc.
  • pattern.search: search by syntax / semantics
  • pattern.vector: clustering, classification, etc.
  • pattern.graph: graph analysis!
  • pattern.metrics: grab-bag of useful tools for dealing with language

Is it any good?

Kinda. But it's not perfect.

Individual modules are obviously less flexible than dedicated single-purpose libraries

Nice collection of useful tools if you're dealing with anything related to linguistic processing

pattern.web

In [4]:
from pattern.web import Twitter, Facebook, DuckDuckGo, Google, Bing, Wikipedia, Wikia, Newsfeed
from pattern.web import SEARCH, NEWS, IMAGE, SPARQL, COMMENTS, LIKES
In [5]:
# One client object per search engine / service.
tw = Twitter(license=creds['twitter'])
fb = Facebook(license=creds['facebook'])
ggl = Google(license=creds['google'])
# None uses credentials shared across ALL pattern users
ddg = DuckDuckGo(license=None)
bing = Bing(license=None)
wp = Wikipedia(license=None)
wa = Wikia(license=None)
nf = Newsfeed(license=None) # RSS / atom

Twitter

In [6]:
tw.trends()
Out[6]:
[u'#HaftasonuRandevusuzOyVermekistiyorumYSK',
 u'#muratgogebakan',
 u'#ThankYouBestfriend',
 u'#amnesiaEP',
 u'#BamOfficialChartThisSunday',
 u'One Direction #BestFandom2014 Directioners',
 u'Justin Bieber #BestFandom2014 Beliebers',
 u'RIP Sir Bobby Robson',
 u'Anabel',
 u'Feliz Jueves']
In [7]:
twitter_result = tw.search('ananas', count=100)
twitter_result[0]
Out[7]:
Result({u'profile': u'http://pbs.twimg.com/profile_images/486028208689905664/r-vzT-qS_normal.jpeg', u'language': u'ru', u'author': u'shikari_ananas', u'url': u'https://twitter.com/shikari_ananas/status/494802120152207360', u'text': u'@nnnoda \u0432\u0441\u0435 \u0444\u0430\u043d\u044e\u0447\u043a\u0438 \u043d\u043e\u0434\u044b \u0442\u0430\u043a\u0438\u0435', u'date': u'Thu Jul 31 11:09:53 +0000 2014', u'id': u'494802120152207360'})
In [8]:
from collections import Counter

# One language code per tweet (as reported by Twitter), tallied below.
languages = [tweet.language for tweet in twitter_result]
Counter(languages)
Out[8]:
Counter({u'ru': 60, u'fr': 12, u'en': 7, u'pt': 6, u'tr': 4, u'de': 3, u'it': 3, u'nl': 1, u'vi': 1, u'da': 1, u'tl': 1, u'in': 1})
In [9]:
from pattern.web.locale import geocode

# NOTE(review): geocode() appears to return a tuple whose first two
# items are latitude/longitude (hence the [:2] slice passed as geo=);
# confirm in the pattern.web.locale docs.
in_result = tw.search('#EngvInd', count=50, geo=geocode('New Delhi')[:2])
en_result = tw.search('#EngvInd', count=50, geo=geocode('London')[:2])
In [10]:
from pattern.en import sentiment
In [11]:
reduce(lambda x, y: x+y, [sentiment(t.text)[0] for t in in_result]) / len(in_result)
Out[11]:
0.14291278166278154
In [12]:
reduce(lambda x, y: x+y, [sentiment(t.text)[0] for t in en_result]) / len(en_result)
Out[12]:
0.12380667249417247

Facebook

In [13]:
fb_result = fb.search('dragon age inquisition', type=SEARCH, count=100)
fb_result[0]
Out[13]:
Result(id=u'449022588487492_753024791420602')
In [14]:
# Find the first post with both likes and comments, then print who
# liked it and what the comments say.
for post in fb_result:

    if post.likes > 0 and post.comments > 0:
        # fb.search(post.id, type=LIKES/COMMENTS) fetches the likers /
        # comments of that post. NOTE(review): author looks like a
        # (id, name) pair — index 1 presumably the display name; confirm.
        print('\n\n'.join( [l.author[1] for l in fb.search(post.id, type=LIKES)] ))
        print('-' * 10)
        print('\n\n'.join( [c.text for c in fb.search(post.id, type=COMMENTS)] ))
        break
Ÿøüčēf Fōüīnÿ

Sawako Heiwajima

Rex Larbi Poochie

Issâm Phoneix

مول الاير ماكس
----------
ET GTA V sur PC ?? c'est pour quand ? :(

Google, Bing, DuckDuckGo

In [15]:
google_result = ggl.search('dragon age inquisition', count=10)
google_result[0]
Out[15]:
Result({u'url': u'http://www.dragonage.com/', u'text': u'Beautiful vistas and incredible new possibilities await you in the latest game in <br>\nthe epic role-playing series from BioWare \u2013 <b>Dragon Age</b>: <b>Inquisition</b>.', u'date': u'', u'language': u'', u'title': u'Dragon Age: Inquisition'})

Google Translate

In [16]:
lang_id = [ggl.identify(res.text) for res in fb_result[:10]]
lang_id
Out[16]:
[(u'en', 0.9998242),
 (u'en', 0.120250024),
 (u'en', 0.6363636),
 (u'en', 0.033820875),
 (u'en', 0.2889652),
 (u'en', 0.6282051)]
In [17]:
ggl.translate(fb_result[0].text, input=lang_id[0][0], output='de')
Out[17]:
u'Dragon Age: Inquisition Kampf Anh\xe4nger burninates dem Land einen neuen Trailer zu Dragon Age: Inquisition nicht bieten neue Informationen \xfcber das Spiel der Kampf, aber es ist ermutigend zu BioWare Fokus auf der Ebene der Strategie, die von der Echtzeit-mit-Pause zu sehen System. - http://www.rheena.com/reviews-games-chat-hardware-sony/212236-dragon-age-inquisition-combat-trailer-burninates-countryside.html'

Finding emails & links

In [18]:
from pattern.web import find_urls, find_email

# Sample text: one URL + one email, plus a second copy of the URL with
# a trailing full stop ("decoy") — the next cell shows unique=True
# collapses the duplicates.
s = '''
Find out more at the PUGS website (http://pugs.org.sg), or email us at [email protected]!
This is a decoy URL http://pugs.org.sg.
'''
In [19]:
find_urls(s, unique=True), \
find_email(s, unique=True)
Out[19]:
([u'http://pugs.org.sg'], [u'[email protected]'])

Crowdsourced "useful" sorting

In [20]:
from pattern.web import sort, GOOGLE

# Nationalities (plus 'politician' as a control) to rank by how often
# Google finds each one next to the word 'dangerous'.
terms = [
    'french',
    'german',
    'japanese',
    'chinese',
    'persian',
    'hun',
    'american',
    'russian',
    'swede',
    'polish',
    'singaporean',
    'politician',
]
# prefix=True puts the context word BEFORE each term ("dangerous french");
# weights come from Google result counts, hence the google license.
sort_result = sort(terms=terms, context='dangerous', prefix=True, service=GOOGLE, license=creds['google'])
In [21]:
for weight, term in sort_result:
    print "%.2f" % (weight * 100) + '%', term
28.50% "dangerous american"
21.18% "dangerous russian"
11.85% "dangerous japanese"
11.22% "dangerous chinese"
10.10% "dangerous politician"
8.99% "dangerous german"
6.00% "dangerous french"
1.12% "dangerous polish"
0.53% "dangerous hun"
0.33% "dangerous persian"
0.17% "dangerous swede"
0.01% "dangerous singaporean"

Other miscellaneous functionality

  • DOM parser
  • HTML -> plaintext
  • PDF -> plaintext
  • Crawler
  • IMAP (!!!)

pattern.en

Manipulation

In [22]:
from pattern.en import article, referenced

article('harbour'), \
referenced('umbrella')
Out[22]:
('a', 'an umbrella')
In [23]:
from pattern.en import (pluralize as pluralise, singularize as singularise)
In [24]:
pluralise('octopus')
Out[24]:
'octopodes'
In [25]:
pluralise('octopus', classical=False)
Out[25]:
'octopuses'
In [26]:
pluralise('I'), \
pluralise('my'), \
pluralise('her')
Out[26]:
('we', 'our', 'their')
In [27]:
singularise('bacteria')
Out[27]:
'bacterium'
In [28]:
pluralise('virus')
Out[28]:
'viruss'
In [29]:
singularise('viruses'), \
singularise('virii'), \
singularise('virus')
Out[29]:
('viruse', 'virius', 'viru')
In [30]:
singularise('viri')
Out[30]:
'virus'
In [31]:
from pattern.en import comparative, superlative
In [32]:
'python is %s than ruby!' % comparative('good')
Out[32]:
'python is better than ruby!'
In [33]:
'iPython is %s python shell' % (
referenced(superlative('ideal'), article=pattern.en.DEFINITE))
Out[33]:
'iPython is the most ideal python shell'
In [34]:
from pattern.en import conjugate, lemma, lexeme, tenses
In [35]:
lemma('are')
Out[35]:
u'be'
In [36]:
tenses('be'), \
tenses('were')
Out[36]:
([('infinitive', None, None, None, None),
  ('present', None, 'plural', 'indicative', 'imperfective'),
  ('present', 1, 'plural', 'indicative', 'imperfective'),
  ('present', 1, 'singular', 'indicative', 'imperfective'),
  ('present', 2, 'plural', 'indicative', 'imperfective'),
  ('present', 2, 'singular', 'indicative', 'imperfective'),
  ('present', 3, 'plural', 'indicative', 'imperfective'),
  ('present', 3, 'singular', 'indicative', 'imperfective')],
 [('past', None, None, 'indicative', 'imperfective'),
  ('past', None, 'plural', 'indicative', 'imperfective'),
  ('past', 1, 'plural', 'indicative', 'imperfective'),
  ('past', 1, 'singular', 'indicative', 'imperfective'),
  ('past', 2, 'plural', 'indicative', 'imperfective'),
  ('past', 2, 'singular', 'indicative', 'imperfective'),
  ('past', 3, 'plural', 'indicative', 'imperfective'),
  ('past', 3, 'singular', 'indicative', 'imperfective')])
In [37]:
lexeme('be')
Out[37]:
[u'be',
 u'am',
 u'are',
 u'is',
 u'being',
 u'was',
 u'were',
 u'been',
 u'am not',
 u"aren't",
 u"isn't",
 u"wasn't",
 u"weren't"]
In [38]:
conjugate('nibble', '1sgp'), \
conjugate('nibble', '3sg')
Out[38]:
(u'nibbled', u'nibbles')
In [39]:
conjugate('google', tense=pattern.en.PARTICIPLE, parse=False), \
conjugate('google', tense=pattern.en.PARTICIPLE, parse=True)
Out[39]:
(None, 'googling')

Numbers!

In [40]:
from pattern.en import number

number('five thousand six hundred and eighty nine')
Out[40]:
5689
In [41]:
from pattern.en import numerals

numerals('42.128', round=2)
Out[41]:
'forty-two point thirteen'
In [42]:
# Sample data for quantify(): a repetitive flock and a word count dict.
animals = ['goose'] * 2 + ['duck'] + ['chicken'] * 3
orangebirds = dict(carrot=100, parrot=5, orange=20)
In [43]:
from pattern.en import quantify

quantify(animals), \
quantify(orangebirds)
Out[43]:
('several chickens, a pair of geese and a duck',
 'dozens of carrots, a score of oranges and several parrots')

Autocorrect… ish

In [44]:
from pattern.en import suggest

suggest('psuh')
Out[44]:
[('push', 1.0)]
In [45]:
suggest('carot')
Out[45]:
[('cart', 0.9032258064516129),
 ('cabot', 0.04838709677419355),
 ('carrot', 0.03225806451612903),
 ('caret', 0.016129032258064516)]

n-grams

In [46]:
from pattern.en import ngrams
In [47]:
ngrams('This is a sentence', n=2)
Out[47]:
[('This', 'is'), ('is', 'a'), ('a', 'sentence')]
In [48]:
ngrams('This is a sentence', n=3)
Out[48]:
[('This', 'is', 'a'), ('is', 'a', 'sentence')]

Parsing

In [49]:
# Two test sentences: a short one for quick tagging demos, and a longer
# one with quotes and a colon to give the parser something harder.
short_s = 'When I saw the prices for some of those apartments, I was startled.'
long_s = 'Personally, I think the only unassailable definition is the one often attributed to the great editor John W Campbell: "Science fiction is what I say it is."'
In [50]:
from pattern.en import tag

# Print only the nouns: Penn Treebank noun tags all start with 'NN'
# (NN, NNS, NNP, NNPS).
for token, postag in tag(short_s):
    if postag.startswith('NN'):
        print(token)
prices
apartments
In [51]:
from pattern.en import parse

parsed = parse(short_s, tokenize=True, tags=True, chunks=True, relations=True, lemmata=True)
parsed
Out[51]:
u'When/WRB/O/O/O/when I/PRP/B-NP/O/NP-SBJ-1/i saw/VBD/B-VP/O/VP-1/see the/DT/B-NP/O/NP-OBJ-1/the prices/NNS/I-NP/O/NP-OBJ-1/price for/IN/B-PP/O/O/for some/DT/O/O/O/some of/IN/B-PP/B-PNP/O/of those/DT/B-NP/I-PNP/O/those apartments/NNS/I-NP/I-PNP/O/apartment ,/,/O/O/O/, I/PRP/B-NP/O/NP-SBJ-2/i was/VBD/B-VP/O/VP-2/be startled/VBN/I-VP/O/VP-2/startle ././O/O/O/.'
In [52]:
from pattern.en import pprint

pprint(parsed)
          WORD   TAG    CHUNK   ROLE   ID     PNP    LEMMA       
                                                                 
          When   WRB    -       -      -      -      when        
             I   PRP    NP      SBJ    1      -      i           
           saw   VBD    VP      -      1      -      see         
           the   DT     NP      OBJ    1      -      the         
        prices   NNS    NP ^    OBJ    1      -      price       
           for   IN     PP      -      -      -      for         
          some   DT     -       -      -      -      some        
            of   IN     PP      -      -      PNP    of          
         those   DT     NP      -      -      PNP    those       
    apartments   NNS    NP ^    -      -      PNP    apartment   
             ,   ,      -       -      -      -      ,           
             I   PRP    NP      SBJ    2      -      i           
           was   VBD    VP      -      2      -      be          
      startled   VBN    VP ^    -      2      -      startle     
             .   .      -       -      -      -      .           
In [53]:
from pattern.en import parsetree

parsetree(short_s, tokenize=True, tags=True, chunks=True, relations=True, lemmata=True)
Out[53]:
[Sentence('When/WRB/O/O/O/when I/PRP/B-NP/O/NP-SBJ-1/i saw/VBD/B-VP/O/VP-1/see the/DT/B-NP/O/NP-OBJ-1/the prices/NNS/I-NP/O/NP-OBJ-1/price for/IN/B-PP/O/O/for some/DT/O/O/O/some of/IN/B-PP/B-PNP/O/of those/DT/B-NP/I-PNP/O/those apartments/NNS/I-NP/I-PNP/O/apartment ,/,/O/O/O/, I/PRP/B-NP/O/NP-SBJ-2/i was/VBD/B-VP/O/VP-2/be startled/VBN/I-VP/O/VP-2/startle ././O/O/O/.')]
In [54]:
tr = parsetree(long_s, tokenize=True, tags=True, chunks=True, relations=True, lemmata=True)

print(type(tr))
<class 'pattern.text.tree.Text'>
In [55]:
[type(item) for item in tr]
Out[55]:
[pattern.text.tree.Sentence, pattern.text.tree.Sentence]
In [56]:
type(tr[0][0])
Out[56]:
pattern.text.tree.Word
In [57]:
# Walk every sentence of the tree and dump each chunk's type together
# with its (word, PoS-tag) pairs.
for sent in tr:
    for ch in sent.chunks:
        print(ch.type, [(w.string, w.type) for w in ch.words])
(u'ADVP', [(u'Personally', u'RB')])
(u'NP', [(u'I', u'PRP')])
(u'VP', [(u'think', u'VBP')])
(u'NP', [(u'the', u'DT'), (u'only', u'JJ'), (u'unassailable', u'JJ'), (u'definition', u'NN')])
(u'VP', [(u'is', u'VBZ')])
(u'VP', [(u'often', u'RB'), (u'attributed', u'VBN')])
(u'NP', [(u'the', u'DT'), (u'great', u'JJ'), (u'editor', u'NN'), (u'John', u'NNP'), (u'W', u'NNP'), (u'Campbell', u'NNP')])
(u'NP', [(u'Science', u'NN'), (u'fiction', u'NN')])
(u'VP', [(u'is', u'VBZ')])
(u'NP', [(u'I', u'PRP')])
(u'VP', [(u'say', u'VBP')])
(u'NP', [(u'it', u'PRP')])
(u'VP', [(u'is', u'VBZ')])

"Sentiment analysis"

In [58]:
from pattern.en import sentiment
In [59]:
sg_result = tw.search('causeway', count=50, geo=geocode('Singapore')[:2])
my_result = tw.search('causeway', count=50, geo=geocode('Kuala Lumpur')[:2])
In [60]:
# Per-tweet polarity (sentiment()[0], as in the earlier cells), sorted
# so the next cell plots a monotone curve per city.
sg_sentiment = sorted(sentiment(tweet.text)[0] for tweet in sg_result)
my_sentiment = sorted(sentiment(tweet.text)[0] for tweet in my_result)

# Mean polarity per city.
sg_avg = sum(sg_sentiment) / len(sg_sentiment)
my_avg = sum(my_sentiment) / len(my_sentiment)

sg_avg, my_avg
Out[60]:
(0.079808080808080814, -0.24100000000000008)
In [61]:
# NOTE(review): linspace/plot/grid/legend (and Python 2 xrange) rely on
# a %pylab-style star import not visible in this transcript — confirm
# the kernel was started with pylab, otherwise these names are undefined.
x = linspace(0, 50)
plot(sg_sentiment, 'r-', label='sg')
# Dashed horizontal lines mark each city's mean polarity.
plot(x, [sg_avg for i in xrange(0, 50)], 'r--', label='sg_avg')
plot(my_sentiment, 'g-', label='my')
plot(x, [my_avg for i in xrange(0, 50)], 'g--', label='my_avg')
grid(b=True, which='both')
legend(loc='best')
Out[61]:
<matplotlib.legend.Legend at 0x7f0cedfa2650>

WordNet

In [62]:
from pattern.en import wordnet
In [63]:
birds = wordnet.synsets('bird')
birds
Out[63]:
[Synset(u'bird'),
 Synset(u'bird'),
 Synset(u'dame'),
 Synset(u'boo'),
 Synset(u'shuttlecock')]
In [64]:
bird = birds[0]

'Definition', bird.gloss
Out[64]:
('Definition',
 u'warm-blooded egg-laying vertebrates characterized by feathers and forelimbs modified as wings')
In [65]:
'- Synonyms', bird.synonyms
Out[65]:
('- Synonyms', [u'bird'])
In [66]:
'^ Hypernyms', bird.hypernyms()
Out[66]:
('^ Hypernyms', [Synset(u'vertebrate')])
In [67]:
'v Hyponyms', bird.hyponyms()
Out[67]:
('v Hyponyms',
 [Synset(u'dickeybird'),
  Synset(u'cock'),
  Synset(u'hen'),
  Synset(u'nester'),
  Synset(u'night bird'),
  Synset(u'bird of passage'),
  Synset(u'protoavis'),
  Synset(u'archaeopteryx'),
  Synset(u'Sinornis'),
  Synset(u'Ibero-mesornis'),
  Synset(u'archaeornis'),
  Synset(u'ratite'),
  Synset(u'carinate'),
  Synset(u'passerine'),
  Synset(u'nonpasserine bird'),
  Synset(u'bird of prey'),
  Synset(u'gallinaceous bird'),
  Synset(u'parrot'),
  Synset(u'cuculiform bird'),
  Synset(u'coraciiform bird'),
  Synset(u'apodiform bird'),
  Synset(u'caprimulgiform bird'),
  Synset(u'piciform bird'),
  Synset(u'trogon'),
  Synset(u'aquatic bird'),
  Synset(u'twitterer')])
In [68]:
'^ Holonyms', bird.holonyms()
Out[68]:
('^ Holonyms', [Synset(u'Aves'), Synset(u'flock')])
In [69]:
'v Meronyms', bird.meronyms()
Out[69]:
('v Meronyms',
 [Synset(u'beak'),
  Synset(u'furcula'),
  Synset(u'feather'),
  Synset(u'wing'),
  Synset(u'pennon'),
  Synset(u"bird's foot"),
  Synset(u'uropygium'),
  Synset(u'hindquarters'),
  Synset(u'air sac'),
  Synset(u'uropygial gland'),
  Synset(u'syrinx'),
  Synset(u'bird')])
In [70]:
wordnet.synsets('owl')[0].holonyms(), wordnet.synsets('amoeba')[0].holonyms()
Out[70]:
([Synset(u'Strigiformes')], [Synset(u'Amoebida')])
In [71]:
kitty = wordnet.synsets('kitten')[0]
pup = wordnet.synsets('puppy')[0]

# Lowest common ancestor of the two synsets in the hypernym hierarchy.
wordnet.ancestor(kitty, pup)
Out[71]:
Synset(u'young mammal')
In [72]:
human = wordnet.synsets('human')[0]
cuy = wordnet.synsets('guinea pig')[0]

# NOTE(review): presumably a taxonomy-distance score where higher means
# semantically closer — confirm the metric in the pattern wordnet docs.
wordnet.similarity(human, cuy), \
wordnet.similarity(human, kitty)
Out[72]:
(0.26171115255795596, 0.5710216026393958)

pattern.search

In [73]:
s = 'the fluffy brown bunnies hopped across the wet grass with much gusto.'
In [74]:
from pattern.search import search

search('NP', parsetree(s))
Out[74]:
[Match(words=[Word(u'the/DT'), Word(u'fluffy/JJ'), Word(u'brown/JJ'), Word(u'bunnies/NNS')]),
 Match(words=[Word(u'the/DT'), Word(u'wet/JJ'), Word(u'grass/NN')]),
 Match(words=[Word(u'much/JJ'), Word(u'gusto/NN')])]
In [75]:
from pattern.search import taxonomy

# Register a custom 'animal' category; search() can then match the
# taxonomy name (written in uppercase) instead of a PoS/chunk tag.
for creature in ('bunny', 'dog', 'cat', 'banana'):
    taxonomy.append(creature, type='animal')

search('ANIMAL', parsetree(s, lemmata=True))
Out[75]:
[Match(words=[Word(u'the/DT'), Word(u'fluffy/JJ'), Word(u'brown/JJ'), Word(u'bunnies/NNS')])]
In [76]:
from pattern.search import Pattern

# Group 0 is the whole match; groups 1-3 are the three {...} captures.
pat = Pattern.fromstring('{JJ} {ANIMAL} {VP}')
match = pat.match(parsetree(s, lemmata=True))
for group_idx in range(4):
    print(match.group(group_idx))
[Word(u'fluffy/JJ'), Word(u'brown/JJ'), Word(u'bunnies/NNS'), Word(u'hopped/VBD')]
[Word(u'fluffy/JJ')]
[Word(u'brown/JJ'), Word(u'bunnies/NNS')]
[Word(u'hopped/VBD')]

fin

more?