import os
# API credentials are read from the environment so they never appear in
# version control.  Twitter expects a nested license tuple
# (consumer key, consumer secret, (access key, access secret)) —
# presumably matching pattern.web's Twitter license format; verify.
# Facebook and Google each take a single key string.
# NOTE(review): os.environ[...] raises KeyError immediately if a
# variable is unset — fail-fast by design, it seems.
creds = {
'twitter': (os.environ['TW_API_KEY'], os.environ['TW_API_SEC'], (os.environ['TW_ACC_KEY'], os.environ['TW_ACC_SEC'])),
'facebook': os.environ['FB_API_KEY'],
'google': os.environ['GG_API_KEY']
}
import pattern
Kinda. But it's not perfect.
Its individual modules are obviously less flexible than dedicated single-purpose libraries,
but it's a nice collection of useful tools if you're dealing with anything related to linguistic processing.
from pattern.web import Twitter, Facebook, DuckDuckGo, Google, Bing, Wikipedia, Wikia, Newsfeed
from pattern.web import SEARCH, NEWS, IMAGE, SPARQL, COMMENTS, LIKES
# Authenticated services: these use the per-user API keys loaded into
# `creds` above.
tw = Twitter(license=creds['twitter'])
fb = Facebook(license=creds['facebook'])
ggl = Google(license=creds['google'])
# license=None falls back to pattern's built-in default credentials,
# which are shared across ALL pattern users (so rate limits are communal).
ddg = DuckDuckGo(license=None)
bing = Bing(license=None)
wp = Wikipedia(license=None)
wa = Wikia(license=None)
nf = Newsfeed(license=None) # RSS / Atom feed reader
tw.trends()
[u'#HaftasonuRandevusuzOyVermekistiyorumYSK', u'#muratgogebakan', u'#ThankYouBestfriend', u'#amnesiaEP', u'#BamOfficialChartThisSunday', u'One Direction #BestFandom2014 Directioners', u'Justin Bieber #BestFandom2014 Beliebers', u'RIP Sir Bobby Robson', u'Anabel', u'Feliz Jueves']
twitter_result = tw.search('ananas', count=100)
twitter_result[0]
Result({u'profile': u'http://pbs.twimg.com/profile_images/486028208689905664/r-vzT-qS_normal.jpeg', u'language': u'ru', u'author': u'shikari_ananas', u'url': u'https://twitter.com/shikari_ananas/status/494802120152207360', u'text': u'@nnnoda \u0432\u0441\u0435 \u0444\u0430\u043d\u044e\u0447\u043a\u0438 \u043d\u043e\u0434\u044b \u0442\u0430\u043a\u0438\u0435', u'date': u'Thu Jul 31 11:09:53 +0000 2014', u'id': u'494802120152207360'})
from collections import Counter
languages = [tweet.language for tweet in twitter_result]
Counter(languages)
Counter({u'ru': 60, u'fr': 12, u'en': 7, u'pt': 6, u'tr': 4, u'de': 3, u'it': 3, u'nl': 1, u'vi': 1, u'da': 1, u'tl': 1, u'in': 1})
from pattern.web.locale import geocode
# geocode(...) returns a tuple whose first two items are used here as a
# (latitude, longitude) pair to geo-restrict the Twitter search —
# presumably (lat, lon); confirm against pattern.web.locale docs.
in_result = tw.search('#EngvInd', count=50, geo=geocode('New Delhi')[:2])
en_result = tw.search('#EngvInd', count=50, geo=geocode('London')[:2])
from pattern.en import sentiment
# Mean tweet polarity for the New Delhi results.  sentiment() returns a
# (polarity, subjectivity) pair, so [0] selects polarity.
# sum() replaces the hand-rolled reduce(lambda x, y: x+y, ...) — same
# result, clearer, and works on Python 3 where reduce is not a builtin.
# NOTE(review): raises ZeroDivisionError if the search returned nothing.
sum(sentiment(t.text)[0] for t in in_result) / len(in_result)
0.14291278166278154
# Mean tweet polarity for the London results (see the New Delhi line
# above).  sum() replaces reduce(lambda x, y: x+y, ...) — identical
# result, idiomatic, and Python-3-compatible.
sum(sentiment(t.text)[0] for t in en_result) / len(en_result)
0.12380667249417247
fb_result = fb.search('dragon age inquisition', type=SEARCH, count=100)
fb_result[0]
Result(id=u'449022588487492_753024791420602')
# Walk the Facebook results until we hit a post that has both likes and
# comments, print who liked it and what was said, then stop (break).
# Indentation restored — the pasted transcript had the loop bodies at
# column 0, which is a SyntaxError.
for post in fb_result:
    if post.likes > 0 and post.comments > 0:
        # Each LIKES result's .author appears to be an (id, name) pair;
        # [1] picks the display name — confirm against pattern.web docs.
        print('\n\n'.join([l.author[1] for l in fb.search(post.id, type=LIKES)]))
        print('-' * 10)
        print('\n\n'.join([c.text for c in fb.search(post.id, type=COMMENTS)]))
        break
Ÿøüčēf Fōüīnÿ Sawako Heiwajima Rex Larbi Poochie Issâm Phoneix مول الاير ماكس ---------- ET GTA V sur PC ?? c'est pour quand ? :(
google_result = ggl.search('dragon age inquisition', count=10)
google_result[0]
Result({u'url': u'http://www.dragonage.com/', u'text': u'Beautiful vistas and incredible new possibilities await you in the latest game in <br>\nthe epic role-playing series from BioWare \u2013 <b>Dragon Age</b>: <b>Inquisition</b>.', u'date': u'', u'language': u'', u'title': u'Dragon Age: Inquisition'})
lang_id = [ggl.identify(res.text) for res in fb_result[:10]]
lang_id
[(u'en', 0.9998242), (u'en', 0.120250024), (u'en', 0.6363636), (u'en', 0.033820875), (u'en', 0.2889652), (u'en', 0.6282051)]
ggl.translate(fb_result[0].text, input=lang_id[0][0], output='de')
u'Dragon Age: Inquisition Kampf Anh\xe4nger burninates dem Land einen neuen Trailer zu Dragon Age: Inquisition nicht bieten neue Informationen \xfcber das Spiel der Kampf, aber es ist ermutigend zu BioWare Fokus auf der Ebene der Strategie, die von der Echtzeit-mit-Pause zu sehen System. - http://www.rheena.com/reviews-games-chat-hardware-sony/212236-dragon-age-inquisition-combat-trailer-burninates-countryside.html'
from pattern.web import find_urls, find_email
# Demo string: a URL inside parentheses, an email address, and a second
# copy of the URL followed by a sentence-ending period (the "decoy") to
# show the extractor trims trailing punctuation.
s = '''
Find out more at the PUGS website (http://pugs.org.sg), or email us at idontknow@whatouremail.is!
This is a decoy URL http://pugs.org.sg.
'''
# unique=True de-duplicates, so the twice-mentioned URL is listed once.
find_urls(s, unique=True), \
find_email(s, unique=True)
([u'http://pugs.org.sg'], [u'idontknow@whatouremail.is'])
from pattern.web import sort, GOOGLE
# Nationalities (plus 'politician' as an outlier) to rank by how
# strongly each co-occurs with the word 'dangerous' on Google.
terms = [
'french',
'german',
'japanese',
'chinese',
'persian',
'hun',
'american',
'russian',
'swede',
'polish',
'singaporean',
'politician',
]
# prefix=True queries the exact phrase "dangerous <term>" (context word
# in front).  NOTE(review): weights are presumably normalised result
# counts — confirm against pattern.web.sort documentation.
sort_result = sort(terms=terms, context='dangerous', prefix=True, service=GOOGLE, license=creds['google'])
# Print each term with its co-occurrence weight as a percentage.
# Converted from the Python-2-only `print` statement to the print()
# function used everywhere else in this file (output is unchanged),
# and restored the loop-body indentation lost in the transcript.
for weight, term in sort_result:
    print('%.2f%% %s' % (weight * 100, term))
28.50% "dangerous american" 21.18% "dangerous russian" 11.85% "dangerous japanese" 11.22% "dangerous chinese" 10.10% "dangerous politician" 8.99% "dangerous german" 6.00% "dangerous french" 1.12% "dangerous polish" 0.53% "dangerous hun" 0.33% "dangerous persian" 0.17% "dangerous swede" 0.01% "dangerous singaporean"
from pattern.en import article, referenced
article('harbour'), \
referenced('umbrella')
('a', 'an umbrella')
from pattern.en import (pluralize as pluralise, singularize as singularise)
pluralise('octopus')
'octopodes'
pluralise('octopus', classical=False)
'octopuses'
pluralise('I'), \
pluralise('my'), \
pluralise('her')
('we', 'our', 'their')
singularise('bacteria')
'bacterium'
pluralise('virus')
'viruss'
singularise('viruses'), \
singularise('virii'), \
singularise('virus')
('viruse', 'virius', 'viru')
singularise('viri')
'virus'
from pattern.en import comparative, superlative
'python is %s than ruby!' % comparative('good')
'python is better than ruby!'
'iPython is %s python shell' % (
referenced(superlative('ideal'), article=pattern.en.DEFINITE))
'iPython is the most ideal python shell'
from pattern.en import conjugate, lemma, lexeme, tenses
lemma('are')
u'be'
tenses('be'), \
tenses('were')
([('infinitive', None, None, None, None), ('present', None, 'plural', 'indicative', 'imperfective'), ('present', 1, 'plural', 'indicative', 'imperfective'), ('present', 1, 'singular', 'indicative', 'imperfective'), ('present', 2, 'plural', 'indicative', 'imperfective'), ('present', 2, 'singular', 'indicative', 'imperfective'), ('present', 3, 'plural', 'indicative', 'imperfective'), ('present', 3, 'singular', 'indicative', 'imperfective')], [('past', None, None, 'indicative', 'imperfective'), ('past', None, 'plural', 'indicative', 'imperfective'), ('past', 1, 'plural', 'indicative', 'imperfective'), ('past', 1, 'singular', 'indicative', 'imperfective'), ('past', 2, 'plural', 'indicative', 'imperfective'), ('past', 2, 'singular', 'indicative', 'imperfective'), ('past', 3, 'plural', 'indicative', 'imperfective'), ('past', 3, 'singular', 'indicative', 'imperfective')])
lexeme('be')
[u'be', u'am', u'are', u'is', u'being', u'was', u'were', u'been', u'am not', u"aren't", u"isn't", u"wasn't", u"weren't"]
conjugate('nibble', '1sgp'), \
conjugate('nibble', '3sg')
(u'nibbled', u'nibbles')
conjugate('google', tense=pattern.en.PARTICIPLE, parse=False), \
conjugate('google', tense=pattern.en.PARTICIPLE, parse=True)
(None, 'googling')
from pattern.en import number
number('five thousand six hundred and eighty nine')
5689
from pattern.en import numerals
numerals('42.128', round=2)
'forty-two point thirteen'
animals = ['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken']
orangebirds = {'carrot': 100, 'parrot': 5, 'orange': 20}
from pattern.en import quantify
quantify(animals), \
quantify(orangebirds)
('several chickens, a pair of geese and a duck', 'dozens of carrots, a score of oranges and several parrots')
from pattern.en import suggest
suggest('psuh')
[('push', 1.0)]
suggest('carot')
[('cart', 0.9032258064516129), ('cabot', 0.04838709677419355), ('carrot', 0.03225806451612903), ('caret', 0.016129032258064516)]
from pattern.en import ngrams
ngrams('This is a sentence', n=2)
[('This', 'is'), ('is', 'a'), ('a', 'sentence')]
ngrams('This is a sentence', n=3)
[('This', 'is', 'a'), ('is', 'a', 'sentence')]
short_s = 'When I saw the prices for some of those apartments, I was startled.'
long_s = 'Personally, I think the only unassailable definition is the one often attributed to the great editor John W Campbell: "Science fiction is what I say it is."'
from pattern.en import tag
# Print every noun in the sentence: Penn Treebank noun tags all begin
# with 'NN' (NN, NNS, NNP, NNPS).  Indentation restored — the pasted
# transcript had the loop bodies at column 0.
for word, pos in tag(short_s):
    if pos[:2] == 'NN':
        print(word)
prices apartments
from pattern.en import parse
parsed = parse(short_s, tokenize=True, tags=True, chunks=True, relations=True, lemmata=True)
parsed
u'When/WRB/O/O/O/when I/PRP/B-NP/O/NP-SBJ-1/i saw/VBD/B-VP/O/VP-1/see the/DT/B-NP/O/NP-OBJ-1/the prices/NNS/I-NP/O/NP-OBJ-1/price for/IN/B-PP/O/O/for some/DT/O/O/O/some of/IN/B-PP/B-PNP/O/of those/DT/B-NP/I-PNP/O/those apartments/NNS/I-NP/I-PNP/O/apartment ,/,/O/O/O/, I/PRP/B-NP/O/NP-SBJ-2/i was/VBD/B-VP/O/VP-2/be startled/VBN/I-VP/O/VP-2/startle ././O/O/O/.'
from pattern.en import pprint
pprint(parsed)
WORD TAG CHUNK ROLE ID PNP LEMMA When WRB - - - - when I PRP NP SBJ 1 - i saw VBD VP - 1 - see the DT NP OBJ 1 - the prices NNS NP ^ OBJ 1 - price for IN PP - - - for some DT - - - - some of IN PP - - PNP of those DT NP - - PNP those apartments NNS NP ^ - - PNP apartment , , - - - - , I PRP NP SBJ 2 - i was VBD VP - 2 - be startled VBN VP ^ - 2 - startle . . - - - - .
from pattern.en import parsetree
parsetree(short_s, tokenize=True, tags=True, chunks=True, relations=True, lemmata=True)
[Sentence('When/WRB/O/O/O/when I/PRP/B-NP/O/NP-SBJ-1/i saw/VBD/B-VP/O/VP-1/see the/DT/B-NP/O/NP-OBJ-1/the prices/NNS/I-NP/O/NP-OBJ-1/price for/IN/B-PP/O/O/for some/DT/O/O/O/some of/IN/B-PP/B-PNP/O/of those/DT/B-NP/I-PNP/O/those apartments/NNS/I-NP/I-PNP/O/apartment ,/,/O/O/O/, I/PRP/B-NP/O/NP-SBJ-2/i was/VBD/B-VP/O/VP-2/be startled/VBN/I-VP/O/VP-2/startle ././O/O/O/.')]
tr = parsetree(long_s, tokenize=True, tags=True, chunks=True, relations=True, lemmata=True)
print(type(tr))
<class 'pattern.text.tree.Text'>
[type(item) for item in tr]
[pattern.text.tree.Sentence, pattern.text.tree.Sentence]
type(tr[0][0])
pattern.text.tree.Word
# Dump every chunk of every parsed sentence together with its
# (word, POS-tag) pairs.  Indentation restored — the pasted transcript
# had the loop bodies at column 0.
for sentence in tr:
    for chunk in sentence.chunks:
        print(chunk.type, [(w.string, w.type) for w in chunk.words])
(u'ADVP', [(u'Personally', u'RB')]) (u'NP', [(u'I', u'PRP')]) (u'VP', [(u'think', u'VBP')]) (u'NP', [(u'the', u'DT'), (u'only', u'JJ'), (u'unassailable', u'JJ'), (u'definition', u'NN')]) (u'VP', [(u'is', u'VBZ')]) (u'VP', [(u'often', u'RB'), (u'attributed', u'VBN')]) (u'NP', [(u'the', u'DT'), (u'great', u'JJ'), (u'editor', u'NN'), (u'John', u'NNP'), (u'W', u'NNP'), (u'Campbell', u'NNP')]) (u'NP', [(u'Science', u'NN'), (u'fiction', u'NN')]) (u'VP', [(u'is', u'VBZ')]) (u'NP', [(u'I', u'PRP')]) (u'VP', [(u'say', u'VBP')]) (u'NP', [(u'it', u'PRP')]) (u'VP', [(u'is', u'VBZ')])
from pattern.en import sentiment
# Compare tweet sentiment about the Causeway from both sides of it:
# Singapore vs Kuala Lumpur.  geocode(...)[:2] supplies the coordinate
# pair for the geo-restricted search.
sg_result = tw.search('causeway', count=50, geo=geocode('Singapore')[:2])
my_result = tw.search('causeway', count=50, geo=geocode('Kuala Lumpur')[:2])
# sentiment(...)[0] is the polarity score; sorting makes the plotted
# series read as monotone curves rather than noise.
sg_sentiment = sorted(sentiment(tweet.text)[0] for tweet in sg_result)
my_sentiment = sorted(sentiment(tweet.text)[0] for tweet in my_result)
sg_avg = sum(sg_sentiment) / len(sg_sentiment)
my_avg = sum(my_sentiment) / len(my_sentiment)
sg_avg, my_avg
(0.079808080808080814, -0.24100000000000008)
# Plot each sorted sentiment series with a dashed horizontal line at
# its mean.
# NOTE(review): linspace/plot/grid/legend are nowhere imported in this
# file — this assumes an IPython `%pylab`-style session with matplotlib
# and numpy star-imported, and Python 2 (xrange).  Confirm the intended
# execution environment before reusing outside the notebook.
x = linspace(0, 50)
plot(sg_sentiment, 'r-', label='sg')
plot(x, [sg_avg for i in xrange(0, 50)], 'r--', label='sg_avg')
plot(my_sentiment, 'g-', label='my')
plot(x, [my_avg for i in xrange(0, 50)], 'g--', label='my_avg')
grid(b=True, which='both')
legend(loc='best')
<matplotlib.legend.Legend at 0x7f0cedfa2650>
from pattern.en import wordnet
birds = wordnet.synsets('bird')
birds
[Synset(u'bird'), Synset(u'bird'), Synset(u'dame'), Synset(u'boo'), Synset(u'shuttlecock')]
bird = birds[0]
'Definition', bird.gloss
('Definition', u'warm-blooded egg-laying vertebrates characterized by feathers and forelimbs modified as wings')
'- Synonyms', bird.synonyms
('- Synonyms', [u'bird'])
'^ Hypernyms', bird.hypernyms()
('^ Hypernyms', [Synset(u'vertebrate')])
'v Hyponyms', bird.hyponyms()
('v Hyponyms', [Synset(u'dickeybird'), Synset(u'cock'), Synset(u'hen'), Synset(u'nester'), Synset(u'night bird'), Synset(u'bird of passage'), Synset(u'protoavis'), Synset(u'archaeopteryx'), Synset(u'Sinornis'), Synset(u'Ibero-mesornis'), Synset(u'archaeornis'), Synset(u'ratite'), Synset(u'carinate'), Synset(u'passerine'), Synset(u'nonpasserine bird'), Synset(u'bird of prey'), Synset(u'gallinaceous bird'), Synset(u'parrot'), Synset(u'cuculiform bird'), Synset(u'coraciiform bird'), Synset(u'apodiform bird'), Synset(u'caprimulgiform bird'), Synset(u'piciform bird'), Synset(u'trogon'), Synset(u'aquatic bird'), Synset(u'twitterer')])
'^ Holonyms', bird.holonyms()
('^ Holonyms', [Synset(u'Aves'), Synset(u'flock')])
'v Meronyms', bird.meronyms()
('v Meronyms', [Synset(u'beak'), Synset(u'furcula'), Synset(u'feather'), Synset(u'wing'), Synset(u'pennon'), Synset(u"bird's foot"), Synset(u'uropygium'), Synset(u'hindquarters'), Synset(u'air sac'), Synset(u'uropygial gland'), Synset(u'syrinx'), Synset(u'bird')])
wordnet.synsets('owl')[0].holonyms(), wordnet.synsets('amoeba')[0].holonyms()
([Synset(u'Strigiformes')], [Synset(u'Amoebida')])
kitty = wordnet.synsets('kitten')[0]
pup = wordnet.synsets('puppy')[0]
wordnet.ancestor(kitty, pup)
Synset(u'young mammal')
human = wordnet.synsets('human')[0]
cuy = wordnet.synsets('guinea pig')[0]
wordnet.similarity(human, cuy), \
wordnet.similarity(human, kitty)
(0.26171115255795596, 0.5710216026393958)
s = 'the fluffy brown bunnies hopped across the wet grass with much gusto.'
from pattern.search import search
search('NP', parsetree(s))
[Match(words=[Word(u'the/DT'), Word(u'fluffy/JJ'), Word(u'brown/JJ'), Word(u'bunnies/NNS')]), Match(words=[Word(u'the/DT'), Word(u'wet/JJ'), Word(u'grass/NN')]), Match(words=[Word(u'much/JJ'), Word(u'gusto/NN')])]
from pattern.search import taxonomy
# Register words under a shared 'animal' taxonomy term so that
# search('ANIMAL', ...) matches any of them.  'banana' shows the
# taxonomy is arbitrary, not semantic.  Indentation restored — the
# pasted transcript had the loop body at column 0.
for animal in ('bunny', 'dog', 'cat', 'banana'):
    taxonomy.append(animal, type='animal')
search('ANIMAL', parsetree(s, lemmata=True))
[Match(words=[Word(u'the/DT'), Word(u'fluffy/JJ'), Word(u'brown/JJ'), Word(u'bunnies/NNS')])]
from pattern.search import Pattern
# The three {...} constructs in the pattern string become capture
# groups 1-3: an adjective run (JJ), a taxonomy ANIMAL term, and a
# verb phrase (VP).
pat = Pattern.fromstring('{JJ} {ANIMAL} {VP}')
match = pat.match(parsetree(s, lemmata=True))
# group(0) is the full match; groups 1-3 correspond to the three {...}
# constraints in the pattern above.  Indentation restored (loop body
# was at column 0 in the transcript); range(0, 4) simplified to the
# equivalent range(4).
for i in range(4):
    print(match.group(i))
[Word(u'fluffy/JJ'), Word(u'brown/JJ'), Word(u'bunnies/NNS'), Word(u'hopped/VBD')] [Word(u'fluffy/JJ')] [Word(u'brown/JJ'), Word(u'bunnies/NNS')] [Word(u'hopped/VBD')]