# imports
from bs4 import BeautifulSoup, NavigableString, Tag
import nltk
import pandas as pd
import re
from itertools import islice
from collections import Counter
import os

# Our tools
from get_wiki_links import WikiUrlFetch, WikiUrlFetchNonDBPedia
from get_wiki_text import Wiki2Plain

PATH = 'Chapter1.xml'
soup = BeautifulSoup(open(PATH, 'rt').read())
book_tags = set([tag.name for tag in soup.findAll(True)])
print(book_tags)

# The following script looks for terms within selected tags and creates a pandas DataFrame object.
# The DataFrame consists of "count" (term counter), "tag" (the XML tag in which the term was found),
# and "term" (the term found within the tag).
list_for_df = []
tags = ['keyword', 'author', 'indexterm', 'orgname', 'personname', 'phrase']
for tag in tags:
    for i in soup.findAll(tag):
        if i.string is None:
            list_for_df.append({'term': " ".join([child.lower().encode('utf-8') for child in i.stripped_strings]),
                                'tag': tag, 'count': 1})
        else:
            list_for_df.append({'term': re.sub(' +', '', i.string.lower().encode('utf-8')),
                                'tag': tag, 'count': 1})
book_df = pd.DataFrame(list_for_df)

# First 10 entries of the DataFrame.
book_df[0:10]

# We used NLTK (Natural Language Toolkit) to identify terms within the text that are not marked up with XML.
# First we create one large text by stripping the XML markup.
all_xml = " ".join([line for line in soup.stripped_strings])
text = nltk.clean_html(all_xml)

# We then clean the text through string substitutions.
text = re.sub(r"[\n\.,\(\)\?\!\-':]", "", text)

# Finally, we use nltk.pos_tag to attach a part-of-speech tag to each word in the text.
tagged_text = nltk.pos_tag(text.split(' '))

pos = ['NN', 'NNS', 'NNP', 'NNPS']  ## nltk noun tags
terms = []
chain = []
for term in tagged_text:  ## loop through the tagged text (in its original order)
    if term[1] in pos:  ## if a term has a noun tag, append it to the chain and to the noun list
        chain.append(term)
        terms.append(term[0].lower().encode('UTF-8'))
        terms.append(" ".join([item[0].lower().encode('UTF-8') for item in chain]))
    else:  ## at the end of a sequence of nouns, append the chain to the noun list and reset the chain
        if len(chain) > 0:
            terms.append(" ".join([item[0].lower().encode('UTF-8') for item in chain]))
        chain = []

terms = [re.sub(' +', ' ', term) for term in terms if term != '']  # clean extra spaces from noun chains
terms = [re.sub('^ ', '', term) for term in terms]

## sample of the terms scraped
print terms[100:130]

terms_for_df = []
term_count = Counter(terms)

# count_threshold should be a number that returns a good amount of information within a reasonable amount of time.
# This depends on what the reader is looking for and how much computational power they have.
# In theory it could be 0, if a reader wants to see all terms and can process that amount of data on their computer.
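# Optional sanity check (not from the chapter itself): preview how many distinct
# terms would survive a few candidate thresholds before settling on one. The
# candidate values below are arbitrary illustrations.
for candidate in (1, 2, 5, 10):
    surviving = sum(1 for c in term_count.itervalues() if c >= candidate)
    print candidate, surviving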
count_threshold = 2
for term, count in term_count.iteritems():
    if count >= count_threshold:
        terms_for_df.append({'term': term, 'count': count})
        # print term, count

# check state of original book_df (366 terms)
book_df

## create a list of dicts for appending to the DataFrame
list_for_df = []
for item in terms_for_df:
    if item['term'] != "" and item['term'] != " ":
        list_for_df.append({'term': item['term'], 'count': item['count'], 'tag': 'raw'})
book_df = book_df.append(pd.DataFrame(list_for_df))

## check the raw terms that were appended
df_rawterms = book_df[book_df['tag'] == 'raw']
df_rawterms

# check state of new book_df (3229 terms)
book_df

## drop duplicated terms
book_df.drop_duplicates()

from get_wiki_links import WikiUrlFetch, WikiUrlFetchNonDBPedia

## example query with WikiUrlFetch
w = WikiUrlFetch()
print w.fetch_wiki("Nunberg Geoff")

from itertools import islice
import time

list_for_wiki_df = []
w = WikiUrlFetch()
for row in islice(book_df.iterrows(), 1000):  # iterations limited to 1000 because of WikiUrlFetch API limitations
    wikis = w.fetch_wiki(row[1]['term'])
    for wiki in wikis:
        print wiki
        if wiki['match'] == 'exact' or wiki['match'] == 'good-partial' or wiki['match'] == 'partial':
            list_for_wiki_df.append({'term': row[1]['term'], 'matched_term': wiki['term'],
                                     'count': row[1]['count'], 'match': wiki['match'], 'url': wiki['wiki_url']})
print len(list_for_wiki_df)

new_cols_df = pd.DataFrame(list_for_wiki_df)

## extract the exact matches
exact_match = new_cols_df[new_cols_df['match'] == 'exact']
exact_match[:10]

## extract the good-partial matches
good_match = new_cols_df[new_cols_df['match'] == 'good-partial']

## extract the partial matches
partial_match = new_cols_df[new_cols_df['match'] == 'partial']

## create a DataFrame with only good-partial and exact match terms
matched_terms = good_match.append(exact_match)
matched_terms

## first 10 matched_terms
matched_terms[:10]

## function using Wiki2Plain results to create two new DataFrame columns using apply
def get_wiki_textimage(url):
    print url
    try:
        wiki = Wiki2Plain(url)
    except:
        return "none", "none"
    return wiki.text, wiki.image()

## apply and use the tuple result to create two DataFrame columns
## from http://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/#create
matched_terms['wiki_text'], matched_terms['wiki_image'] = zip(*matched_terms['url'].apply(get_wiki_textimage))

## first ten terms
matched_terms[:10]

# http://ipython.org/ipython-doc/dev/api/generated/IPython.core.display.html#IPython.core.display.Image
# width is IPython 14+; this is IPython version 0.13.1
from IPython.core.display import Image
print matched_terms.ix[243]['wiki_text']
Image(url=matched_terms.ix[243]['wiki_image'])

## load imports and API keys
import sys
from flickr_api.auth import AuthHandler
from flickr_api import FlickrError
import flickr_api
from tidings import Guardian, NewYorkTimes

def load_api_keys():
    if os.path.exists('settings.py'):
        from settings import FLICKR_KEY, FLICKR_SECRET, guardian_key, nyt_key
        secrets = {'api_key': FLICKR_KEY, 'api_secret': FLICKR_SECRET}
        news_keys = {'guardian_key': guardian_key, 'nyt_key': nyt_key}
        return secrets, news_keys
    else:
        print("settings.py not found; add it to the directory or run without flickr and news")
        sys.exit()

secrets, news_keys = load_api_keys()

def get_flickr_url(term):
    flickr_api.set_keys(**secrets)
    photos = flickr_api.Photo.search(tags=term, sort='date-posted-desc')
    if len(photos) == 0 or type(photos) == dict:
        return []
    flickr_urls = []
    for photo in photos[:3]:
        try:
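            # Each photo's getInfo() metadata (farm, server, id, secret) is
            # interpolated into a static Flickr image URL; the "_m" suffix
            # selects Flickr's small image size.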
flickr_urls.append("http://farm{farm}.staticflickr.com/{server}/{id}_{secret}_m.jpg".format(**photo.getInfo())) except FlickrError: pass return flickr_urls matched_terms['flickr_urls'] = matched_terms['term'].apply(get_flickr_url) ## which flickr images did we get for the term 'museums'? print matched_terms.ix[565]['term'] flickr_url = matched_terms.ix[565]['flickr_urls'][1] Image(url=flickr_url) ## blocked from API? def get_news_url(term): g = Guardian(news_keys['guardian_key']) print term g_links = g.query(re.sub("\W","",term), from_date='2013-01-01', to_date='2013-4-30') return g_links[:3] matched_terms['news_urls'], = matched_terms['term'].apply(get_news_url) matched_terms[:10] ## write DataFrame import os path = os.getcwd() + '/wiki_urls_saved.csv' encoding = 'UTF-8' sep = ',' matched_terms.to_csv(path,sep,encoding) ## read DataFrame new_df = pd.read_csv(path) new_df = new_df.drop(['Unnamed: 0'],axis=1) #!/usr/bin/env python from urllib2 import urlopen import urllib2 import re import json from nltk import metrics from bs4 import BeautifulSoup, Tag class WikiUrlFetch(): def __init__(self): self.cleaned_term = None def fetch_wiki(self,term): self.cleaned_term = self.clean_term(term) self.results = self.get_wiki_url(self.cleaned_term) return self.results def clean_term(self,term): return re.sub(r"[^A-Za-z0-9 ]","",term.lower()) def check_dbpedia(self, term): api = 'http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?MaxHits=5&QueryString=' #api = 'http://lookup.dbpedia.org/api/search.asmx/PrefixSearch?MaxHits=10&QueryString=' #print term try: response = urlopen(api+term) except: return "" soup = BeautifulSoup(response.read()) results = [] for result in soup.findAll('result'): for child in result.children: if isinstance(child,Tag): if child.name == 'label': current_label = child.string.lower() if child.name == 'uri': results.append({ 'term': current_label.encode('utf-8'), 'url': child.string.encode('utf-8') }) return self.rank_dbpedia_results(results,term) def normalize(self,string): strings = string.split(" ") strings.sort() return " ".join(strings) def rank_dbpedia_results(self,results,term): """ logic: if edit distance 0, exact match if edit distance 1-4, good-partial match if edit distance > 4, partial match (results unsorted) """ matches = [] for result in results: matches.append([metrics.edit_distance(self.normalize(result['term']), self.normalize(term)), result]) matches.sort() #print matches if len(matches) == 0: return [ { 'match': 'none', 'term': term.encode('utf-8') } ] elif matches[0][0] == 0: new_results = [ matches[0][1] ] new_results[0]['match'] = 'exact' return new_results elif matches[0][0] <= 8: new_results = [] for match in matches: if match[0] <= 3: result = match[1] result['match'] = 'good-partial' new_results.append(result) """ # needs refinement to provide good matches elif match[0] <= 5: words = self.normalize(result['term']).split(' ') if self.normalize(term) in words: result = match[1] result['match'] = 'good-partial' new_results.append(result) """ return new_results else: new_results = [] for result in results[0:2]: result['match'] = 'partial' new_results.append(result) return new_results def wiki_url(self,url): term = url[url.rfind('/'):] entity_page = 'http://dbpedia.org/data/{}.json'.format(term) wiki_type = 'http://xmlns.com/foaf/0.1/primaryTopic' try: response = urlopen(entity_page) except: return data = json.loads(response.read()) for key,value in data.items(): 'http://xmlns.com/foaf/0.1/primaryTopic' if 'http://xmlns.com/foaf/0.1/primaryTopic' in 
                # print key
                return key.encode('utf-8')

    def get_wiki_url(self, term):
        results = self.check_dbpedia(term)
        for result in results:
            if result['match'] != 'none' and result['match'] != 'partial':
                wiki = self.wiki_url(result['url'])
                result['wiki_url'] = wiki
        return results


class WikiUrlFetchNonDBPedia():

    def __init__(self):
        self.wiki_api = 'http://en.wikipedia.org/w/api.php?action=query&list=search&format=json&srsearch='
        self.cleaned_term = None

    def fetch_wiki(self, term):
        self.cleaned_term = self.clean_term(term)
        self.results = self.get_wiki_url(self.cleaned_term)
        return self.results

    def check_wikipedia_api(self, term):
        url = self.wiki_api + re.sub(' ', '_', term)
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0')
        try:
            response = urlopen(request)
        except urllib2.HTTPError, e:
            print e.code
            return
        except urllib2.URLError, e:
            print e.reason
            return
        results = []
        data = json.loads(response.read())
        for key, value in data.items():
            if 'search' in value:
                if type(value['search']) == list:
                    results = self.parse_wikipedia_results(value['search'])
        if len(results) > 0:
            return self.rank_dbpedia_results(results, term)
        else:
            return [{'match': 'none', 'term': term.encode('utf-8')}]

    def parse_wikipedia_results(self, results):
        base_wikipedia_url = 'http://en.wikipedia.org/wiki/'
        wiki_urls = []
        for result in results:
            if 'title' in result:
                wiki_urls.append({'url': (base_wikipedia_url + re.sub(' ', '_', result['title'])).encode('UTF-8'),
                                  'term': result['title'].encode('UTF-8')})
        return wiki_urls

    def clean_term(self, term):
        return re.sub(r"[^A-Za-z0-9 ]", "", term.lower())

    def normalize(self, string):
        strings = string.lower().split(" ")
        strings.sort()
        return " ".join(strings)

    def rank_dbpedia_results(self, results, term):
        """
        logic:
        edit distance 0    -> exact match
        best distance <= 5 -> distance <= 3 (or <= 5 when the query appears among the
                              result's words) gives a good-partial match
        otherwise          -> first two (unsorted) results are partial matches
        """
        matches = []
        for result in results:
            matches.append([metrics.edit_distance(self.normalize(result['term']), self.normalize(term)), result])
        matches.sort()
        print term, matches
        print
        if len(matches) == 0:
            return [{'match': 'none', 'term': term.encode('utf-8')}]
        elif matches[0][0] == 0:
            new_results = [matches[0][1]]
            new_results[0]['match'] = 'exact'
            return new_results
        elif matches[0][0] <= 5:
            new_results = []
            for match in matches:
                if match[0] <= 3:
                    result = match[1]
                    result['match'] = 'good-partial'
                    new_results.append(result)
                # needs refinement to provide good matches
                elif match[0] <= 5:
                    words = self.normalize(match[1]['term']).split(' ')
                    if self.normalize(term) in words:
                        result = match[1]
                        result['match'] = 'good-partial'
                        new_results.append(result)
            return new_results
        else:
            new_results = []
            for result in results[0:2]:
                result['match'] = 'partial'
                new_results.append(result)
            return new_results

    def get_wiki_url(self, term):
        results = self.check_wikipedia_api(term)
        for result in results:
            if result['match'] != 'none' and result['match'] != 'partial':
                result['wiki_url'] = result['url']
        # print results
        return results


# source: http://stackoverflow.com/questions/4460921/extract-the-first-paragraph-from-a-wikipedia-article-python/4461624#4461624
#!/usr/bin/env python
import re
import yaml
import urllib
import urllib2


class WikipediaError(Exception):
    pass


class Wikipedia:
    url_article = 'http://%s.wikipedia.org/w/index.php?action=raw&title=%s'
    url_image = 'http://%s.wikipedia.org/w/index.php?title=Special:FilePath&file=%s'
    url_search = 'http://%s.wikipedia.org/w/api.php?action=query&list=search&srsearch=%s&sroffset=%d&srlimit=%d&format=yaml'
    def __init__(self, lang):
        self.lang = lang

    def __fetch(self, url):
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0')
        try:
            result = urllib2.urlopen(request)
        except urllib2.HTTPError, e:
            raise WikipediaError(e.code)
        except urllib2.URLError, e:
            raise WikipediaError(e.reason)
        return result

    def article(self, url):
        article = url[url.rfind('/') + 1:]
        url = self.url_article % (self.lang, urllib.quote_plus(article))
        content = self.__fetch(url).read()
        if content.upper().startswith('#REDIRECT'):
            match = re.match('(?i)#REDIRECT \[\[([^\[\]]+)\]\]', content)
            if match is not None:
                return self.article(match.group(1))
            raise WikipediaError("Can't find the redirect target article.")
        return content

    def image(self, image, thumb=None):
        url = self.url_image % (self.lang, image)
        result = self.__fetch(url)
        content = result.read()
        if thumb:
            url = result.geturl() + '/' + thumb + 'px-' + image
            url = url.replace('/commons/', '/commons/thumb/')
            url = url.replace('/' + self.lang + '/', '/' + self.lang + '/thumb/')
            return self.__fetch(url).read()
        return content

    def search(self, query, page=1, limit=10):
        offset = (page - 1) * limit
        url = self.url_search % (self.lang, urllib.quote_plus(query), offset, limit)
        content = self.__fetch(url).read()
        parsed = yaml.load(content)
        search = parsed['query']['search']
        results = []
        if search:
            for article in search:
                title = article['title'].strip()
                snippet = article['snippet']
                snippet = re.sub(r'(?m)<.*?>', '', snippet)
                snippet = re.sub(r'\s+', ' ', snippet)
                snippet = snippet.replace(' . ', '. ')
                snippet = snippet.replace(' , ', ', ')
                snippet = snippet.strip()
                wordcount = article['wordcount']
                results.append({
                    'title': title,
                    'snippet': snippet,
                    'wordcount': wordcount
                })
        # yaml.dump(results, default_style='', default_flow_style=False,
        #           allow_unicode=True)
        return results


class Wiki2Plain:
    # url_article = 'http://%s.wikipedia.org/w/index.php?action=raw&title=%s'
    url_image = 'http://%s.wikipedia.org/w/index.php?title=Special:FilePath&file=%s'
    # url_search = 'http://%s.wikipedia.org/w/api.php?action=query&list=search&srsearch=%s&sroffset=%d&srlimit=%d&format=yaml'

    def __init__(self, url):
        self.wiki = Wikipedia('en')
        self.wiki_article = self.wiki.article(url)
        self.text = self.wiki_article
        self.text = self.unhtml(self.text)
        self.text = self.unwiki(self.text)
        self.text = self.punctuate(self.text)
        self.text = self.get_summary(self.text)

    def __str__(self):
        return self.text

    def unwiki(self, wiki):
        """
        Remove wiki markup from the text.
        """
        wiki = re.sub(r'(?i)\{\{IPA(\-[^\|\{\}]+)*?\|([^\|\{\}]+)(\|[^\{\}]+)*?\}\}', lambda m: m.group(2), wiki)
        wiki = re.sub(r'(?i)\{\{Lang(\-[^\|\{\}]+)*?\|([^\|\{\}]+)(\|[^\{\}]+)*?\}\}', lambda m: m.group(2), wiki)
        wiki = re.sub(r'\{\{[^\{\}]+\}\}', '', wiki)
        wiki = re.sub(r'(?m)\{\{[^\{\}]+\}\}', '', wiki)
        wiki = re.sub(r'(?m)\{\|[^\{\}]*?\|\}', '', wiki)
        wiki = re.sub(r'(?i)\[\[Category:[^\[\]]*?\]\]', '', wiki)
        wiki = re.sub(r'(?i)\[\[Image:[^\[\]]*?\]\]', '', wiki)
        wiki = re.sub(r'(?i)\[\[File:[^\[\]]*?\]\]', '', wiki)
        wiki = re.sub(r'\[\[[^\[\]]*?\|([^\[\]]*?)\]\]', lambda m: m.group(1), wiki)
        wiki = re.sub(r'\[\[([^\[\]]+?)\]\]', lambda m: m.group(1), wiki)
        wiki = re.sub(r'\[\[([^\[\]]+?)\]\]', '', wiki)
        wiki = re.sub(r'(?i)File:[^\[\]]*?', '', wiki)
        wiki = re.sub(r'\[[^\[\]]*? ([^\[\]]*?)\]', lambda m: m.group(1), wiki)
        wiki = re.sub(r"''+", '', wiki)
        wiki = re.sub(r'(?m)^\*$', '', wiki)
        return wiki

    def unhtml(self, html):
        """
        Remove HTML from the text.
""" html = re.sub(r'(?i) ', ' ', html) html = re.sub(r'(?i)', '\n', html) html = re.sub(r'(?m))?\s*([^\\/:*?<>"|%]+\.[^\\/:*?<>"|%]{3,4})', self.wiki) match = re.search(r'= (\b[\w ]+\b)+.(gif|jpg|jpeg|png|bmp)', self.wiki_article) if match: image_url = url_image + '%s.%s' % match.groups() image_url = re.sub(' ', '_', image_url) return image_url return None