#imports
from bs4 import BeautifulSoup, NavigableString, Tag
import nltk
import pandas as pd
import re
from itertools import islice
from collections import Counter
import os

# Our tools
from get_wiki_links import WikiUrlFetch, WikiUrlFetchNonDBPedia
from get_wiki_text import Wiki2Plain

PATH = 'Chapter1.xml'
soup = BeautifulSoup(open(PATH, 'rt').read())

book_tags = set([tag.name for tag in soup.findAll(True)])
print(book_tags)

# The following script looks for terms within select tags and creates a Pandas DataFrame object.
# The DataFrame consists of "count" (term counter), "tag" (the XML tag for which the term is an attribute),
# and "term" (the term found within the tag).

list_for_df = []
tags = ['keyword','author','indexterm','orgname','personname','phrase']
for tag in tags:
    for i in soup.findAll(tag):
        if i.string == None:
            list_for_df.append({ 'term':" ".join([ child.lower().encode('utf-8') for child in i.stripped_strings ]), \
                                'tag': tag, 'count': 1 })
        else:
            list_for_df.append({ 'term': re.sub('  +','', i.string.lower().encode('utf-8')), 'tag': tag, 'count': 1 })
        
book_df = pd.DataFrame(list_for_df)

# First 10 entries of the DataFrame.
book_df[0:10]

# We used NLTK (Natural Language Toolkit) to identify terms within the text that are not markedup
# with XML.

# First we create one large text by stripping the markup from XML.
all_xml =  " ".join([ line for line in soup.stripped_strings])
text = nltk.clean_html(all_xml)

# We then clean the text through string subsitutions.
text = re.sub(r"[\n\.,\(\)\?\!\-':]","",text)

# Finally, we use nltk.pos_tag to insert part-of-speech tags to each word in the text.
tagged_text = nltk.pos_tag(text.split(' '))

pos = ['NN','NNS','NNP','NNPS'] ## nltk noun tags

terms = []
chain = []
for term in tagged_text: ## loop through tagged text (in its original order)
    
    if term[1] in pos: ## if a term is in our pos list, append the term to the chain and to the noun list
        chain.append(term)
        terms.append(term[0].lower().encode('UTF-8'))
        terms.append(" ".join([ item[0].lower().encode('UTF-8') for item in chain ]))
        
    else: ## at the end of a sequence of nouns, append the chain to the noun list and reset the chain
        if len(chain) > 0:
            terms.append(" ".join([ item[0].lower().encode('UTF-8') for item in chain ]))
            chain = []
            
terms = [ re.sub(' +',' ',term) for term in terms if term != '' ] # clean extra spaces from noun chains
terms = [ re.sub('^ ','',term) for term in terms ] 

## sample of terms scrapped
print terms[100:130]

terms_for_df = []
term_count = Counter(terms)

# count_threshold should be a number that returns a good amount of information within a reasonable amount of time.
# This depends on what the reader is looking for, and how much computational power they have.
# Theoretically speaking, it could be 0 if a reader wants to see all terms, and can process that amount of data on their computer.
count_threshold = 2

for term,count in term_count.iteritems():
    if count >= count_threshold:
        terms_for_df.append({ 'term': term,'count': count })
        #print term, count

# check state of original book_df (366 terms)
book_df

## create list of dict for appending to df
list_for_df = []
for item in terms_for_df:
    if item['term'] != "" and item['term'] != " ":
        list_for_df.append({ 'term': item['term'], 'count': item['count'], 'tag': 'raw'})

book_df = book_df.append(pd.DataFrame(list_for_df))

## check the raw terms that were appended
df_rawterms = book_df[book_df['tag']=='raw']

df_rawterms

# check state of new book_df (3229 terms)
book_df

## drop duplicated terms
book_df.drop_duplicates()

from get_wiki_links import WikiUrlFetch, WikiUrlFetchNonDBPedia

## example query with WikiUrlFetch
w = WikiUrlFetch()
print w.fetch_wiki("Nunberg Geoff")

from itertools import islice
import time

list_for_wiki_df = []
w = WikiUrlFetch()

for row in islice(book_df.iterrows(),1000): # iterations limited 100 because WikiUrlFetch because of API limitations

    wikis = w.fetch_wiki(row[1]['term'])
    
    for wiki in wikis:
        print wiki
        if wiki['match'] == 'exact' or wiki['match'] == 'good-partial' or wiki['match'] == 'partial':
            list_for_wiki_df.append( { 'term': row[1]['term'], 'matched_term': wiki['term'], \
            'count': row[1]['count'], 'match': wiki['match'], 'url': wiki['wiki_url'] } )

print len(list_for_wiki_df)
new_cols_df = pd.DataFrame(list_for_wiki_df)

## extract the exact matches
exact_match = new_cols_df[new_cols_df['match'] == 'exact']
exact_match[:10]

## extract the good matches
good_match = new_cols_df[new_cols_df['match'] == 'good-partial']

## extract the partial matches
partial_match = new_cols_df[new_cols_df['match'] == 'partial']

## Create a DataFrame with only good-partial and exact match terms
matched_terms = good_match.append(exact_match)

matched_terms

## first 10 matched_terms
matched_terms[:10]

## function using Wiki2Plain results to create two new DataFrame columns using apply
def get_wiki_textimage(url):
    print url
    try:
        wiki = Wiki2Plain(url)
    except:
        return "none","none"
    return wiki.text, wiki.image()

## apply and use tuple result to create two DataFrames
## from http://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/#create

matched_terms['wiki_text'],matched_terms['wiki_image'] = zip(*matched_terms['url'].apply(get_wiki_textimage))

## first ten terms
matched_terms[:10]

# http://ipython.org/ipython-doc/dev/api/generated/IPython.core.display.html#IPython.core.display.Image
# width is IPython 14+, this is IPython version 0.13.1
from IPython.core.display import Image
print matched_terms.ix[243]['wiki_text']
Image(url=matched_terms.ix[243]['wiki_image'])

## load imports and API Keys
from flickr_api.auth import AuthHandler
from flickr_api import FlickrError
import flickr_api
from tidings import Guardian, NewYorkTimes

def load_api_keys():
    if os.path.exists('settings.py'):
        from settings import FLICKR_KEY, FLICKR_SECRET, guardian_key, nyt_key
        secrets = {'api_key': FLICKR_KEY, 'api_secret': FLICKR_SECRET}
        news_keys = {'guardian_key': guardian_key, 'nyt_key': nyt_key }
        return secrets, news_keys
    else:
        print("settings.py not found, add to directory or run without flickr and news")
        sys.exit()

secrets, news_keys = load_api_keys()

def get_flickr_url(term):
    flickr_api.set_keys(**secrets)
    photos = flickr_api.Photo.search(tags=term, sort='date-posted-desc')
 
    if len(photos) == 0 or type(photos) == dict:
        return []
    
    flickr_urls = []
    for photo in photos[:3]:
        try:
            flickr_urls.append("http://farm{farm}.staticflickr.com/{server}/{id}_{secret}_m.jpg".format(**photo.getInfo()))
        except FlickrError:
            pass

    return flickr_urls

matched_terms['flickr_urls'] = matched_terms['term'].apply(get_flickr_url)

## which flickr images did we get for the term 'museums'?
print matched_terms.ix[565]['term']
flickr_url = matched_terms.ix[565]['flickr_urls'][1]
Image(url=flickr_url)

## blocked from API?
def get_news_url(term):
    g = Guardian(news_keys['guardian_key'])
    print term
    g_links = g.query(re.sub("\W","",term), from_date='2013-01-01', to_date='2013-4-30')
    return g_links[:3]

matched_terms['news_urls'], = matched_terms['term'].apply(get_news_url)

matched_terms[:10]

## write DataFrame
import os
path = os.getcwd() + '/wiki_urls_saved.csv'
encoding = 'UTF-8'
sep = ','
matched_terms.to_csv(path,sep,encoding)

## read DataFrame
new_df = pd.read_csv(path)
new_df = new_df.drop(['Unnamed: 0'],axis=1)

#!/usr/bin/env python

from urllib2 import urlopen
import urllib2
import re
import json
from nltk import metrics
from bs4 import BeautifulSoup, Tag

class WikiUrlFetch():

    def __init__(self):
        self.cleaned_term = None

    def fetch_wiki(self,term):
        self.cleaned_term = self.clean_term(term)
        self.results = self.get_wiki_url(self.cleaned_term)
        return self.results

    def clean_term(self,term):
        return re.sub(r"[^A-Za-z0-9 ]","",term.lower())
        
    def check_dbpedia(self, term):
        api = 'http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?MaxHits=5&QueryString='
        #api = 'http://lookup.dbpedia.org/api/search.asmx/PrefixSearch?MaxHits=10&QueryString='
       
        #print term

        try:
            response = urlopen(api+term)
        except:
            return ""
    
        soup = BeautifulSoup(response.read())
    
        results = []
        for result in soup.findAll('result'):
            for child in result.children:
                if isinstance(child,Tag):
                    if child.name == 'label':
                        current_label = child.string.lower()
                    if child.name == 'uri':
                        results.append({ 'term': current_label.encode('utf-8'), 'url': child.string.encode('utf-8') })
        
        return self.rank_dbpedia_results(results,term)
    
    def normalize(self,string):
        strings = string.split(" ")
        strings.sort()
        return " ".join(strings)

    def rank_dbpedia_results(self,results,term):
        """
        logic:
            if edit distance 0, exact match
            if edit distance 1-4, good-partial match
            if edit distance > 4, partial match (results unsorted)
        """

        matches = []
        for result in results:
            matches.append([metrics.edit_distance(self.normalize(result['term']), self.normalize(term)), result])

        matches.sort()
        #print matches
        if len(matches) == 0:
            return [ { 'match': 'none', 'term': term.encode('utf-8') } ] 
        
        elif matches[0][0] == 0:
            new_results = [ matches[0][1] ]
            new_results[0]['match'] = 'exact'
            return new_results

        elif matches[0][0] <= 8:
            new_results = []
            for match in matches:
                if match[0] <= 3:
                    result = match[1]
                    result['match'] = 'good-partial'
                    new_results.append(result)

                """ # needs refinement to provide good matches
                elif match[0] <= 5:
                    words = self.normalize(result['term']).split(' ')
                    if self.normalize(term) in words:
                        result = match[1]
                        result['match'] = 'good-partial'
                        new_results.append(result)
                """
            return new_results

        else:
            new_results = []
            for result in results[0:2]:
                result['match'] = 'partial'
                new_results.append(result)
            return new_results
    
    def wiki_url(self,url):

        term = url[url.rfind('/'):]
        entity_page = 'http://dbpedia.org/data/{}.json'.format(term)
    
        wiki_type = 'http://xmlns.com/foaf/0.1/primaryTopic'
    
        try:
            response = urlopen(entity_page)
        except:
            return
    
        data = json.loads(response.read())
        for key,value in data.items():
            'http://xmlns.com/foaf/0.1/primaryTopic'
            if 'http://xmlns.com/foaf/0.1/primaryTopic' in value:
                #print key
                return key.encode('utf-8')

    def get_wiki_url(self, term):
    
        results = self.check_dbpedia(term)

        for result in results:
            if result['match'] != 'none' and result['match'] != 'partial':
                wiki = self.wiki_url(result['url'])
                result['wiki_url'] = wiki

        return results

class WikiUrlFetchNonDBPedia():

    def __init__(self):
        self.wiki_api =  'http://en.wikipedia.org/w/api.php?action=query&list=search&format=json&srsearch='
        self.cleaned_term = None

    def fetch_wiki(self,term):
        self.cleaned_term = self.clean_term(term)
        self.results = self.get_wiki_url(self.cleaned_term)
        return self.results

    def check_wikipedia_api(self,term):

        url = self.wiki_api+re.sub(' ','_',term)
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0')

        try:
            response = urlopen(request)
        except urllib2.HTTPError, e:
            print e.code
            return
        except urllib2.URLError, e:
            print e.reason
            return

        results = []
        data = json.loads(response.read())
        for key,value in data.items():
            if 'search' in value:
                if type(value['search']) == list:
                    results = self.parse_wikipedia_results(value['search'])

        if len(results) > 0:
            return self.rank_dbpedia_results(results,term)
        else:
            return [ { 'match': 'none', 'term': term.encode('utf-8') } ]

    def parse_wikipedia_results(self,results):
        base_wikipedia_url = 'http://en.wikipedia.org/wiki/'
        wiki_urls = []
        for result in results:
            if 'title' in result:
                wiki_urls.append( \
                { 'url': (base_wikipedia_url + re.sub(' ','_',result['title'])).encode('UTF-8') , \
                'term': result['title'].encode('UTF-8') } )

        return wiki_urls

    def clean_term(self,term):
        return re.sub(r"[^A-Za-z0-9 ]","",term.lower())

    def normalize(self,string):
        strings = string.lower().split(" ")
        strings.sort()
        return " ".join(strings)

    def rank_dbpedia_results(self,results,term):
        """
        logic:
            if edit distance 0, exact match
            if edit distance 1-4, good-partial match
            if edit distance > 4, partial match (results unsorted)
        """

        matches = []
        for result in results:
            matches.append([metrics.edit_distance(self.normalize(result['term']), self.normalize(term)), result])

        matches.sort()
        print term,matches
        print
        if len(matches) == 0:
            return [ { 'match': 'none', 'term': term.encode('utf-8') } ] 

        elif matches[0][0] == 0:
            new_results = [ matches[0][1] ]
            new_results[0]['match'] = 'exact'
            return new_results

        elif matches[0][0] <= 5:
            new_results = []
            for match in matches:
                if match[0] <= 3:
                    result = match[1]
                    result['match'] = 'good-partial'
                    new_results.append(result)

                # needs refinement to provide good matches
                elif match[0] <= 5:
                    words = self.normalize(result['term']).split(' ')
                    if self.normalize(term) in words:
                        result = match[1]
                        result['match'] = 'good-partial'
                        new_results.append(result)

            return new_results

        else:
            new_results = []
            for result in results[0:2]:
                result['match'] = 'partial'
                new_results.append(result)
            return new_results

    def get_wiki_url(self, term):

        results = self.check_wikipedia_api(term)

        for result in results:
            if result['match'] != 'none' and result['match'] != 'partial':
                result['wiki_url'] = result['url']
                

        #print results
        return results

# source: http://stackoverflow.com/questions/4460921/extract-the-first-paragraph-from-a-wikipedia-article-python/4461624#4461624
#!/usr/bin/env python

import re
import yaml
import urllib
import urllib2

class WikipediaError(Exception):
    pass

class Wikipedia:
    url_article = 'http://%s.wikipedia.org/w/index.php?action=raw&title=%s'
    url_image = 'http://%s.wikipedia.org/w/index.php?title=Special:FilePath&file=%s'
    url_search = 'http://%s.wikipedia.org/w/api.php?action=query&list=search&srsearch=%s&sroffset=%d&srlimit=%d&format=yaml'
    
    def __init__(self, lang):
        self.lang = lang
    
    def __fetch(self, url):
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0')
        
        try:
            result = urllib2.urlopen(request)
        except urllib2.HTTPError, e:
            raise WikipediaError(e.code)
        except urllib2.URLError, e:
            raise WikipediaError(e.reason)
        
        return result
    
    def article(self, url):
        article = url[url.rfind('/')+1:]
        url = self.url_article % (self.lang, urllib.quote_plus(article))
        content = self.__fetch(url).read()
        
        if content.upper().startswith('#REDIRECT'):
            match = re.match('(?i)#REDIRECT \[\[([^\[\]]+)\]\]', content)
            
            if not match == None:
                return self.article(match.group(1))
            
            raise WikipediaError('Can\'t found redirect article.')
        
        return content
    
    def image(self, image, thumb=None):
        url = self.url_image % (self.lang, image)
        result = self.__fetch(url)
        content = result.read()
        
        if thumb:
            url = result.geturl() + '/' + thumb + 'px-' + image
            url = url.replace('/commons/', '/commons/thumb/')
            url = url.replace('/' + self.lang + '/', '/' + self.lang + '/thumb/')
            
            return self.__fetch(url).read()
        
        return content
    
    def search(self, query, page=1, limit=10):
        offset = (page - 1) * limit
        url = self.url_search % (self.lang, urllib.quote_plus(query), offset, limit)
        content = self.__fetch(url).read()
        
        parsed = yaml.load(content)
        search = parsed['query']['search']
        
        results = []
        
        if search:
            for article in search:
                title = article['title'].strip()
                
                snippet = article['snippet']
                snippet = re.sub(r'(?m)<.*?>', '', snippet)
                snippet = re.sub(r'\s+', ' ', snippet)
                snippet = snippet.replace(' . ', '. ')
                snippet = snippet.replace(' , ', ', ')
                snippet = snippet.strip()
                
                wordcount = article['wordcount']
                
                results.append({
                    'title' : title,
                    'snippet' : snippet,
                    'wordcount' : wordcount
                })
        
        # yaml.dump(results, default_style='', default_flow_style=False,
        #     allow_unicode=True)
        return results

class Wiki2Plain:
#    url_article = 'http://%s.wikipedia.org/w/index.php?action=raw&title=%s'
    url_image = 'http://%s.wikipedia.org/w/index.php?title=Special:FilePath&file=%s'
#    url_search = 'http://%s.wikipedia.org/w/api.php?action=query&list=search&srsearch=%s&sroffset=%d&srlimit=%d&format=yaml'
    
    def __init__(self, url):
        self.wiki = Wikipedia('en')
        self.wiki_article = self.wiki.article(url)

        self.text = self.wiki_article
        self.text = self.unhtml(self.text)
        self.text = self.unwiki(self.text)
        self.text = self.punctuate(self.text)
        self.text = self.get_summary(self.text)
    
    def __str__(self):
        return self.text
    
    def unwiki(self, wiki):
        """
        Remove wiki markup from the text.
        """
        wiki = re.sub(r'(?i)\{\{IPA(\-[^\|\{\}]+)*?\|([^\|\{\}]+)(\|[^\{\}]+)*?\}\}', lambda m: m.group(2), wiki)
        wiki = re.sub(r'(?i)\{\{Lang(\-[^\|\{\}]+)*?\|([^\|\{\}]+)(\|[^\{\}]+)*?\}\}', lambda m: m.group(2), wiki)
        wiki = re.sub(r'\{\{[^\{\}]+\}\}', '', wiki)
        wiki = re.sub(r'(?m)\{\{[^\{\}]+\}\}', '', wiki)
        wiki = re.sub(r'(?m)\{\|[^\{\}]*?\|\}', '', wiki)
        wiki = re.sub(r'(?i)\[\[Category:[^\[\]]*?\]\]', '', wiki)
        wiki = re.sub(r'(?i)\[\[Image:[^\[\]]*?\]\]', '', wiki)
        wiki = re.sub(r'(?i)\[\[File:[^\[\]]*?\]\]', '', wiki)
        wiki = re.sub(r'\[\[[^\[\]]*?\|([^\[\]]*?)\]\]', lambda m: m.group(1), wiki)
        wiki = re.sub(r'\[\[([^\[\]]+?)\]\]', lambda m: m.group(1), wiki)
        wiki = re.sub(r'\[\[([^\[\]]+?)\]\]', '', wiki)
        wiki = re.sub(r'(?i)File:[^\[\]]*?', '', wiki)
        wiki = re.sub(r'\[[^\[\]]*? ([^\[\]]*?)\]', lambda m: m.group(1), wiki)
        wiki = re.sub(r"''+", '', wiki)
        wiki = re.sub(r'(?m)^\*$', '', wiki)
        
        return wiki
    
    def unhtml(self, html):
        """
        Remove HTML from the text.
        """
        html = re.sub(r'(?i)&nbsp;', ' ', html)
        html = re.sub(r'(?i)<br[ \\]*?>', '\n', html)
        html = re.sub(r'(?m)<!--.*?--\s*>', '', html)
        html = re.sub(r'(?i)<ref[^>]*>[^>]*<\/ ?ref>', '', html)
        html = re.sub(r'(?m)<.*?>', '', html)
        html = re.sub(r'(?i)&amp;', '&', html)
        
        return html
    
    def punctuate(self, text):
        """
        Convert every text part into well-formed one-space
        separate paragraph.
        """
        text = re.sub(r'\r\n|\n|\r', '\n', text)
        text = re.sub(r'\n\n+', '\n\n', text)
        
        parts = text.split('\n\n')
        partsParsed = []
        
        for part in parts:
            part = part.strip()
            
            if len(part) == 0:
                continue
            
            partsParsed.append(part)
        
        return '\n\n'.join(partsParsed)

    def get_summary(self,text):
        text = text[:text.find('==')]
        return text

    def image(self):
        url_image = 'http://simple.wikipedia.org/w/index.php?title=Special:FilePath&file=' 
        """
        Retrieve the first image in the document.
        """
        # match = re.search(r'(?i)\|?\s*(image|img|image_flag)\s*=\s*(<!--.*-->)?\s*([^\\/:*?<>"|%]+\.[^\\/:*?<>"|%]{3,4})', self.wiki)
        match = re.search(r'= (\b[\w ]+\b)+.(gif|jpg|jpeg|png|bmp)', self.wiki_article)
        if match:
            image_url = url_image + '%s.%s' % match.groups()
            image_url = re.sub(' ', '_', image_url)
            return image_url
        
        return None