# Import packages to extend the functionality of basic Python

# Let Python 2 behave like Python 3
from __future__ import division, unicode_literals, print_function

# Utility data structures
from collections import Counter

# Plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

# Natural language processing package
import nltk
import nltk.corpus

# Numerical
import numpy as np

# Operating system functions
import os

# Data analysis package
import pandas as pd

# Kenneth Reitz' module to download data from the Web (instead of urllib)
import requests

# Specify a URL to download information from the Semantic MediaWiki 'WikiLit' site.
# Such a query can be constructed on the web site and the resulting URL copy-and-pasted here.
url = ("http://wikilit.referata.com/"
       "wiki/Special:Ask/"
       "-5B-5BCategory:Publications-5D-5D/"
       "-3FHas-20author%3DAuthor(s)/-3FYear/"
       "-3FPublished-20in/-3FAbstract/-3FHas-20topic%3DTopic(s)/"
       "-3FHas-20domain%3DDomain(s)/"
       "format%3D-20csv/limit%3D-20600/offset%3D0")

# Download and read data as comma-separated values (CSV) into a Pandas DataFrame
documents = pd.read_csv(url)

filename = os.path.expanduser('~/data/dtu02819/wikilit.csv')

# Write to a comma-separated values file on the local file system
documents.to_csv(filename)

# Read the comma-separated values file back from the local file system
documents = pd.read_csv(filename, index_col=0)

documents.head()

# Example of word tokenization of the first sentence in the first abstract
sentences = nltk.sent_tokenize(documents.loc[0, 'Abstract'])

# Show the tokens of the first sentence as a Python list of strings
print(nltk.word_tokenize(sentences[0]))

# Tokenize all the text and count tokens

# An extra attribute to contain the new data in the documents object
documents.data = []

# Token counter
token_counts = Counter()

# Iterate over all documents, all sentences, and all words
for abstract in documents['Abstract']:
    datum = {}
    datum['sentences'] = nltk.sent_tokenize(abstract)
    datum['tokenlist'] = [word.lower() for sent in datum['sentences']
                          for word in nltk.word_tokenize(sent)]
    token_counts.update(datum['tokenlist'])
    documents.data.append(datum)

# The five most common tokens in the entire WikiLit corpus
token_counts.most_common(5)

# Read the stopword list ('the', 'a', 'for', ...)
stopwords = nltk.corpus.stopwords.words('english')

# Keep only alphabetic tokens that occur more than twice and are not stopwords
relevant_tokens = {token: count for token, count in token_counts.items()
                   if count > 2 and token not in stopwords and token.isalpha()}

# Show the most common tokens in the reduced token set
Counter(relevant_tokens).most_common(5)

# Exclude the word 'wikipedia'
relevant_tokens.pop('wikipedia', 0)

# Construct a dense document-term matrix with word counts in the elements
# as a Numpy matrix
tokens = list(relevant_tokens.keys())  # as a list, so it can be indexed
M = np.asmatrix(np.zeros([len(documents), len(tokens)]))
for n in range(len(documents)):
    for m, token in enumerate(tokens):
        M[n, m] = documents.data[n]['tokenlist'].count(token)

M.shape
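# Aside (not part of the original analysis): a minimal sketch of how a
# comparable document-term matrix could be built with scikit-learn's
# CountVectorizer, assuming scikit-learn is installed. The vocabulary is
# restricted to the tokens kept above so the columns line up with M;
# scikit-learn's default tokenization differs slightly from NLTK's, so
# small count differences are to be expected.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(vocabulary=tokens, lowercase=True)
M_sparse = vectorizer.fit_transform(documents['Abstract'])  # sparse matrix
M_sparse.shape  # should also be (number of documents, number of tokens)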
# Value of the element in the first row in the column corresponding to the word 'software'
M[0, tokens.index('software')]

# Plot part of the matrix as an image
plt.imshow(M[:100, :100], cmap=cm.gray_r, interpolation='nearest')
plt.xlabel('Tokens')
plt.ylabel('Documents')
plt.show()


def nmf(M, components=5, iterations=500):
    """Factorize matrix with non-negative matrix factorization."""
    # Initialize the two factor matrices with random non-negative values
    W = np.asmatrix(np.random.random([M.shape[0], components]))
    H = np.asmatrix(np.random.random([components, M.shape[1]]))
    for n in range(0, iterations):
        # Multiplicative update rules; the small constant avoids division by zero
        H = np.multiply(H, (W.T * M) / (W.T * W * H + 0.001))
        W = np.multiply(W, (M * H.T) / (W * (H * H.T) + 0.001))
    return (W, H)


# Perform the actual computation
W, H = nmf(M, iterations=50, components=3)

W.max(), H.max()

# Show the results in some format - this could be written nicer, e.g., using Jinja2
for component in range(W.shape[1]):
    print("=" * 80)
    print("COMPONENT %d: " % (component + 1,))
    indices = (-H[component, :]).getA1().argsort()
    print(" - ".join([tokens[i] for i in indices[:6]]))
    print("-")
    indices = (-W[:, component]).getA1().argsort()
    print("\n".join([documents.iloc[i, 0][:80] for i in indices[:5]]))
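# Aside (not part of the original analysis): a rough cross-check of the
# hand-written factorization against scikit-learn's NMF implementation,
# assuming scikit-learn is installed. The solver and initialization differ,
# so the components will only match the ones above qualitatively.
from sklearn.decomposition import NMF

model = NMF(n_components=3, init='random', random_state=0, max_iter=500)
W_sk = model.fit_transform(np.asarray(M))  # document-by-component weights
H_sk = model.components_                   # component-by-token weights
W_sk.shape, H_sk.shape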