# Import packages to extend the functionality of basic Python
# Let Python 2 behave like Python 3
from __future__ import division, unicode_literals, print_function
# Utility data structures
from collections import Counter
# Plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
# Natural language processing package
import nltk
import nltk.corpus
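# The sentence/word tokenizers and the stopword list used below rely on NLTK
# data files; if these are not already installed they can be fetched once with
# nltk.download (uncomment as needed):
# nltk.download('punkt')
# nltk.download('stopwords')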
# Numerical
import numpy as np
# Operating system functions
import os
# Data analysis package
import pandas as pd
# Kenneth Reitz's 'requests' module for downloading data from the web (instead of urllib)
import requests
# Specify a URL for downloading data from 'WikiLit', a site running Semantic MediaWiki.
# Such a query can be constructed on the website and the resulting URL copied and pasted here.
url = ("http://wikilit.referata.com/"
"wiki/Special:Ask/"
"-5B-5BCategory:Publications-5D-5D/"
"-3FHas-20author%3DAuthor(s)/-3FYear/"
"-3FPublished-20in/-3FAbstract/-3FHas-20topic%3DTopic(s)/"
"-3FHas-20domain%3DDomain(s)/"
"format%3D-20csv/limit%3D-20600/offset%3D0")
# Download and read the comma-separated values (CSV) data into a Pandas DataFrame
documents = pd.read_csv(url)
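# The requests module imported above could fetch the same CSV explicitly; the
# sketch below stores the result in a separate, purely illustrative variable
# and is equivalent to letting pandas open the URL itself.
from io import StringIO
response = requests.get(url)
documents_from_requests = pd.read_csv(StringIO(response.text))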
filename = os.path.expanduser('~/data/dtu02819/wikilit.csv')
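# The target directory may not exist yet; create it before writing
# (this assumes a writable '~/data/dtu02819/' path).
if not os.path.isdir(os.path.dirname(filename)):
    os.makedirs(os.path.dirname(filename))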
# Write to a comma-separated values file at the local file system
documents.to_csv(filename)
# Read the comma-separated values file from the local file system
documents = pd.read_csv(filename, index_col=0)
documents.head()
# Example of word tokenization of the first sentence in the first abstract
sentences = nltk.sent_tokenize(documents.loc[0, 'Abstract'])
# Show the word tokens of the first sentence as a Python list of strings
print(nltk.word_tokenize(sentences[0]))
# Tokenize all the text and count tokens
# An extra attribute to hold the derived data on the documents object
# (a plain Python attribute, not a DataFrame column)
documents.data = []
# Token counter
token_counts = Counter()
# Iterate over all documents, all sentences and all words
for abstract in documents['Abstract']:
    datum = {}
    datum['sentences'] = nltk.sent_tokenize(abstract)
    datum['tokenlist'] = [word.lower() for sent in datum['sentences']
                          for word in nltk.word_tokenize(sent)]
    token_counts.update(datum['tokenlist'])
    documents.data.append(datum)
# The five most common tokens in the entire WikiLit corpus
token_counts.most_common(5)
# Load the English stopword list (e.g., 'the', 'a', 'for')
stopwords = nltk.corpus.stopwords.words('english')
# Keep tokens that occur more than twice, are alphabetic, and are not stopwords
relevant_tokens = {token: count for token, count in token_counts.items()
                   if count > 2 and token not in stopwords and token.isalpha()}
# Show the most common tokens in the reduced token set
Counter(relevant_tokens).most_common(5)
# Exclude the word 'wikipedia'
relevant_tokens.pop('wikipedia', 0)
# Construct a dense document-term matrix with word counts in the elements,
# represented as a NumPy matrix
tokens = list(relevant_tokens.keys())  # as a list, so it can be indexed below
M = np.asmatrix(np.zeros([len(documents), len(tokens)]))
for n in range(len(documents)):
    for m, token in enumerate(tokens):
        M[n, m] = documents.data[n]['tokenlist'].count(token)
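# The nested loops above recount every token for each document; an equivalent
# and faster fill, sketched here, counts each document's tokens only once
# (the 'token_to_column' mapping is introduced just for this sketch).
token_to_column = {token: m for m, token in enumerate(tokens)}
for n in range(len(documents)):
    for token, count in Counter(documents.data[n]['tokenlist']).items():
        if token in token_to_column:
            M[n, token_to_column[token]] = count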
M.shape
# Value of the element in the first row in the column corresponding to the word 'software'.
M[0, tokens.index('software')]
# Plot part of the matrix as an image
plt.imshow(M[:100, :100], cmap=cm.gray_r, interpolation='nearest')
plt.xlabel('Tokens')
plt.ylabel('Documents')
plt.show()
def nmf(M, components=5, iterations=500):
    """Factorize matrix with non-negative matrix factorization."""
    # Initialize the two factor matrices with random non-negative values
    W = np.asmatrix(np.random.random((M.shape[0], components)))
    H = np.asmatrix(np.random.random((components, M.shape[1])))
    # Multiplicative updates in the style of Lee & Seung; the small constant
    # 0.001 in the denominators guards against division by zero
    for n in range(iterations):
        H = np.multiply(H, (W.T * M) / (W.T * W * H + 0.001))
        W = np.multiply(W, (M * H.T) / (W * (H * H.T) + 0.001))
    return (W, H)
# Perform the actual computation
W, H = nmf(M, iterations=50, components=3)
W.max(), H.max()
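# One simple way to judge how well the two factors approximate M is the
# relative reconstruction error (Frobenius norm); smaller is better.
np.linalg.norm(M - W * H) / np.linalg.norm(M)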
# Show the results in a simple text format. This could be written more nicely, e.g., with Jinja2.
for component in range(W.shape[1]):
    print("=" * 80)
    print("COMPONENT %d: " % (component + 1,))
    # Tokens with the largest weights in this component
    indices = (-H[component, :]).getA1().argsort()
    print(" - ".join([tokens[i] for i in indices[:6]]))
    print("-")
    # Documents (titles) with the largest weights in this component
    indices = (-W[:, component]).getA1().argsort()
    print("\n".join([documents.iloc[i, 0][:80] for i in indices[:5]]))