# Import packages to extend the functionality of basic Python

# Let Python 2 behave like Python 3
from __future__ import division, unicode_literals, print_function

# Utility data structures
from collections import Counter

# Plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

# Natural language processing package
import nltk
import nltk.corpus

# Numerical
import numpy as np

# Operating system functions
import os

# Data analysis package
import pandas as pd

# Kenneth Reitz' module to download data from the Web (instead of urllib)
import requests

# Specify a URL to download information from the Semantic MediaWiki 'WikiLit' site.
# Such a query can be constructed on the web site and the resulting URL copy-and-pasted here.
url = ("http://wikilit.referata.com/"
       "wiki/Special:Ask/"
       "-5B-5BCategory:Publications-5D-5D/"
       "-3FHas-20author%3DAuthor(s)/-3FYear/"
       "-3FPublished-20in/-3FAbstract/-3FHas-20topic%3DTopic(s)/"
       "-3FHas-20domain%3DDomain(s)/"
       "format%3D-20csv/limit%3D-20600/offset%3D0")

# Download and read data as comma-separated values (CSV) into a Pandas DataFrame
documents = pd.read_csv(url)

filename = os.path.expanduser('~/data/dtu02819/wikilit.csv')

# Write to a comma-separated values file on the local file system
documents.to_csv(filename)

# Read the comma-separated values file back from the local file system
documents = pd.read_csv(filename, index_col=0)

documents.head()

# Example of word tokenization of the first sentence in the first abstract
sentences = nltk.sent_tokenize(documents.loc[0, 'Abstract'])

# Show the tokens of the first sentence as a Python list of strings
print(nltk.word_tokenize(sentences[0]))

# Tokenize all the text and count tokens

# An extra attribute to contain the new data in the documents object
documents.data = []

# Token counter
token_counts = Counter()

# Iterate over all documents, all sentences, and all words
for abstract in documents['Abstract']:
    datum = {}
    datum['sentences'] = nltk.sent_tokenize(abstract)
    datum['tokenlist'] = [word.lower() for sent in datum['sentences']
                          for word in nltk.word_tokenize(sent)]
    token_counts.update(datum['tokenlist'])
    documents.data.append(datum)

# The five most common tokens in the entire WikiLit corpus
token_counts.most_common(5)

# Read the stopword list ('the', 'a', 'for', ...)
stopwords = nltk.corpus.stopwords.words('english')

# Keep only alphabetic tokens that occur more than twice and are not stopwords
relevant_tokens = {token: count for token, count in token_counts.items()
                   if count > 2 and token not in stopwords and token.isalpha()}

# Show the most common tokens in the reduced token set
Counter(relevant_tokens).most_common(5)

# Exclude the word 'wikipedia'
relevant_tokens.pop('wikipedia', 0)

# Construct a dense document-term matrix with word counts in the elements
# as a Numpy matrix
tokens = list(relevant_tokens.keys())  # as a list, so it can be indexed
M = np.asmatrix(np.zeros([len(documents), len(tokens)]))
for n in range(len(documents)):
    for m, token in enumerate(tokens):
        M[n, m] = documents.data[n]['tokenlist'].count(token)

M.shape
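# Aside (not part of the original analysis): a minimal sketch of how a
# comparable document-term matrix could be built with scikit-learn's
# CountVectorizer, assuming scikit-learn is installed. The vocabulary is
# restricted to the tokens kept above so the columns line up with M;
# scikit-learn's default tokenization differs slightly from NLTK's, so
# small count differences are to be expected.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(vocabulary=tokens, lowercase=True)
M_sparse = vectorizer.fit_transform(documents['Abstract'])  # sparse matrix
M_sparse.shape  # should also be (number of documents, number of tokens)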
# Value of the element in the first row in the column corresponding to the word 'software'
M[0, tokens.index('software')]

# Plot part of the matrix as an image
plt.imshow(M[:100, :100], cmap=cm.gray_r, interpolation='nearest')
plt.xlabel('Tokens')
plt.ylabel('Documents')
plt.show()


def nmf(M, components=5, iterations=500):
    """Factorize matrix with non-negative matrix factorization."""
    # Initialize the two factor matrices with random non-negative values
    W = np.asmatrix(np.random.random([M.shape[0], components]))
    H = np.asmatrix(np.random.random([components, M.shape[1]]))
    for n in range(0, iterations):
        # Multiplicative update rules; the small constant avoids division by zero
        H = np.multiply(H, (W.T * M) / (W.T * W * H + 0.001))
        W = np.multiply(W, (M * H.T) / (W * (H * H.T) + 0.001))
    return (W, H)


# Perform the actual computation
W, H = nmf(M, iterations=50, components=3)

W.max(), H.max()

# Show the results in some format - this could be written nicer, e.g., using Jinja2
for component in range(W.shape[1]):
    print("=" * 80)
    print("COMPONENT %d: " % (component + 1,))
    indices = (-H[component, :]).getA1().argsort()
    print(" - ".join([tokens[i] for i in indices[:6]]))
    print("-")
    indices = (-W[:, component]).getA1().argsort()
    print("\n".join([documents.iloc[i, 0][:80] for i in indices[:5]]))
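# Aside (not part of the original analysis): a rough cross-check of the
# hand-written factorization against scikit-learn's NMF implementation,
# assuming scikit-learn is installed. The solver and initialization differ,
# so the components will only match the ones above qualitatively.
from sklearn.decomposition import NMF

model = NMF(n_components=3, init='random', random_state=0, max_iter=500)
W_sk = model.fit_transform(np.asarray(M))  # document-by-component weights
H_sk = model.components_                   # component-by-token weights
W_sk.shape, H_sk.shape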