In [4]:
# Import packages to extend the functionality of basic Python

# Let Python 2 behave like Python 3
from __future__ import division, unicode_literals, print_function

# Utility data structures
from collections import Counter

# Plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

# Natural language processing package
import nltk
import nltk.corpus

# Numerical 
import numpy as np

# Operating system functions
import os

# Data analysis package
import pandas as pd

# Kenneth Reitz's module for downloading data from the Web (instead of urllib)
import requests
In [2]:
# Specify a URL for downloading information from the Semantic MediaWiki-based 'WikiLit' site.
# Such a query can be constructed on the website and the resulting URL copied and pasted here.
url = ("http://wikilit.referata.com/" 
       "wiki/Special:Ask/"
       "-5B-5BCategory:Publications-5D-5D/" 
       "-3FHas-20author%3DAuthor(s)/-3FYear/"
       "-3FPublished-20in/-3FAbstract/-3FHas-20topic%3DTopic(s)/" 
       "-3FHas-20domain%3DDomain(s)/" 
       "format%3D-20csv/limit%3D-20600/offset%3D0")
In [3]:
# Download and read data as comma-separated values (CSV) information into a Pandas DataFrame
documents = pd.read_csv(url)
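pandas can read a CSV directly from a URL, so the requests module imported above is not strictly needed here; as an alternative sketch (the io.StringIO import is an addition, not in the original notebook), the download could be done explicitly with requests:
In [ ]:
# Alternative: fetch the CSV with requests and parse the decoded text
import io

response = requests.get(url)
response.raise_for_status()    # fail early on HTTP errors
documents = pd.read_csv(io.StringIO(response.text))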
In [6]:
filename = os.path.expanduser('~/data/dtu02819/wikilit.csv')
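The ~/data/dtu02819 directory must exist before writing; a minimal sketch (not part of the original notebook) that creates it if missing:
In [ ]:
# Create the target directory if it does not already exist
directory = os.path.dirname(filename)
if not os.path.isdir(directory):
    os.makedirs(directory)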
In [7]:
# Write to a comma-separated values file at the local file system
documents.to_csv(filename)
In [12]:
# Read the comma-separated values file from the local file system
documents = pd.read_csv(filename, index_col=0)
In [14]:
documents.head()
Out[14]:
   | Unnamed: 0.1                                      | Author(s)                                    | Year | Published in                                   | Abstract                                           | Topic(s)                                           | Domain(s)
 0 | 'Wikipedia, the free encyclopedia' as a role m... | Gordon Müller-Seitz,Guido Reger              | 2010 | International Journal of Technology Management | Accounts of open source software (OSS) develop...  | Contributor motivation,Policies and governance...  | Information systems
 1 | A 'resource review' of Wikipedia                  | Cormac Lawler                                | 2006 | Counselling & Psychotherapy Research           | The article offers information on Wikipedia, a...  | Miscellaneous topics                                | Information systems
 2 | A Persian web page classifier applying a combi... | Mojgan Farhoodi,Alireza Yari,Maryam Mahmoudi | 2009 | International Journal of Information Studies   | There are many automatic classification method...  | Text classification                                 | Computer science
 3 | A Wikipedia literature review                     | Owen S. Martin                               | 2010 | ArXiv                                          | This paper was originally designed as a litera...  | Literature review                                   | Mathematics
 4 | A Wikipedia matching approach to contextual ad... | Alexander N. Pak,Chin-Wan Chung              | 2010 | World Wide Web                                 | Contextual advertising is an important part of...  | Other information retrieval topics                  | Computer science
In [16]:
# Example of word tokenization of the first sentence in the first abstract
sentences = nltk.sent_tokenize(documents.loc[0, 'Abstract'])

# Show the word tokens of the first sentence as a Python list of strings
print(nltk.word_tokenize(sentences[0]))
['Accounts', 'of', 'open', 'source', 'software', '(', 'OSS', ')', 'development', 'projects', 'frequently', 'stress', 'their', 'democratic', ',', 'sometimes', 'even', 'anarchic', 'nature', ',', 'in', 'contrast', 'to', 'for-profit', 'organisations', '.']
In [17]:
# Tokenize all the text and count tokens

# An extra attribute to hold the derived data on the documents object
documents.data = []

# Token counter
token_counts = Counter()

# Iterate over all documents, all sentences, and all words
for abstract in documents['Abstract']:
    datum = {}
    datum['sentences'] = nltk.sent_tokenize(abstract)
    datum['tokenlist'] = [word.lower() for sent in datum['sentences'] 
                                       for word in nltk.word_tokenize(sent)]
    token_counts.update(datum['tokenlist'])
    documents.data.append(datum)
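Attaching an ad-hoc .data attribute to a DataFrame works here, but pandas does not preserve such attributes through copies and most operations. A sketch of an alternative that stores the token lists in an ordinary column instead (the column name 'tokenlist' is an assumption, not part of the original notebook):
In [ ]:
# Alternative sketch: keep the token lists in a DataFrame column
documents['tokenlist'] = [[word.lower()
                           for sent in nltk.sent_tokenize(abstract)
                           for word in nltk.word_tokenize(sent)]
                          for abstract in documents['Abstract']]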
In [18]:
# The five most common tokens in the entire WikiLit corpus
token_counts.most_common(5)
Out[18]:
[('the', 4866), ('of', 3826), (',', 3785), ('.', 3585), ('and', 2856)]
In [19]:
# Read the NLTK list of English stopwords ('the', 'a', 'for', ...)
stopwords = nltk.corpus.stopwords.words('english')

# Keep alphabetic tokens that occur more than twice and are not stopwords
relevant_tokens = {token: count for token, count in token_counts.items()
                   if count > 2 and token not in stopwords and token.isalpha()}
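As a quick sanity check (not shown in the original notebook), the vocabulary sizes before and after the filtering can be compared:
In [ ]:
# Number of distinct tokens before and after the filtering
print(len(token_counts), len(relevant_tokens))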
In [20]:
# Show the most common tokens in the reduced token set
Counter(relevant_tokens).most_common(5)
Out[20]:
[('wikipedia', 1428),
 ('information', 486),
 ('knowledge', 341),
 ('articles', 279),
 ('online', 265)]
In [21]:
# Exclude the word 'wikipedia'
relevant_tokens.pop('wikipedia', 0)
Out[21]:
1428
In [22]:
# Construct a dense document-term matrix with word counts in the elements
# as a NumPy matrix
tokens = list(relevant_tokens.keys())    # as a list, so tokens can be indexed below
M = np.asmatrix(np.zeros([len(documents), len(tokens)]))
for n in range(len(documents)):
    for m, token in enumerate(tokens):
        M[n, m] = documents.data[n]['tokenlist'].count(token)
M.shape
Out[22]:
(525, 2899)
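The nested loops above call list.count once for every (document, token) pair, which is slow for a 525-by-2899 matrix. A sketch of a faster but equivalent construction that counts each document's tokens once with a Counter (the token_index lookup table is an addition, not part of the original notebook):
In [ ]:
# Faster alternative: one Counter per document and a token-to-column lookup
token_index = {token: m for m, token in enumerate(tokens)}
M = np.asmatrix(np.zeros([len(documents), len(tokens)]))
for n in range(len(documents)):
    counts = Counter(documents.data[n]['tokenlist'])
    for token, count in counts.items():
        if token in token_index:
            M[n, token_index[token]] = count
M.shape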
In [23]:
# Value of the element in the first row, in the column corresponding to the word 'software'
M[0, tokens.index('software')]
Out[23]:
1.0
In [24]:
# Plot part of the matrix as an image
plt.imshow(M[:100, :100], cmap=cm.gray_r, interpolation='nearest')
plt.xlabel('Tokens')
plt.ylabel('Documents')
plt.show()
In [25]:
def nmf(M, components=5, iterations=500):
    """Factorize matrix with non-negative matrix factorization."""
    # Initialize the two factor matrices with random non-negative values
    W = np.asmatrix(np.random.random([M.shape[0], components]))
    H = np.asmatrix(np.random.random([components, M.shape[1]]))
    # Multiplicative update rules; the small constant avoids division by zero
    for n in range(iterations):
        H = np.multiply(H, (W.T * M) / (W.T * W * H + 0.001))
        W = np.multiply(W, (M * H.T) / (W * (H * H.T) + 0.001))
    return (W, H)
In [26]:
# Perform the actual computation
W, H = nmf(M, iterations=50, components=3)
In [27]:
W.max(), H.max()
Out[27]:
(15.094951905539212, 1.8585767533145232)
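To judge whether 50 iterations were enough, the reconstruction error of the factorization can be inspected; a small sketch (not part of the original analysis) of the relative Frobenius error:
In [ ]:
# Relative reconstruction error of the non-negative factorization
print(np.linalg.norm(M - W * H) / np.linalg.norm(M))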
In [28]:
# Show the results in a simple text format - this could be written more nicely, e.g., with Jinja2
for component in range(W.shape[1]):
    print("=" * 80)
    print("COMPONENT %d: " % (component + 1,))
    indices = (-H[component, :]).getA1().argsort()
    print(" - ".join([tokens[i] for i in indices[:6]]))
    print("-")
    indices = (-W[:, component]).getA1().argsort()
    print("\n".join([documents.iloc[i, 0][:80] for i in indices[:5]]))
================================================================================
COMPONENT 1: 
authors - content - community - number - articles - analysis
-
Wikipedia - a quantitative analysis
Open content and value creation
Sharing knowledge and building communities: a narrative of the formation, develo
Extracting content holes by comparing community-type content with Wikipedia
An analysis of open content systems
================================================================================
COMPONENT 2: 
knowledge - document - clustering - linkage - topic - algorithm
-
Exploiting external/domain knowledge to enhance traditional text mining using gr
Wikitology: a novel hybrid knowledge base derived from Wikipedia
The WikiID: an alternative approach to the body of knowledge
Breaking the knowledge acquisition bottleneck through conversational knowledge m
Extracting lexical semantic knowledge from Wikipedia and Wiktionary
================================================================================
COMPONENT 3: 
information - web - use - students - search - results
-
Gender differences in information behavior concerning Wikipedia, an unorthodox i
How and why do college students use Wikipedia?
Where does the information come from? Information source use patterns in Wikiped
What is the quality of surgery-related information on the Internet? Lessons lear
Reliability of Wikipedia as a medication information source for pharmacy student