This notebook uses the Linked Open Data repository of the Biblioteca Virtual Miguel de Cervantes.
The example is based on the journal Doxa. Cuadernos de Filosofía del Derecho, a periodical published every year since 1984 to promote the exchange between philosophers of law from Latin America and Latin Europe. The information regarding this publication, including metadata and text, has been published as LOD in the repository and is accessible by means of the SPARQL endpoint.
We start by introducing the concepts used in this example.
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import pickle
import re
import os
from pathlib import Path
import requests
from collections import Counter
import matplotlib.pyplot as plt
from numpy import mean, ones
from scipy.sparse import csr_matrix
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
Relationships between the resources are described using RDA. Manifestations representing journals, volumes and articles are linked by means of the property rdam:wholePartManifestationRelationship.
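As a minimal sketch of how this property is queried (assuming, as in the queries below, that the endpoint returns the results as CSV text), we can list the volumes that are directly part of the journal manifestation:

import requests

url = 'http://data.cervantesvirtual.com/bvmc-lod/repositories/data'
# every volume is linked to the journal manifestation 237680 by the whole-part property
query = """
PREFIX rdam: <http://rdaregistry.info/Elements/m/>
select ?num ?numTitle
where {
  ?num rdam:wholePartManifestationRelationship <http://data.cervantesvirtual.com/manifestation/237680> .
  ?num rdam:title ?numTitle .
}
"""
r = requests.get(url, params={'format': 'text/plain', 'query': query})
print(r.text[:500])  # preview the first rows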
We will create a CSV file containing the results. By means of the VALUES clause we can restrict the SPARQL query to particular years, as shown below.
url = 'http://data.cervantesvirtual.com/bvmc-lod/repositories/data'
query = """
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdaa: <http://rdaregistry.info/Elements/a/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdam: <http://rdaregistry.info/Elements/m/>
PREFIX rda: <http://www.rdaregistry.info/>
PREFIX rdai: <http://rdaregistry.info/Elements/i/>
select ?num ?numTitle ?article ?articleTitle ?date ?noteEdition ?carrierCharacteristic ?pdf
where {
VALUES ?date { <http://data.cervantesvirtual.com/date/2010> <http://data.cervantesvirtual.com/date/2011> <http://data.cervantesvirtual.com/date/2012> <http://data.cervantesvirtual.com/date/2013> <http://data.cervantesvirtual.com/date/2014> <http://data.cervantesvirtual.com/date/2015>
<http://data.cervantesvirtual.com/date/2016> <http://data.cervantesvirtual.com/date/2017> <http://data.cervantesvirtual.com/date/2018>}
?num rdam:wholePartManifestationRelationship <http://data.cervantesvirtual.com/manifestation/237680> .
?num rdam:title ?numTitle .
?num rdam:dateOfPublication ?date .
?article rdam:wholePartManifestationRelationship ?num .
?article rdam:title ?articleTitle .
?article rdam:exemplarOfManifestation ?item .
?article rdam:noteOnEditionStatement ?noteEdition .
?item rdai:identifierForTheItem ?pdf .
?item rdai:itemSpecificCarrierCharacteristic ?carrierCharacteristic .
}
"""
r = requests.get(url, params={'format': 'text/plain', 'query': query})
# save the result to a CSV file
with open("results-doxa-dates.csv", "w") as f:
    f.write(r.text)
We can also retrieve all the articles without specifying any date.
url = 'http://data.cervantesvirtual.com/bvmc-lod/repositories/data'
query = """
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdaa: <http://rdaregistry.info/Elements/a/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdam: <http://rdaregistry.info/Elements/m/>
PREFIX rda: <http://www.rdaregistry.info/>
PREFIX rdai: <http://rdaregistry.info/Elements/i/>
select ?num ?numTitle ?article ?articleTitle ?date ?noteEdition ?carrierCharacteristic ?pdf
where {
?num rdam:wholePartManifestationRelationship <http://data.cervantesvirtual.com/manifestation/237680> .
?num rdam:title ?numTitle .
?num rdam:dateOfPublication ?date .
?article rdam:wholePartManifestationRelationship ?num .
?article rdam:title ?articleTitle .
?article rdam:exemplarOfManifestation ?item .
?article rdam:noteOnEditionStatement ?noteEdition .
?item rdai:identifierForTheItem ?pdf .
?item rdai:itemSpecificCarrierCharacteristic ?carrierCharacteristic .
}
"""
r = requests.get(url, params={'format': 'text/plain', 'query': query})
# save the result to a CSV file
with open("results-doxa.csv", "w") as f:
    f.write(r.text)
Next, we load the CSV file into a pandas DataFrame.
df = pd.read_csv('results-doxa-dates.csv')
df
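The column names follow the variables in the SELECT clause of the query; a quick check:

# the columns correspond to the SPARQL variables:
# num, numTitle, article, articleTitle, date, noteEdition, carrierCharacteristic, pdf
print(df.columns.tolist())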
Note: this step may take a while due to the size of the PDF files.
# download the PDF file of every article into the doxa folder
os.makedirs('doxa', exist_ok=True)  # make sure the folder exists
for index, row in df.iterrows():
    print(index, row['pdf'])
    response = requests.get(row['pdf'])
    filename = Path('doxa/{}.pdf'.format(row['pdf'].replace('http://www.cervantesvirtual.com/descargaPdf/', '').replace('/', '')))
    filename.write_bytes(response.content)
from tika import parser
raw = parser.from_file('doxa/agustin-squella-valparaiso.pdf')
print(raw['content'])
# extract the text of every PDF and store it in the DataFrame
for index, row in df.iterrows():
    file = 'doxa/{}.pdf'.format(row['pdf'].replace('http://www.cervantesvirtual.com/descargaPdf/', '').replace('/', ''))
    print(file)
    raw = parser.from_file(file)
    # some PDFs yield no text; store an empty string in that case
    df.loc[index, 'original_text'] = (raw['content'] or '').replace('\n', '')
df.head(3)
Dates are defined in the LOD repository using URLs such as http://data.cervantesvirtual.com/date/2000. Let's extract the year.
for index, row in df.iterrows():
    try:
        df.loc[index, 'year'] = int(row['date'].replace('http://data.cervantesvirtual.com/date/', '').replace('/', ''))
    except (ValueError, AttributeError):
        # dates that do not follow the URL pattern are left undefined
        df.loc[index, 'year'] = float('nan')
df.head(3)
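As a quick sanity check, we can count how many articles were retrieved for each year:

# number of articles per year in the sample
print(df.year.value_counts().sort_index())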
We create an auxiliary class to store the terms and their codes.
A minimal perfect hash is a bidirectional mapping between objects and consecutive integers.
class MPHash(object):
# create from iterable
def __init__(self, terms):
self.term = list(terms)
self.code = {t:n for n, t in enumerate(self.term)}
def __len__(self):
return len(self.term)
def get_code(self, term):
return self.code.get(term)
def get_term(self, code):
return self.term[code]
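A quick sanity check of the bidirectional mapping (the terms here are illustrative):

mph = MPHash(['derecho', 'justicia', 'norma'])
print(len(mph))                  # 3
print(mph.get_code('justicia')) # 1
print(mph.get_term(1))          # justicia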
The following class receives the texts and computes the statistics used to extract the emergent topics.
# A sample is a collection of texts and publication dates
# For each text, the sample stores its year and word counts.
class Sample(object):
    pattern = r"(?:\w+[-])*\w*[^\W\d_]\w*(?:[-'’`]\w+)*"
    # Create a Sample from a DataFrame with at least the columns
    # original_text and year
    # ngram_length = maximal n-gram size
    def __init__(self, data, ngram_length):
self.size = len(data)
self.year = data.year.tolist()
texts = tuple(data.original_text)
        vectorizer = CountVectorizer(token_pattern=Sample.pattern,
                                     stop_words=stopwords.words('spanish'),
                                     max_df=0.1,
                                     ngram_range=(1, ngram_length))
matrix = vectorizer.fit_transform(texts).transpose()
# remove all hapax legomena to save space
        terms = vectorizer.get_feature_names()  # renamed get_feature_names_out() in recent scikit-learn
frequencies = matrix.sum(axis=1).A1
selected = [m for m, f in enumerate(frequencies) if f > 1]
hapax_rate = 1 - len(selected) / len(frequencies)
print('Removing hapax legomena ({:.1f}%)'.format(100 * hapax_rate))
self.matrix = matrix[selected, :]
self.term_codes = MPHash([terms[m] for m in selected])
# store array with global term frequencies
self.term_frequencies = self.matrix.sum(axis=1).A1
# store doc frequencies
self.doc_frequencies = self.matrix.getnnz(axis=1)
# store most common capitalization of terms
print('Obtaining most common capitalizations')
vectorizer.lowercase = False
matrix = vectorizer.fit_transform(texts).transpose()
        terms = vectorizer.get_feature_names()  # renamed get_feature_names_out() in recent scikit-learn
frequencies = matrix.sum(axis=1).A1
forms = dict()
for t, f in zip(terms, frequencies):
low = t.lower()
if forms.get(low, (None, 0))[1] < f:
forms[low] = (t, f)
self.capitals = {k:v[0] for k, v in forms.items()}
print('Computed stats for', len(self.term_codes), 'terms')
# return the number of texts stored in this Sample
def __len__(self):
return self.size
# return term frequency of the specified term
def get_tf(self, term):
code = self.term_codes.get_code(term.lower())
return self.term_frequencies[code]
# return document frequency of the specified term
def get_df(self, term):
code = self.term_codes.get_code(term.lower())
return self.doc_frequencies[code]
# return the most frequent capitalization form
# (also for stopwords not in dictionary)
def most_frequent_capitalization(self, term):
return self.capitals.get(term.lower(), term)
    # return the average publication year of the texts containing each term
def average_year(self, period, tf_threshold=20, df_threshold=3):
docs = [n for n, y in enumerate(self.year)\
if period[0] <= y <= period[1]]
tf_matrix = self.matrix[:, docs]
tf_sum = tf_matrix.sum(axis=1).A1
df_sum = tf_matrix.getnnz(axis=1)
terms = [m for m, tf in enumerate(tf_sum)\
if tf >= tf_threshold and df_sum[m] >= df_threshold]
tf_matrix = tf_matrix[terms, :]
rows, cols = tf_matrix.nonzero()
df_matrix = csr_matrix((ones(len(rows)), (rows, cols)))
year = [self.year[n] for n in docs]
res = df_matrix @ year / df_matrix.getnnz(axis=1) # @ operator = matrix multiplication
return {self.term_codes.get_term(terms[m]):res[m] for m in range(len(res))}
    # return the document frequency per year for the specified term
def get_df_per_year(self, term):
m = self.term_codes.get_code(term)
row = self.matrix.getrow(m)
_, docs = row.nonzero()
c = Counter(map(self.year.__getitem__, docs))
return c
    # return the term frequency per year for every term
def tf_per_year(self, period=None):
rows, cols = self.matrix.nonzero()
res = {m:Counter() for m in rows}
for m, n in zip(rows, cols):
year = self.year[n]
            if period is None or period[0] <= year <= period[1]:
res[m][year] += self.matrix[m, n]
return res
def plot_tf_series(self, term, period, relative=False):
m = self.term_codes.get_code(term)
        if relative:
            # normalize by the number of texts per year
            norm = Counter(self.year)
        else:
            # absolute counts: every year weighs 1
            norm = Counter(set(self.year))
        if m is not None:  # explicit test against None: term code 0 is valid
row = self.matrix.getrow(m)
_, cols = row.nonzero()
c = Counter()
for n in cols:
year = self.year[n]
                if period is None or period[0] <= year <= period[1]:
c[year] += row[0, n]
X = sorted(c.keys())
Y = [c[x] / norm[x] for x in X]
plt.plot(X, Y, 'o-')
plt.ylim(0, 1.2 * max(Y))
plt.title(term)
else:
raise ValueError('{} is not in store'.format(term))
    # return dictionary with a list of text-years per term
    # period = pair of years (min_year, max_year) inclusive
    # keep_all = True if unlisted texts are not ignored
    def document_years(self, period=None, keep_all=True):
        rows, cols = self.matrix.nonzero()
        res = {m: list() for m in rows}
        # the optional listed attribute marks the texts to keep
        listed = getattr(self, 'listed', None)
        for m, n in zip(rows, cols):
            if keep_all or (listed is not None and listed[n]):
                year = self.year[n]
                if period is None or period[0] <= year <= period[1]:
                    res[m].append(year)
        return res
    # return dictionary with a Counter of text-years per term
def df_per_year(self, period=None, keep_all=True):
doc_years = self.document_years(period, keep_all)
return {m:Counter(v) for m, v in doc_years.items()}
    # create a plot with the document frequency of the given terms
    def plot_df(self, terms, period, keep_all=True):
        dfs = self.df_per_year(period, keep_all)
        os.makedirs('plots', exist_ok=True)  # make sure the output folder exists
        for term in terms:
            m = self.term_codes.get_code(term.lower())
            df = dfs[m]
            X = range(period[0], period[1] + 1)  # the period is inclusive
            Y = [df.get(x, 0) for x in X]
            plt.clf()
            plt.plot(X, Y)
            plt.title(term)
            filename = 'plots/{}.png'.format(term)
            print('Saving', filename)
            plt.savefig(filename, dpi=200)
# compute the average age in the specified period of documents containing
# each term with global term-frequency above tf_threshold
# and annual document frequency above df_threshold (one year at least)
    # period = optional pair of years (min_year, max_year) inclusive
def get_ages(self, period=None,
tf_threshold=20, df_threshold=3, keep_all=True):
res = dict()
doc_years = self.document_years(period, keep_all)
for m, values in doc_years.items():
term = self.term_codes.get_term(m)
if len(values) > 0:
df = Counter(values).most_common(1)[0][1]
tf = self.term_frequencies[m]
if df >= df_threshold and tf >= tf_threshold:
res[term] = mean(values)
return res
    # return the document numbers containing any term in this set of terms
    def docs_with_term(self, terms, period=None):
        rows, cols = self.matrix.nonzero()
        res = set()
        for m, n in zip(rows, cols):
            term = self.term_codes.get_term(m)
            if terms is None or term in terms:
                year = self.year[n]
                if period is None or period[0] <= year <= period[1]:
                    res.add(n)
        return res
    # return the publication years of the documents containing the term
    def search(self, term):
        m = self.term_codes.get_code(term)
        docs = self.matrix.getrow(m).nonzero()[1]
        return [self.year[n] for n in docs]
data = df
# discard rows whose extracted text is too short to be useful
data = data[data.original_text.str.len() > 40]
print('Processing', len(data), 'texts')
s = Sample(data, 2)
with open('sample-doxa.pkl', 'wb') as f:
pickle.dump(s, f)
with open('sample-doxa.pkl', 'rb') as f:
s = pickle.load(f)
print('Loaded stats for', len(s), 'texts')
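With the Sample loaded we can inspect individual terms; for instance (the term below is illustrative, any term in the vocabulary works):

term = 'justicia'  # illustrative term; replace with any entry in the vocabulary
if s.term_codes.get_code(term) is not None:
    print('term frequency:', s.get_tf(term))
    print('document frequency:', s.get_df(term))
    s.plot_tf_series(term, (2010, 2018), relative=True)
    plt.show()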
period = (2010, 2018)
ages = s.get_ages(period)
top = pd.DataFrame.from_dict(ages, orient='index').reset_index()
print(top)
top.columns = ['TERM', 'AGE']
top = top.sort_values('AGE', ascending=False)  # optionally keep the top terms with .head(250)
top['DOC FREQ'] = top.TERM.apply(s.get_df)
top['TERM FREQ'] = top.TERM.apply(s.get_tf)
# prepare to export
top['TERM'] = top.TERM.apply(s.most_frequent_capitalization)
print(top.set_index('TERM').head())
ts = pd.Timestamp.now().strftime("%Y-%m-%d_%H.%M")  # pd.datetime is deprecated in recent pandas
os.makedirs('output', exist_ok=True)  # make sure the output folder exists
filename = 'output/vocabulary_{}.xlsx'.format(ts)
with pd.ExcelWriter(filename) as writer:  # requires an Excel engine such as openpyxl
    top.set_index('TERM').to_excel(writer, sheet_name='terms')
print('vocabulary saved to', filename)