#!/usr/bin/env python
# coding: utf-8

# # Co-Occurring Tag Analysis
# 
# Analysing how tags co-occur across various Parliamentary publications. The idea is to see whether there are naturally occurring groupings of topic tags, by virtue of their co-occurrence when used to tag different classes of Parliamentary publication.
# 
# The data is provided as a set of Linked Data triples exported as *Turtle* (`.ttl`) data files. The data represents, among other things, Parliamentary resources (such as early day motions or other proceedings records) and the subject/topic labels they are tagged with.
# 
# The data allows us to generate a graph that associates tags with resources, and from that a graph that directly associates tags with other tags by virtue of their commonly tagging the same resource or set of resources.

# In[2]:

#Data files
get_ipython().system('ls ../data/dataexport')


# ## Utils

# Import a library that lets us work with the data files:

# In[3]:

#Data is provided as Turtle/ttl files - rdflib handles those
#!pip3 install rdflib
from rdflib import Graph


# Simple utility to load all the `.ttl` files in a particular directory into a graph:

# In[4]:

import os

def ttl_graphbuilder(path, g=None, debug=False):
    #We can add the triples to an existing graph or create a new one for them
    if g is None:
        g = Graph()
    #Loop through all the files in the directory and then load the ones that have a .ttl suffix
    for ttl in [f for f in os.listdir(path) if f.endswith('.ttl')]:
        if debug: print(ttl)
        g.parse('{}/{}'.format(path, ttl), format='turtle')
    return g


# Tools for running queries over a graph and either printing the result or putting it into a `pandas` dataframe:

# In[5]:

def rdfQuery(graph, q):
    ans = graph.query(q)
    for row in ans:
        for el in row:
            print(el, end=" ")
        print()

#ish via https://github.com/schemaorg/schemaorg/blob/sdo-callisto/scripts/dashboard.ipynb
import pandas as pd

def sparql2df(graph, q, cast_to_numeric=True):
    a = graph.query(q)
    c = []
    for b in a.bindings:
        rowvals = []
        for k in a.vars:
            rowvals.append(b[k])
        c.append(rowvals)

    df = pd.DataFrame(c)
    df.columns = [str(v) for v in a.vars]
    if cast_to_numeric:
        df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
    return df


# Tools to support the export and display of graphs - the `networkx` package is handy in this respect, eg for exporting to GEXF format for use with Gephi. We can also run projections on the graph quite easily.

# In[6]:

import networkx as nx


# ## Exploring the Data - Terms

# In[7]:

path = '../data/dataexport/terms'
termgraph = ttl_graphbuilder(path)


# In[8]:

#What's in the graph generally?
q = '''
SELECT DISTINCT ?x ?y ?z {
    ?x ?y ?z.
} LIMIT 10
'''
rdfQuery(termgraph, q)


# In[9]:

#What does a term have associated with it more specifically?
q = '''
SELECT DISTINCT ?y ?z {
    ?y ?z.
} LIMIT 10
'''
rdfQuery(termgraph, q)


# Looks like the `prefLabel` is what we want:

# In[10]:

q = '''
SELECT DISTINCT ?z ?topic {
    ?z ?topic.
} LIMIT 10
'''
sparql2df(termgraph, q)


# ## Exploring the Data - EDMs

# In[11]:

path = '../data/dataexport/edms'
g = ttl_graphbuilder(path)


# In[12]:

#See what's there generally...
q = '''
SELECT DISTINCT ?x ?y ?z {
    ?x ?y ?z.
} LIMIT 10
'''
rdfQuery(g, q)


# In[13]:

#Explore a specific EDM
q = '''
SELECT DISTINCT ?y ?z {
    ?y ?z.
}
'''
rdfQuery(g, q)


# Let's merge the EDM graph data with the terms data.

# In[15]:

path = '../data/dataexport/edms'
g = ttl_graphbuilder(path, termgraph)


# Now we can look at the term labels associated with a particular EDM.

# In[16]:

q = '''
SELECT DISTINCT ?t ?z {
    ?z.
    ?z ?t.
} LIMIT 10
'''
rdfQuery(g, q)


# We can also create a table that links topic labels with EDMs.

# In[17]:

q = '''
SELECT DISTINCT ?edms ?topic {
    ?edms .
    ?edms ?z.
    ?z ?topic.
}
'''
g_df = sparql2df(g, q)
g_df.head()


# From this table, we can generate a bipartite `networkx` graph that links topic labels with EDMs.

# In[18]:

nxg = nx.from_pandas_dataframe(g_df, 'edms', 'topic')
#nx.write_gexf(nxg, 'edms.gexf')


# We can then project this bipartite graph onto just the topic label nodes - edges will now connect nodes that are linked through one or more common EDMs.

# In[19]:

from networkx.algorithms import bipartite

#We can find the sets of names/tags associated with the disjoint sets in the graph
#I think the directedness of the graph means we can be reasonably sure the variable names are correctly ordered?
edms, topic = bipartite.sets(nxg)

#Collapse the bipartite graph to a graph of topic labels connected via a common EDM
topicgraph = bipartite.projected_graph(nxg, topic)
nx.write_gexf(topicgraph, 'edms_topics.gexf')


# We can also generate a weighted graph, where edges are weighted relative to how many times topics are linked through different EDMs.

# In[20]:

topicgraph_weighted = bipartite.weighted_projected_graph(nxg, topic)
nx.write_gexf(topicgraph_weighted, 'edms_topics_weighted.gexf')
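# As a quick check on the weighted projection, we can also pull out the most heavily weighted edges - the topic label pairs that co-occur on the most EDMs - directly from the `networkx` graph rather than via Gephi. (A minimal sketch; the `weight` attribute is the one set by `weighted_projected_graph`.)

# In[ ]:

#Sort the projected graph's edges by their co-occurrence count, largest first
top_edges = sorted(topicgraph_weighted.edges(data=True),
                   key=lambda e: e[2]['weight'], reverse=True)

#Display the ten most strongly co-occurring topic label pairs
for t1, t2, attrs in top_edges[:10]:
    print(t1, '<->', t2, attrs['weight'])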
# ## Predicting Topics

# In[39]:

#!pip3 install sklearn


# In[73]:

#via https://stackoverflow.com/a/19172087/454773
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer


# In[56]:

#via https://stackoverflow.com/questions/22219004/grouping-rows-in-list-in-pandas-groupby
g_df['topic'] = g_df['topic'].astype(str)
topicsbyedm_df = g_df.groupby('edms')['topic'].apply(list).to_frame().reset_index()
topicsbyedm_df.head()


# In[57]:

q = '''
SELECT DISTINCT ?edms ?motiontext {
    ?edms .
    ?edms ?motiontext.
}
'''
m_df = sparql2df(g, q)
m_df = m_df.merge(topicsbyedm_df, on='edms')
m_df.head()


# In[69]:

X_train = np.array(m_df['motiontext'][:-100].tolist())
X_test = np.array(m_df['motiontext'][-100:].tolist())


# In[70]:

target_names = g_df['topic'].astype(str).tolist()
target_names[:3]


# In[76]:

#ytrain = [[target_names.index(i) for i in t] for t in m_df['topic'][:-100]]
#ytrain[:3]
y_train_text = [t for t in m_df['topic'][:-100]]
y_train_text[:3]


# In[96]:

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_train_text)

classifier = Pipeline([
    ('vectorizer', CountVectorizer(analyzer='word', stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC()))])

classifier.fit(X_train, Y)
predicted = classifier.predict(X_test)
all_labels = mlb.inverse_transform(predicted)

hits = []
misses = []
for item, labels in zip(X_test, all_labels):
    if labels != ():
        hits.append('{0} => {1}'.format(item, ', '.join(labels)))
    else:
        misses.append('{0} => {1}'.format(item, ', '.join(labels)))

print("some hits:\n{}\n\nsome misses:\n{}".format('\n'.join(hits[:3]), '\n'.join(misses[:3])))


# In[94]:

labels


# ## Exploring the Data - proceedings

# In[ ]:

path = '../data/dataexport/proceedings'
p = ttl_graphbuilder(path, debug=True)


# In[ ]:

get_ipython().system('ls {path}')


# In[ ]:

get_ipython().system('cat {path}/0006D323-D0B5-4E22-A26E-75ABB621F58E.ttl')
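# Rather than eyeballing individual files, we can get a feel for the vocabulary used in the proceedings data by listing the distinct predicates that appear in the graph. (A minimal sketch, reusing the `rdfQuery` helper defined above.)

# In[ ]:

#List the distinct predicates used across the proceedings triples
q = '''
SELECT DISTINCT ?p {
    ?x ?p ?y.
} LIMIT 50
'''
rdfQuery(p, q)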
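# Circling back to the topic classifier: eyeballing hits and misses only goes so far, so we could also score the predictions against the held-out topic labels. (A minimal sketch; note that `MultiLabelBinarizer.transform` warns about, and ignores, any test set labels it didn't see during training.)

# In[ ]:

from sklearn.metrics import f1_score

#Binarize the held-out topic labels using the already-fitted binarizer
y_test_text = [t for t in m_df['topic'][-100:]]
Y_test = mlb.transform(y_test_text)

#Micro-averaged F1 over all (EDM, topic label) assignments
print(f1_score(Y_test, predicted, average='micro'))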