#!/usr/bin/env python
# coding: utf-8

# # Co-Occurring Tag Analysis
# 
# Analysing how tags co-occur across various Parliamentary publications. The idea is to see whether there are naturally occurring groupings of topic tags, by virtue of their co-occurrence when used to tag different classes of Parliamentary publication.
# 
# The data is provided as a set of Linked Data triples exported as *Turtle* (`.ttl`) data files. The data represents, among other things, Parliamentary resources (such as early day motions or other proceedings records) and the subject/topic labels they are tagged with.
# 
# The data allows us to generate a graph that associates tags with resources, and from that a graph that directly associates tags with other tags by virtue of their commonly tagging the same resource or set of resources.

# In[2]:

#Data files
get_ipython().system('ls ../data/dataexport')


# ## Utils

# Import a library that lets us work with the data files:

# In[3]:

#Data is provided as Turtle/ttl files - rdflib handles those
#!pip3 install rdflib
from rdflib import Graph


# Simple utility to load all the `.ttl` files in a particular directory into a graph:

# In[4]:

import os

def ttl_graphbuilder(path, g=None, debug=False):
    #We can add the triples to an existing graph or create a new one for them
    if g is None:
        g = Graph()
    #Loop through all the files in the directory and then load the ones that have a .ttl suffix
    for ttl in [f for f in os.listdir(path) if f.endswith('.ttl')]:
        if debug: print(ttl)
        g.parse('{}/{}'.format(path, ttl), format='turtle')
    return g


# Tools for running queries over a graph and either printing the result or putting it into a `pandas` dataframe:

# In[5]:

def rdfQuery(graph, q):
    ans = graph.query(q)
    for row in ans:
        for el in row:
            print(el, end=" ")
        print()

#ish via https://github.com/schemaorg/schemaorg/blob/sdo-callisto/scripts/dashboard.ipynb
import pandas as pd

def sparql2df(graph, q, cast_to_numeric=True):
    a = graph.query(q)
    c = []
    for b in a.bindings:
        rowvals = []
        for k in a.vars:
            rowvals.append(b[k])
        c.append(rowvals)

    df = pd.DataFrame(c)
    df.columns = [str(v) for v in a.vars]
    if cast_to_numeric:
        df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))
    return df


# Tools to support the export and display of graphs - the `networkx` package is handy in this respect, eg for exporting to GEXF format for use with Gephi. We can also run projections on the graph quite easily.

# In[6]:

import networkx as nx


# ## Exploring the Data - Terms

# In[7]:

path = '../data/dataexport/terms'
termgraph = ttl_graphbuilder(path)


# In[8]:

#What's in the graph generally?
q = '''
SELECT DISTINCT ?x ?y ?z {
    ?x ?y ?z.
} LIMIT 10
'''
rdfQuery(termgraph, q)


# In[9]:

#What does a term have associated with it more specifically?
q = '''
SELECT DISTINCT ?y ?z {
    ?y ?z.
} LIMIT 10
'''
rdfQuery(termgraph, q)


# Looks like the `prefLabel` is what we want:

# In[10]:

q = '''
SELECT DISTINCT ?z ?topic {
    ?z ?topic.
} LIMIT 10
'''
sparql2df(termgraph, q)


# ## Exploring the Data - EDMs

# In[11]:

path = '../data/dataexport/edms'
g = ttl_graphbuilder(path)


# In[12]:

#See what's there generally...
q = '''
SELECT DISTINCT ?x ?y ?z {
    ?x ?y ?z.
} LIMIT 10
'''
rdfQuery(g, q)


# In[13]:

#Explore a specific EDM
q = '''
SELECT DISTINCT ?y ?z {
    ?y ?z.
}
'''
rdfQuery(g, q)


# Let's merge the EDM graph data with the terms data.

# In[15]:

path = '../data/dataexport/edms'
g = ttl_graphbuilder(path, termgraph)


# Now we can look at the term labels associated with a particular EDM.

# In[16]:

q = '''
SELECT DISTINCT ?t ?z {
    ?z.
    ?z ?t.
} LIMIT 10
'''
rdfQuery(g, q)


# We can also create a table that links topic labels with EDMs.

# In[17]:

q = '''
SELECT DISTINCT ?edms ?topic {
    ?edms .
    ?edms ?z.
    ?z ?topic.
}
'''
g_df = sparql2df(g, q)
g_df.head()


# From this table, we can generate a bipartite `networkx` graph that links topic labels with EDMs.

# In[18]:

nxg = nx.from_pandas_dataframe(g_df, 'edms', 'topic')
#nx.write_gexf(nxg, 'edms.gexf')


# We can then project this bipartite graph onto just the topic label nodes - edges will now connect nodes that are linked through one or more common EDMs.

# In[19]:

from networkx.algorithms import bipartite

#We can find the sets of names/tags associated with the disjoint sets in the graph
#I think the directedness of the graph means we can be reasonably sure the variable names are correctly ordered?
edms, topic = bipartite.sets(nxg)

#Collapse the bipartite graph to a graph of topic labels connected via a common EDM
topicgraph = bipartite.projected_graph(nxg, topic)
nx.write_gexf(topicgraph, 'edms_topics.gexf')


# We can also generate a weighted graph, where edges are weighted relative to how many times topics are linked through different EDMs.

# In[20]:

topicgraph_weighted = bipartite.weighted_projected_graph(nxg, topic)
nx.write_gexf(topicgraph_weighted, 'edms_topics_weighted.gexf')
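# As a quick check on the weighted projection, we can also pull out the most heavily weighted edges - the topic label pairs that co-occur on the most EDMs - directly from the `networkx` graph rather than via Gephi. (A minimal sketch; the `weight` attribute is the one set by `weighted_projected_graph`.)

# In[ ]:

#Sort the projected graph's edges by their co-occurrence count, largest first
top_edges = sorted(topicgraph_weighted.edges(data=True),
                   key=lambda e: e[2]['weight'], reverse=True)

#Display the ten most strongly co-occurring topic label pairs
for t1, t2, attrs in top_edges[:10]:
    print(t1, '<->', t2, attrs['weight'])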
# ## Predicting Topics

# In[39]:

#!pip3 install sklearn


# In[73]:

#via https://stackoverflow.com/a/19172087/454773
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer


# In[56]:

#via https://stackoverflow.com/questions/22219004/grouping-rows-in-list-in-pandas-groupby
g_df['topic'] = g_df['topic'].astype(str)
topicsbyedm_df = g_df.groupby('edms')['topic'].apply(list).to_frame().reset_index()
topicsbyedm_df.head()


# In[57]:

q = '''
SELECT DISTINCT ?edms ?motiontext {
    ?edms .
    ?edms ?motiontext.
}
'''
m_df = sparql2df(g, q)
m_df = m_df.merge(topicsbyedm_df, on='edms')
m_df.head()


# In[69]:

X_train = np.array(m_df['motiontext'][:-100].tolist())
X_test = np.array(m_df['motiontext'][-100:].tolist())


# In[70]:

target_names = g_df['topic'].astype(str).tolist()
target_names[:3]


# In[76]:

#ytrain = [[target_names.index(i) for i in t] for t in m_df['topic'][:-100]]
#ytrain[:3]
y_train_text = [t for t in m_df['topic'][:-100]]
y_train_text[:3]


# In[96]:

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_train_text)

classifier = Pipeline([
    ('vectorizer', CountVectorizer(analyzer='word', stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC()))])

classifier.fit(X_train, Y)
predicted = classifier.predict(X_test)
all_labels = mlb.inverse_transform(predicted)

hits = []
misses = []
for item, labels in zip(X_test, all_labels):
    if labels != ():
        hits.append('{0} => {1}'.format(item, ', '.join(labels)))
    else:
        misses.append('{0} => {1}'.format(item, ', '.join(labels)))

print("some hits:\n{}\n\nsome misses:\n{}".format('\n'.join(hits[:3]), '\n'.join(misses[:3])))


# In[94]:

labels


# ## Exploring the Data - proceedings

# In[ ]:

path = '../data/dataexport/proceedings'
p = ttl_graphbuilder(path, debug=True)


# In[ ]:

get_ipython().system('ls {path}')


# In[ ]:

get_ipython().system('cat {path}/0006D323-D0B5-4E22-A26E-75ABB621F58E.ttl')
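# Rather than eyeballing individual files, we can get a feel for the vocabulary used in the proceedings data by listing the distinct predicates that appear in the graph. (A minimal sketch, reusing the `rdfQuery` helper defined above.)

# In[ ]:

#List the distinct predicates used across the proceedings triples
q = '''
SELECT DISTINCT ?p {
    ?x ?p ?y.
} LIMIT 50
'''
rdfQuery(p, q)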
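# Circling back to the topic classifier: eyeballing hits and misses only goes so far, so we could also score the predictions against the held-out topic labels. (A minimal sketch; note that `MultiLabelBinarizer.transform` warns about, and ignores, any test set labels it didn't see during training.)

# In[ ]:

from sklearn.metrics import f1_score

#Binarize the held-out topic labels using the already-fitted binarizer
y_test_text = [t for t in m_df['topic'][-100:]]
Y_test = mlb.transform(y_test_text)

#Micro-averaged F1 over all (EDM, topic label) assignments
print(f1_score(Y_test, predicted, average='micro'))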