This notebook helps you explore how combinators and data sources are connected. It is an early draft (about an hour's work), so expect rough edges...
#get some initial tools
import pandas as pd
import io
import numpy as np
!pip install geopandas
import geopandas as gpd
I scraped the latest version of the page from the back-end of the dataarc site that previews the combinators, data sources, and queries, saved it as a CSV, and put it on GitHub.
# Load the combinator table (scraped from the dataarc back-end) into a dataframe
url = 'https://raw.githubusercontent.com/ropitz/experiments/master/data/dataarc_combinators.csv'
combine = pd.read_csv(url, encoding='utf-8')
# display the full table
combine
# Load the SEAD per-source data from GitHub into a GeoDataFrame
url = 'https://raw.githubusercontent.com/ropitz/experiments/master/data/SEAD.json'
SEAD = gpd.read_file(url)
# peek at the first rows
SEAD.head()
# Load the environmental-threats per-source data into a GeoDataFrame
url = "https://raw.githubusercontent.com/ropitz/experiments/master/data/enviro_threats.json"
threats = gpd.read_file(url)
# peek at the first rows
threats.head()
# Load the sagas per-source data into a GeoDataFrame
url = "https://raw.githubusercontent.com/ropitz/experiments/master/data/sagas.geojson"
sagas = gpd.read_file(url)
# peek at the first rows
sagas.head()
# Replace NaNs with a placeholder string — many downstream tools choke on missing values
combine = combine.fillna('blank')
# confirm the replacement
combine.head()
# Reshape the combinator dataframe: explode the comma-separated 'topics'
# column so every concept gets its own row (other columns are duplicated).
# Index.drop takes only labels (no axis argument), so drop the column label alone.
index_cols = combine.columns.drop('topics').tolist()
reshaped = (
    combine.set_index(index_cols)
    .topics.str.split(',', expand=True)  # one column per concept
    .stack()                             # long format: one row per concept
    .reset_index()
    .rename(columns={0: 'topics'})
    .loc[:, combine.columns]             # restore original column order
)
# view the result
reshaped.head()
# Trim leading/trailing whitespace so concept labels match across sources
reshaped['topics'] = reshaped['topics'].str.strip()
# list the data sources actually mapped to combinators
reshaped['data'].unique()
# Topics mapped per source: sagas, sead, and enviro threats
sagatopics = reshaped.loc[reshaped['data'] == 'sagas']
sagatopics.head()
seadtopics = reshaped.loc[reshaped['data'] == 'sead']
seadtopics.head()
threatstopics = reshaped.loc[reshaped['data'] == 'enviro_threats_icelandic_sites']
threatstopics.head()
# libraries to make less-fancy graphs compared with the real tool
import networkx as nx
import matplotlib.pyplot as plt
# Compare the topics mapped together by sagas and sead
sagasead = pd.concat([seadtopics, sagatopics])
# Build a graph linking data sources to their topics
G1 = nx.from_pandas_edgelist(sagasead, 'data', 'topics')
# graph it
plt.figure(figsize=(10, 10))
# draw with a spring layout (note: nx.draw_spring returns None, so the
# assignment only records that — the drawing is the side effect)
layout = nx.draw_spring(G1, with_labels=True)
# hide the axis — it adds nothing here
plt.axis('off')
# render the figure
plt.show()
# Topics with degree 2 are connected to both data sources, i.e. shared concepts.
# In networkx 2.x, G.degree() returns a DegreeView of (node, degree) pairs — not
# a dict — so build the dataframe from the pairs directly (from_dict would fail).
degree = pd.DataFrame(G1.degree(), columns=['topic', 'nn'])
# sort_values returns a new frame — keep the sorted order instead of discarding it
degree = degree.sort_values('nn')
shared = degree[degree['nn'] == 2]
shared
# Get combinators associated with a given shared topic
building = sagasead[sagasead['topics'] == 'building']
building
# Show the full query text for those combinators.
# Use None (not the deprecated -1) to disable column-width truncation.
pd.set_option('display.max_colwidth', None)
print(building['query'])
# How many saga items mention 'wood' as a relevant concept?
building_sagas = sagas.loc[sagas['concept'].str.contains('wood', regex=True)]
building_sagas.count()
# How many SEAD items mention 'General synanthropic' as a relevant concept?
building_sead = SEAD.loc[SEAD['indicators'].str.contains('General synanthropic', regex=False)]
building_sead.count()
# Now look at topics spanning all three data sources
sagaseadthreats = pd.concat([seadtopics, sagatopics, threatstopics], sort=False)
# Build a graph linking data sources to their topics
G2 = nx.from_pandas_edgelist(sagaseadthreats, 'data', 'topics')
# graph it
plt.figure(figsize=(10, 10))
# Compute the layout explicitly and draw with it — nx.draw_spring returns None,
# so assigning its result to a layout variable was misleading
pos2 = nx.spring_layout(G2)
nx.draw(G2, pos=pos2, with_labels=True)
# hide the axis
plt.axis('off')
# render the figure
plt.show()
# Build the degree table once and reuse it for both queries.
# G.degree() is a DegreeView of (node, degree) pairs in networkx 2.x, so
# construct the dataframe from the pairs directly (from_dict would fail).
degree2 = pd.DataFrame(G2.degree(), columns=['topic', 'nn']).sort_values('nn')
# topics shared by all three data sources (degree 3)
shared2 = degree2[degree2['nn'] == 3]
shared2
# topics shared by exactly two of the three data sources (degree 2)
shared3 = degree2[degree2['nn'] == 2]
shared3
# Annotate every row with how many combinator mappings its concept has
# (equivalent to groupby('topics').transform('size'))
reshaped['Count'] = reshaped['topics'].map(reshaped['topics'].value_counts())
# check the new column
reshaped.head()
# Frequently mapped concepts — change the threshold (10) to taste,
# then group by the count
many = reshaped.loc[reshaped['Count'] > 10]
top = many.groupby('Count')
top.first()
# Infrequently mapped concepts (fewer than 3 mappings)
few = reshaped.loc[reshaped['Count'] < 3]
bottom = few.groupby('Count')
bottom.first()
# Group the full table by concept/topic and peek at each group's first row
bytop = reshaped.groupby('topics')
bytop.first()
# Pull out a single group by its concept name
bytop.get_group('Dung')
# ...and bind groups to variables for further work
dung = bytop.get_group('Dung')
dung
animals = bytop.get_group('animal')
animals
# Graph the data sources and combinators connected under a single concept
G3 = nx.from_pandas_edgelist(animals, source='data', target='comb')
# draw it on a circle
nx.draw_circular(G3, with_labels=True)
plt.show()
# Merge two concepts you expect to connect
ani_dung = pd.concat([animals, dung])
ani_dung
# Graph the data sources and combinators connected under both concepts
G4 = nx.from_pandas_edgelist(ani_dung, source='data', target='comb')
# draw it on a circle
nx.draw_circular(G4, with_labels=True)
plt.show()
# Graph the most frequently mapped topics table
G5 = nx.from_pandas_edgelist(many, source='data', target='topics')
# draw it on a circle
nx.draw_circular(G5, with_labels=True)
plt.show()
# Select specific topics and graph their relationships
concepts = ['animal', 'humans']
selected = reshaped[reshaped.topics.isin(concepts)]
# Build your graph of the relationships of your selected topics
G6 = nx.from_pandas_edgelist(selected, 'data', 'topics')
# Create the figure BEFORE drawing — the original called plt.figure() after
# nx.draw_circular, which opened a second, empty figure and the figsize
# never applied to the drawn graph
plt.figure(figsize=(15, 15))
nx.draw_circular(G6, with_labels=True)
plt.show()
# Select specific datasets and graph the topics they map
datasets = ['sead', 'enviro_threats_icelandic_sites']
selected2 = reshaped[reshaped.data.isin(datasets)]
# Build your graph of the selected datasets
G7 = nx.from_pandas_edgelist(selected2, 'data', 'topics')
# graph it
plt.figure(figsize=(10, 10))
# Compute the layout explicitly and draw with it — nx.draw_spring returns None,
# so assigning its result to a layout variable was misleading
pos7 = nx.spring_layout(G7)
nx.draw(G7, pos=pos7, with_labels=True)
# hide the axis
plt.axis('off')
# render the figure
plt.show()
It seems the enviro threats and SEAD datasets share only three concepts — this is probably less than ideal.
!pip install pyvis
from pyvis.network import Network