This notebook helps you explore how combinators and data sources are connected. It is an early draft (about an hour's work), so expect rough edges...
#get some initial tools
import pandas as pd
import io
import numpy as np
!pip install geopandas
import geopandas as gpd
I scraped the latest version of the page from the back-end of the dataarc site that previews the combinators, data sources, and queries, saved it as a CSV, and put it on GitHub.
# Load the combinator table (scraped from the dataarc back-end) into a dataframe
url = 'https://raw.githubusercontent.com/ropitz/experiments/master/data/dataarc_combinators.csv'
combine = pd.read_csv(url, encoding='utf-8')
# display the full table
combine
# Load the SEAD per-source data from GitHub into a GeoDataFrame
url = 'https://raw.githubusercontent.com/ropitz/experiments/master/data/SEAD.json'
SEAD = gpd.read_file(url)
# peek at the first rows
SEAD.head()
# Load the environmental-threats per-source data into a GeoDataFrame
url = "https://raw.githubusercontent.com/ropitz/experiments/master/data/enviro_threats.json"
threats = gpd.read_file(url)
# peek at the first rows
threats.head()
# Load the sagas per-source data into a GeoDataFrame
url = "https://raw.githubusercontent.com/ropitz/experiments/master/data/sagas.geojson"
sagas = gpd.read_file(url)
# peek at the first rows
sagas.head()
# Replace NaNs with a placeholder string — many downstream tools choke on missing values
combine = combine.fillna('blank')
# confirm the replacement
combine.head()
# Reshape the combinator dataframe: explode the comma-separated 'topics'
# column so every concept gets its own row (other columns are duplicated).
# Index.drop takes only labels (no axis argument), so drop the column label alone.
index_cols = combine.columns.drop('topics').tolist()
reshaped = (
    combine.set_index(index_cols)
    .topics.str.split(',', expand=True)  # one column per concept
    .stack()                             # long format: one row per concept
    .reset_index()
    .rename(columns={0: 'topics'})
    .loc[:, combine.columns]             # restore original column order
)
# view the result
reshaped.head()
# Trim leading/trailing whitespace so concept labels match across sources
reshaped['topics'] = reshaped['topics'].str.strip()
# list the data sources actually mapped to combinators
reshaped['data'].unique()
# Topics mapped per source: sagas, sead, and enviro threats
sagatopics = reshaped.loc[reshaped['data'] == 'sagas']
sagatopics.head()
seadtopics = reshaped.loc[reshaped['data'] == 'sead']
seadtopics.head()
threatstopics = reshaped.loc[reshaped['data'] == 'enviro_threats_icelandic_sites']
threatstopics.head()
# libraries to make less-fancy graphs compared with the real tool
import networkx as nx
import matplotlib.pyplot as plt
# Compare the topics mapped together by sagas and sead
sagasead = pd.concat([seadtopics, sagatopics])
# Build a graph linking data sources to their topics
G1 = nx.from_pandas_edgelist(sagasead, 'data', 'topics')
# graph it
plt.figure(figsize=(10, 10))
# draw with a spring layout (note: nx.draw_spring returns None, so the
# assignment only records that — the drawing is the side effect)
layout = nx.draw_spring(G1, with_labels=True)
# hide the axis — it adds nothing here
plt.axis('off')
# render the figure
plt.show()
# Topics with degree 2 are connected to both data sources, i.e. shared concepts.
# In networkx 2.x, G.degree() returns a DegreeView of (node, degree) pairs — not
# a dict — so build the dataframe from the pairs directly (from_dict would fail).
degree = pd.DataFrame(G1.degree(), columns=['topic', 'nn'])
# sort_values returns a new frame — keep the sorted order instead of discarding it
degree = degree.sort_values('nn')
shared = degree[degree['nn'] == 2]
shared
# Get combinators associated with a given shared topic
building = sagasead[sagasead['topics'] == 'building']
building
# Show the full query text for those combinators.
# Use None (not the deprecated -1) to disable column-width truncation.
pd.set_option('display.max_colwidth', None)
print(building['query'])
# How many saga items mention 'wood' as a relevant concept?
building_sagas = sagas.loc[sagas['concept'].str.contains('wood', regex=True)]
building_sagas.count()
# How many SEAD items mention 'General synanthropic' as a relevant concept?
building_sead = SEAD.loc[SEAD['indicators'].str.contains('General synanthropic', regex=False)]
building_sead.count()
# Now look at topics spanning all three data sources
sagaseadthreats = pd.concat([seadtopics, sagatopics, threatstopics], sort=False)
# Build a graph linking data sources to their topics
G2 = nx.from_pandas_edgelist(sagaseadthreats, 'data', 'topics')
# graph it
plt.figure(figsize=(10, 10))
# Compute the layout explicitly and draw with it — nx.draw_spring returns None,
# so assigning its result to a layout variable was misleading
pos2 = nx.spring_layout(G2)
nx.draw(G2, pos=pos2, with_labels=True)
# hide the axis
plt.axis('off')
# render the figure
plt.show()
# Build the degree table once and reuse it for both queries.
# G.degree() is a DegreeView of (node, degree) pairs in networkx 2.x, so
# construct the dataframe from the pairs directly (from_dict would fail).
degree2 = pd.DataFrame(G2.degree(), columns=['topic', 'nn']).sort_values('nn')
# topics shared by all three data sources (degree 3)
shared2 = degree2[degree2['nn'] == 3]
shared2
# topics shared by exactly two of the three data sources (degree 2)
shared3 = degree2[degree2['nn'] == 2]
shared3
# Annotate every row with how many combinator mappings its concept has
# (equivalent to groupby('topics').transform('size'))
reshaped['Count'] = reshaped['topics'].map(reshaped['topics'].value_counts())
# check the new column
reshaped.head()
# Frequently mapped concepts — change the threshold (10) to taste,
# then group by the count
many = reshaped.loc[reshaped['Count'] > 10]
top = many.groupby('Count')
top.first()
# Infrequently mapped concepts (fewer than 3 mappings)
few = reshaped.loc[reshaped['Count'] < 3]
bottom = few.groupby('Count')
bottom.first()
# Group the full table by concept/topic and peek at each group's first row
bytop = reshaped.groupby('topics')
bytop.first()
# Pull out a single group by its concept name
bytop.get_group('Dung')
# ...and bind groups to variables for further work
dung = bytop.get_group('Dung')
dung
animals = bytop.get_group('animal')
animals
# Graph the data sources and combinators connected under a single concept
G3 = nx.from_pandas_edgelist(animals, source='data', target='comb')
# draw it on a circle
nx.draw_circular(G3, with_labels=True)
plt.show()
# Merge two concepts you expect to connect
ani_dung = pd.concat([animals, dung])
ani_dung
# Graph the data sources and combinators connected under both concepts
G4 = nx.from_pandas_edgelist(ani_dung, source='data', target='comb')
# draw it on a circle
nx.draw_circular(G4, with_labels=True)
plt.show()
# Graph the most frequently mapped topics table
G5 = nx.from_pandas_edgelist(many, source='data', target='topics')
# draw it on a circle
nx.draw_circular(G5, with_labels=True)
plt.show()
# Select specific topics and graph their relationships
concepts = ['animal', 'humans']
selected = reshaped[reshaped.topics.isin(concepts)]
# Build your graph of the relationships of your selected topics
G6 = nx.from_pandas_edgelist(selected, 'data', 'topics')
# Create the figure BEFORE drawing — the original called plt.figure() after
# nx.draw_circular, which opened a second, empty figure and the figsize
# never applied to the drawn graph
plt.figure(figsize=(15, 15))
nx.draw_circular(G6, with_labels=True)
plt.show()
# Select specific datasets and graph the topics they map
datasets = ['sead', 'enviro_threats_icelandic_sites']
selected2 = reshaped[reshaped.data.isin(datasets)]
# Build your graph of the selected datasets
G7 = nx.from_pandas_edgelist(selected2, 'data', 'topics')
# graph it
plt.figure(figsize=(10, 10))
# Compute the layout explicitly and draw with it — nx.draw_spring returns None,
# so assigning its result to a layout variable was misleading
pos7 = nx.spring_layout(G7)
nx.draw(G7, pos=pos7, with_labels=True)
# hide the axis
plt.axis('off')
# render the figure
plt.show()
It seems the enviro threats and SEAD datasets share only three concepts — this is probably less than ideal.
!pip install pyvis
from pyvis.network import Network