#!/usr/bin/env python
# coding: utf-8

# Danish art 'motivs' from Wikidata
# ==================
#
# 'Motivs' (depictions) in Danish artworks present in Wikidata.
#
# The script queries the Wikidata SPARQL endpoint, builds a binary
# artwork-by-motiv matrix, factorizes it with NMF to find groups of motivs,
# writes an HTML fragment with images per group, and plots the motiv
# co-occurrence graph.

# In[1]:

import html
import itertools

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import sparql
from sklearn.decomposition import NMF

# IRI prefixes that are stripped to obtain the bare Q-identifier / file name.
ENTITY_PREFIX = 'http://www.wikidata.org/entity/'
FILEPATH_PREFIX = 'http://commons.wikimedia.org/wiki/Special:FilePath/'

# Loadings below this value are treated as zero when writing the HTML.
MIN_LOADING = 0.0001


def strip_prefix(text, prefix):
    """Return `text` without a leading `prefix`; unchanged if it is absent."""
    return text[len(prefix):] if text.startswith(prefix) else text


def term_value(term):
    """Return the `.value` of a query result term, or '' if it is unbound."""
    return getattr(term, 'value', None) or ''


# In[2]:

# Formulation of a query to Wikidata
service = sparql.Service("https://query.wikidata.org/sparql", method="GET")

# NOTE(review): the prefix IRIs were missing from the exported source; the
# standard Wikidata Query Service IRIs are restored here - confirm against the
# original notebook.
statement = """
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX bd: <http://www.bigdata.com/rdf#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX v: <http://www.wikidata.org/prop/statement/>

SELECT ?artwork ?artworkLabel ?motiv ?motivLabel ?filename WHERE {
  ?artwork wdt:P31 wd:Q3305213 .
  ?artwork wdt:P180 ?motiv .
  ?artwork wdt:P195 ?collection .
  ?collection wdt:P17 wd:Q35 .
  OPTIONAL {?artwork wdt:P18 ?filename }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "da,en" . }
}
"""

# In[3]:

# Querying Wikidata and formatting it for a DataFrame
result = service.query(statement)
df = pd.DataFrame(result.fetchall(), columns=result.variables)

# In[4]:

print(df.shape)

# In[5]:

# Show a bit of the downloaded data
print(df.head())

# In[6]:

# Set up feature matrix: one row per artwork IRI, one column per motiv label.
# Sorted lists (rather than sets) give a deterministic row/column order.
motiv_labels = sorted({item.value for item in df['motivLabel']})
artwork_iris = sorted({item.value for item in df['artwork']})
feature_matrix = pd.DataFrame(0, index=artwork_iris, columns=motiv_labels)

artwork_mapper = {}   # artwork IRI -> artwork label
filename_mapper = {}  # artwork IRI -> image file name ('' when there is none)
for _, row in df.iterrows():
    artwork = row['artwork'].value
    motiv = row['motivLabel'].value
    feature_matrix.loc[artwork, motiv] = 1
    artwork_mapper[artwork] = row['artworkLabel'].value
    # ?filename is OPTIONAL in the query, so the term may be unbound.
    filename_mapper[artwork] = strip_prefix(term_value(row['filename']),
                                            FILEPATH_PREFIX)

# In[7]:

print(feature_matrix.shape)

# In[8]:

# Show a bit of the constructed feature matrix
print(feature_matrix.head())

# In[9]:

# Some arbitrary scaling - more research needed here.
# Rows are divided by (motivs per artwork) ** 0.9 and columns by
# (artworks per motiv) ** 0.05; both sums come from the unscaled matrix.
scaled_feature_matrix = feature_matrix.divide(
    feature_matrix.sum(axis=1) ** 0.9, axis='index')
scaled_feature_matrix = scaled_feature_matrix.divide(
    feature_matrix.sum(axis=0) ** 0.05, axis='columns')

# In[10]:

# Machine learning decomposition with non-negative matrix factorization
decomposer = NMF(n_components=12)
decomposer.fit(scaled_feature_matrix)
transformed = decomposer.transform(scaled_feature_matrix)


def top_motivs(topic_id, count):
    """Return (label, loading) pairs for the `count` strongest motivs."""
    loadings = decomposer.components_[topic_id, :]
    strongest = (-loadings).argsort()[:count]
    return [(scaled_feature_matrix.columns[i], loadings[i])
            for i in strongest]


def format_motivs(topic_id, count):
    """Return the strongest motivs of a topic as 'label (loading) ' text."""
    return "".join("%s (%f) " % pair for pair in top_motivs(topic_id, count))


# In[11]:

# Show the results
for topic_id in range(decomposer.components_.shape[0]):
    print(format_motivs(topic_id, 10) + '\n')

# In[12]:

topic_ids = transformed.shape[1]
for topic_id in range(topic_ids):
    print('\nTopic %d' % (topic_id + 1,))
    print(format_motivs(topic_id, 5) + '\n')
    # The 20 artworks with the highest loading on this topic
    for index in (-transformed[:, topic_id]).argsort()[:20]:
        print("%f %s" % (transformed[index, topic_id],
                         artwork_mapper[feature_matrix.index[index]]))

# In[13]:

# Write part of an HTML file with images grouped according to topic.
# NOTE(review): the HTML markup inside the string literals was lost from the
# exported source; the tags below are a reconstruction - confirm against the
# original notebook.
with open('tmp.html', 'w', encoding='utf-8') as f:
    for topic_id in range(transformed.shape[1]):
        f.write('<h3>Emne %d</h3>\n' % (topic_id + 1,))

        labels = []
        for label, loading in top_motivs(topic_id, 5):
            if loading < MIN_LOADING:
                break
            labels.append("%s. " % (label,))
        # Labels come from Wikidata, so escape them before embedding in HTML.
        f.write('Motiver: ' + html.escape("".join(labels)) + '<br/>\n')

        for index in (-transformed[:, topic_id]).argsort()[:15]:
            if transformed[index, topic_id] < MIN_LOADING:
                break
            artwork = feature_matrix.index[index]
            filename = filename_mapper[artwork]
            if filename:
                qid = strip_prefix(artwork, ENTITY_PREFIX)
                f.write(
                    '<a href="https://www.wikidata.org/wiki/%s">'
                    '<img src="https://commons.wikimedia.org/wiki/'
                    'Special:FilePath/%s?width=200" alt=""/></a>\n'
                    % (html.escape(qid), html.escape(filename)))

# In[14]:

# Graph with 'motivs' as nodes; two motivs are linked when at least one
# artwork depicts both of them.
graph = nx.Graph()
graph.add_nodes_from(feature_matrix.columns)
for _, row in feature_matrix.iterrows():
    depicted = row.index[row.to_numpy() != 0].tolist()
    graph.add_edges_from(itertools.combinations(depicted, 2))

# In[15]:

# Keep only the largest connected component. Selecting it explicitly avoids
# depending on the order in which components happen to be generated.
largest_component = max(nx.connected_components(graph), key=len)
subgraph = graph.subgraph(largest_component).copy()

# In[16]:

# Good position layout is always a problem - here default spring layout is
# attempted. The layout is placed in the unit square so that the axis limits
# used below refer to the intended region.
# pos = nx.spectral_layout(graph)
pos = nx.spring_layout(subgraph, iterations=50, center=(0.5, 0.5), scale=0.5)

# In[17]:

# Plotting the motiv graph
nodes = list(subgraph.nodes())
# Node area is proportional to the number of artworks depicting the motiv.
node_sizes = 50 * feature_matrix.sum(axis=0).loc[nodes].to_numpy()
nx.draw_networkx_nodes(subgraph, pos=pos, nodelist=nodes,
                       node_size=node_sizes, node_color='r', alpha=0.5,
                       linewidths=0)
nx.draw_networkx_edges(subgraph, pos=pos, alpha=0.05, edge_color='r',
                       width=3)

# Label only the 30 most frequent motivs to keep the plot readable.
for i in np.argsort(node_sizes)[:-31:-1]:
    x, y = pos[nodes[i]]
    plt.text(x, y, nodes[i],
             horizontalalignment='center', verticalalignment='center')

ax = plt.gca()
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
ax.set_position([0, 0, 1, 1])
# Zoom in on the centre of the layout; use [-0.05, 1.05, -0.05, 1.05] to see
# the whole graph.
ax.axis([0.3, 0.7, 0.3, 0.7])

plt.text(0.5, 0.65, 'Motiver i danske kunstværker i Wikidata', fontsize=50,
         backgroundcolor=(1, 1, 1),
         horizontalalignment='center', verticalalignment='center')

# In[18]:

# Save the image in a file
plt.gcf().set_size_inches(18, 12, forward=True)
plt.savefig('Danish art motivs from Wikidata.png')

# In[19]:

plt.show()

# In[ ]: