# -*- coding: utf-8 -*-
"""Map Dewey Decimal Classification (DDC) relations as a weighted graph.

Parses a MARC XML dump of DDC 23 into a directed graph whose edges are
notational-hierarchy, see-reference and class-elsewhere relations, then uses
weighted shortest paths and a path-counting betweenness measure to find the
"most central" DDC classes for sets of Wikipedia articles (articles -> cited
ISBNs -> DDC numbers via the OCLC Classify web service).

Originally an IPython notebook; cell magics are kept as comments.  Converted
from Python 2 to Python 3.
"""

import json
import math
import operator
import urllib.request
import xml.etree.ElementTree as ET
from collections import defaultdict

import matplotlib.pyplot as plt
import networkx as nx
import pyddc
import pymarc
import pywikibot
from pywikibot import pagegenerators as pg
import mwparserfromhell as pfh

# %pylab inline  -- notebook magic; no effect as a plain script.

# The complete DDC 23 schedule in MARC XML.
completedb = pymarc.parse_xml_to_array(open('CLASS_23eng_marc_webdewey_20131020.xml'))

# One DiGraph per DDC table; 'main' holds the ordinary schedule numbers.
graphs = defaultdict(nx.DiGraph)
f = defaultdict(int)       # tallies referenced only by the disabled table-exploration code below
fcount = defaultdict(int)  # currently unused; kept for notebook continuity

for record in completedb:
    class_num = None
    table_num = None
    tabl_class_num = None
    for field in record.fields:
        # 153: classification number, caption, and notational hierarchy.
        if field.tag == '153':
            y_z_subfields = field.get_subfields('y', 'z')
            if not y_z_subfields:  # $y/$z present => table number; skip those
                a_subfields = field.get_subfields('a')
                class_num = a_subfields[0]
                # $e names the notational-hierarchy parent, when present.
                e_subfields = field.get_subfields('e')
                if e_subfields:
                    graphs['main'].add_edge(class_num, e_subfields[0],
                                            rel='notational_hiearchy')
            # Table-number exploration (disabled):
            # else:
            #     z = field.get_subfields('z')
            #     y = field.get_subfields('y')
            #     if y and not z:
            #         f['a'] += 1
            #     if z and not y:
            #         f['b'] += 1
            #         print(field)
        # 253: cross references from the class set by the preceding 153.
        if field.tag == '253':
            if field.indicator1 == '0':  # "see" reference
                y_z_subfields = field.get_subfields('y', 'z')
                if not y_z_subfields:
                    a_subfields = field.get_subfields('a')
                    if a_subfields and class_num:
                        graphs['main'].add_edge(class_num, a_subfields[0],
                                                rel='see_reference')
            if field.indicator1 == '2':  # "class elsewhere" reference
                y_z_subfields = field.get_subfields('y', 'z')
                if not y_z_subfields:
                    a_subfields = field.get_subfields('a')
                    if a_subfields and class_num:
                        graphs['main'].add_edge(class_num, a_subfields[0],
                                                rel='class_elsewhere')
                        # class-elsewhere is treated as a symmetric relation
                        graphs['main'].add_edge(a_subfields[0], class_num,
                                                rel='class_elsewhere')
        # 765: number built from a table entry ($z table, $s source number).
        if field.tag == '765':
            z = field.get_subfields('z')
            s = field.get_subfields('s')
            if z and s and len(z) == len(s) == 1 and class_num:
                graphs[z[0]].add_edge(class_num, s[0], rel='built_from')

G = graphs['main']

# Cost of traversing each relation type (lower = "closer").
weights_dict = {'notational_hiearchy': 0.5, 'see_reference': 1, 'class_elsewhere': 2}


def update_weights(G, weights_dict):
    """Return a copy of G whose 'rel' edge attribute holds the numeric weight.

    NOTE: the weight is (somewhat confusingly) stored under the key 'rel',
    and consumed below via ``weight='rel'``.
    """
    W = nx.DiGraph()
    for m in G.nodes():
        for n in G[m]:
            relation = G[m][n]['rel']
            W.add_edge(m, n, rel=weights_dict[relation])
    return W


W = update_weights(G, weights_dict)


def weighted_shortest_path(W, a, b):
    """Shortest path from a to b in the weighted graph W."""
    return nx.shortest_path(W, a, b, weight='rel')


print(weighted_shortest_path(W, '394.1', '341.33'))
print(nx.shortest_path_length(W, '394.1', '341.33', weight='rel'))


def weighted_length(G, path, weights_dict, print_path=False):
    """Compute the weighted length of `path` in relation-graph G.

    With print_path=True the hops are printed instead and None is returned
    (the original notebook contract).
    """
    total_length = 0
    for link_from, link_to in zip(path, path[1:]):
        relation = G[link_from][link_to]['rel']
        link_weight = weights_dict[relation]
        total_length += link_weight
        if print_path:
            print(link_from, "- relation: ", relation, " weight: ",
                  link_weight, " - ", link_to, " | ", end=' ')
    if not print_path:
        return total_length


def multiple_betweenness(W, node_list):
    """Count how often each node appears on pairwise shortest paths.

    Returns (node, count) pairs sorted ascending by count, so the most
    "between" nodes are at the end.
    """
    between_count = defaultdict(int)
    for a in node_list:
        for b in node_list:
            if a == b:
                continue
            try:
                path = weighted_shortest_path(W, a, b)
            except nx.NetworkXNoPath:
                continue
            for node in path:
                between_count[node] += 1
    return sorted(between_count.items(), key=operator.itemgetter(1))


def node_in_G(G, node):
    """Return the longest prefix of `node` that is a node of G.

    DDC numbers from Classify can be more specific than the schedule
    (e.g. '394.12'), so trailing characters are stripped until a known
    class is found.  Returns False when nothing matches.

    FIX: the original lost the result of the recursive call (no `return`),
    so any truncated match came back as None.
    """
    if not node:
        return False
    if node in G:
        return node
    return node_in_G(G, node[:-1])


def get_ddc_from_isbn(isbn):
    """Query the OCLC Classify service and return the most popular DDC
    number for `isbn`, or None when no recommendation exists.

    Prints the failing query string on any error (best-effort, as before).
    """
    base_path = 'http://classify.oclc.org/classify2/Classify?'
    isbn_path = 'isbn=' + str(isbn)
    full_path = base_path + isbn_path + '&summary=true'
    ns = '{http://classify.oclc.org}'
    try:
        with urllib.request.urlopen(full_path) as resp:
            root = ET.fromstring(resp.read())
        for child in root:
            if child.tag == ns + 'recommendations':
                for rec in child:
                    if rec.tag == ns + 'ddc':
                        for pop in rec:
                            if pop.tag == ns + 'mostPopular':
                                try:
                                    return pop.attrib['nsfa']
                                except KeyError:
                                    return pop.attrib['sfa']
    except Exception:
        print(isbn_path)


def search_caption(s_class):
    """Return the $j caption of DDC class `s_class` from the 153 fields.

    FIX: moved above its first callers -- the original defined it after the
    module-level most_between_category(...) calls, which raised NameError.
    """
    for record in completedb:
        for field in record.fields:
            if field.tag == '153':
                a_subfields = field.get_subfields('a')
                if a_subfields[0] == s_class:
                    j = field.get_subfields('j')
                    try:
                        return j[0]
                    except IndexError:
                        print(j)
    return None


def get_ddcs_from_list(articles):
    """Map Wikipedia articles -> cited ISBNs -> DDC numbers.

    Collects every template parameter whose name starts with 'isbn',
    keeps only well-formed 10/13 character ISBNs, and resolves each
    distinct ISBN through OCLC Classify.
    """
    isbns = []
    for article in articles:
        try:
            wikitext = article.get()
        except pywikibot.IsRedirectPage:
            wikitext = article.getRedirectTarget().get()
        for template in pfh.parse(wikitext).filter_templates():
            for param in template.params:
                if str(param.name).lower().startswith('isbn'):
                    isbns.append(param.value)
    isbn_set = set()
    for isbn in map(str, isbns):
        justnums = ''.join(ch for ch in isbn if ch in '1234567890Xx')
        if len(justnums) in (10, 13):
            isbn_set.add(justnums)
    return [get_ddc_from_isbn(isbn) for isbn in isbn_set]


def get_ddcs_from_categorgy(category_name):
    """DDC numbers for every article in a Wikipedia category.

    (Name keeps the original's spelling for caller compatibility.)
    """
    cat = pywikibot.Category(pywikibot.Site('en', 'wikipedia'), category_name)
    return get_ddcs_from_list(list(cat.articles(total=None)))


def _print_top_between(potential_ddcs):
    """Shared tail of most_between_*: keep known nodes, rank, print top 10."""
    valid_ddcs = [node for node in (node_in_G(W, ddc) for ddc in potential_ddcs) if node]
    multi_between = multiple_betweenness(W, valid_ddcs)
    for node, count in multi_between[-10:]:
        print(count, ' ', node, ' ', search_caption(node))


def most_between_gen(articles):
    """Print the 10 most-between DDC classes for an iterable of pages."""
    _print_top_between(get_ddcs_from_list(articles))


def most_between_category(category_name):
    """Print the 10 most-between DDC classes for a Wikipedia category."""
    _print_top_between(get_ddcs_from_categorgy(category_name))


most_between_category('Military_history_of_Germany')
most_between_category('Cooking_techniques')

# --- Featured articles: fetch once, cache the DDC list to disk. ---
cat = pywikibot.Category(pywikibot.Site('en', 'wikipedia'), 'Featured_articles')
articles = list(cat.articles(total=None))
# FIX: this cell previously duplicated get_ddcs_from_list inline (and
# contained a stray `ddcs?` token); it now reuses the helper.
ddcs = get_ddcs_from_list(articles)
# FIX: the dump/load filenames disagreed ('featuredddcs,json' vs
# 'featureddccs.json'), so the cache could never be read back.
with open('featured_ddcs.json', 'w') as fh:
    json.dump(ddcs, fh)
with open('featured_ddcs.json', 'r') as fh:
    ddcs = json.load(fh)
print(len(ddcs))

uniqddcs = list(set(ddcs))
valid_ddcs = [node for node in (node_in_G(W, ddc) for ddc in uniqddcs) if node]
with open('valid_ddcs.json', 'w') as fh:  # FIX: was 'valid_ddcs,json'
    json.dump(valid_ddcs, fh)

smalluniqddcs = uniqddcs[-500:]
# %%timeit -n1 -r1  -- notebook magic; the timed statement follows.
valid_ddcs = [node for node in (node_in_G(W, ddc) for ddc in smalluniqddcs) if node]

multi_between = multiple_betweenness(W, valid_ddcs)
for node, count in multi_between[-10:]:
    print(count, ' ', node, ' ', search_caption(node))

most_between_category('Featured_articles')
most_between_category('Feminism')

maxs_contribs = list(pg.UserContributionsGenerator(username='Maximilianklein', namespaces=0))
print(len(maxs_contribs))
most_between_gen(maxs_contribs)


def neighbors_n(G, root, n):
    """Return the sub-DiGraph of everything within n directed hops of root."""
    E = nx.DiGraph()

    def n_tree(tree, n_remain):
        # Copy each outgoing edge, then recurse one level shallower.
        neighbors_dict = G[tree]
        for neighbor, relations in neighbors_dict.items():
            E.add_edge(tree, neighbor, rel=relations['rel'])
        n_forest(list(neighbors_dict), n_remain - 1)

    def n_forest(forest, n_remain):
        if n_remain <= 0:
            return
        for tree in forest:
            n_tree(tree, n_remain)

    n_forest([root], n)
    return E


def draw_n_hops(G, root, n, figsizex=20, figsizey=20):
    """Draw the n-hop neighbourhood of root, colouring edges by relation
    and scaling/colouring nodes by log-degree."""
    E = neighbors_n(G, root, n)
    rel_colors = {'notational_hiearchy': 'red',
                  'see_reference': 'green',
                  'class_elsewhere': 'blue'}
    edge_colors = [rel_colors[data['rel']] for _, _, data in E.edges(data=True)]
    max_node_size = max(nx.degree(E, node) for node in E.nodes())
    # NOTE: if every node has degree 1 this divides by log(1)=0, as in the
    # original; in practice the hub structure prevents that.
    node_lognorms = [math.log(nx.degree(E, node)) / math.log(max_node_size)
                     for node in E.nodes()]
    node_sizes = [norm * 1500 for norm in node_lognorms]
    node_colors = node_lognorms
    # Modern home of the graphviz layout (requires pygraphviz).
    pos = nx.nx_agraph.graphviz_layout(E, prog="twopi", root=root)
    plt.figure(1, figsize=(figsizex, figsizey))
    nx.draw(E, pos=pos, node_size=node_sizes,
            node_color=node_colors, edge_color=edge_colors)


draw_n_hops(G, '394.1', 2, figsizex=12, figsizey=12)
draw_n_hops(G, '394.1', 3, figsizex=12, figsizey=12)
draw_n_hops(G, '394.1', 4, figsizex=12, figsizey=12)
draw_n_hops(G, '394.1', 5, figsizex=12, figsizey=12)
draw_n_hops(G, '394.1', 6, figsizex=12, figsizey=12)
draw_n_hops(G, '394.1', 7, figsizex=12, figsizey=12)
draw_n_hops(G, '394.1', 10, figsizex=12, figsizey=12)


def shortest_relation_path(G, a, b, weights_dict):
    """Exhaustively find the simple paths a->b of minimal *weighted* length.

    Prints each path found and returns the list of minimal paths.

    FIX: the original compared the weighted length `wlen` against
    len(shortests[0]) (the node count of a path) and len(shortests) (the
    number of tied paths), so neither branch tested what it meant to; the
    minimal weighted length is now tracked explicitly.

    WARNING: enumerates *all* simple paths, which can be combinatorially
    expensive on a dense graph (true of the original as well).
    """
    shortests = []
    best_len = None
    for i, path in enumerate(nx.all_simple_paths(G, a, b)):
        print(i)
        wlen = weighted_length(G, path, weights_dict)
        if best_len is None or wlen < best_len:   # first or strictly shorter
            best_len = wlen
            shortests = [path]
        elif wlen == best_len:                    # tied for shortest
            shortests.append(path)
    print("there were ", len(shortests), " paths between ", a, " and ", b)
    for path in shortests:
        weighted_length(G, path, weights_dict, print_path=True)
    return shortests


# FIX: the original called this with the undefined names `graph` and
# `weights_dict_test`.
shortest_relation_path(G, '363.31', '070.41', weights_dict)

# Largest connected component of the undirected relation graph.
# (nx.connected_component_subgraphs was removed from networkx; select the
# biggest component explicitly, matching the G_biggest intent.)
undirected = G.to_undirected()
G_biggest = undirected.subgraph(max(nx.connected_components(undirected), key=len)).copy()