# -*- coding: utf-8 -*-
"""Map Dewey Decimal Classification (DDC) relations as a weighted graph.

Parses a MARC XML dump of DDC 23 into a directed graph whose edges are
notational-hierarchy, see-reference and class-elsewhere relations, then uses
weighted shortest paths and a path-counting betweenness measure to find the
"most central" DDC classes for sets of Wikipedia articles (articles -> cited
ISBNs -> DDC numbers via the OCLC Classify web service).

Originally an IPython notebook; cell magics are kept as comments.  Converted
from Python 2 to Python 3.
"""

import json
import math
import operator
import urllib.request
import xml.etree.ElementTree as ET
from collections import defaultdict

import matplotlib.pyplot as plt
import networkx as nx
import pyddc
import pymarc
import pywikibot
from pywikibot import pagegenerators as pg
import mwparserfromhell as pfh

# %pylab inline  -- notebook magic; no effect as a plain script.

# The complete DDC 23 schedule in MARC XML.
completedb = pymarc.parse_xml_to_array(open('CLASS_23eng_marc_webdewey_20131020.xml'))

# One DiGraph per DDC table; 'main' holds the ordinary schedule numbers.
graphs = defaultdict(nx.DiGraph)
f = defaultdict(int)       # tallies referenced only by the disabled table-exploration code below
fcount = defaultdict(int)  # currently unused; kept for notebook continuity

for record in completedb:
    class_num = None
    table_num = None
    tabl_class_num = None
    for field in record.fields:
        # 153: classification number, caption, and notational hierarchy.
        if field.tag == '153':
            y_z_subfields = field.get_subfields('y', 'z')
            if not y_z_subfields:  # $y/$z present => table number; skip those
                a_subfields = field.get_subfields('a')
                class_num = a_subfields[0]
                # $e names the notational-hierarchy parent, when present.
                e_subfields = field.get_subfields('e')
                if e_subfields:
                    graphs['main'].add_edge(class_num, e_subfields[0],
                                            rel='notational_hiearchy')
            # Table-number exploration (disabled):
            # else:
            #     z = field.get_subfields('z')
            #     y = field.get_subfields('y')
            #     if y and not z:
            #         f['a'] += 1
            #     if z and not y:
            #         f['b'] += 1
            #         print(field)
        # 253: cross references from the class set by the preceding 153.
        if field.tag == '253':
            if field.indicator1 == '0':  # "see" reference
                y_z_subfields = field.get_subfields('y', 'z')
                if not y_z_subfields:
                    a_subfields = field.get_subfields('a')
                    if a_subfields and class_num:
                        graphs['main'].add_edge(class_num, a_subfields[0],
                                                rel='see_reference')
            if field.indicator1 == '2':  # "class elsewhere" reference
                y_z_subfields = field.get_subfields('y', 'z')
                if not y_z_subfields:
                    a_subfields = field.get_subfields('a')
                    if a_subfields and class_num:
                        graphs['main'].add_edge(class_num, a_subfields[0],
                                                rel='class_elsewhere')
                        # class-elsewhere is treated as a symmetric relation
                        graphs['main'].add_edge(a_subfields[0], class_num,
                                                rel='class_elsewhere')
        # 765: number built from a table entry ($z table, $s source number).
        if field.tag == '765':
            z = field.get_subfields('z')
            s = field.get_subfields('s')
            if z and s and len(z) == len(s) == 1 and class_num:
                graphs[z[0]].add_edge(class_num, s[0], rel='built_from')

G = graphs['main']

# Cost of traversing each relation type (lower = "closer").
weights_dict = {'notational_hiearchy': 0.5, 'see_reference': 1, 'class_elsewhere': 2}


def update_weights(G, weights_dict):
    """Return a copy of G whose 'rel' edge attribute holds the numeric weight.

    NOTE: the weight is (somewhat confusingly) stored under the key 'rel',
    and consumed below via ``weight='rel'``.
    """
    W = nx.DiGraph()
    for m in G.nodes():
        for n in G[m]:
            relation = G[m][n]['rel']
            W.add_edge(m, n, rel=weights_dict[relation])
    return W


W = update_weights(G, weights_dict)


def weighted_shortest_path(W, a, b):
    """Shortest path from a to b in the weighted graph W."""
    return nx.shortest_path(W, a, b, weight='rel')


print(weighted_shortest_path(W, '394.1', '341.33'))
print(nx.shortest_path_length(W, '394.1', '341.33', weight='rel'))


def weighted_length(G, path, weights_dict, print_path=False):
    """Compute the weighted length of `path` in relation-graph G.

    With print_path=True the hops are printed instead and None is returned
    (the original notebook contract).
    """
    total_length = 0
    for link_from, link_to in zip(path, path[1:]):
        relation = G[link_from][link_to]['rel']
        link_weight = weights_dict[relation]
        total_length += link_weight
        if print_path:
            print(link_from, "- relation: ", relation, " weight: ",
                  link_weight, " - ", link_to, " | ", end=' ')
    if not print_path:
        return total_length


def multiple_betweenness(W, node_list):
    """Count how often each node appears on pairwise shortest paths.

    Returns (node, count) pairs sorted ascending by count, so the most
    "between" nodes are at the end.
    """
    between_count = defaultdict(int)
    for a in node_list:
        for b in node_list:
            if a == b:
                continue
            try:
                path = weighted_shortest_path(W, a, b)
            except nx.NetworkXNoPath:
                continue
            for node in path:
                between_count[node] += 1
    return sorted(between_count.items(), key=operator.itemgetter(1))


def node_in_G(G, node):
    """Return the longest prefix of `node` that is a node of G.

    DDC numbers from Classify can be more specific than the schedule
    (e.g. '394.12'), so trailing characters are stripped until a known
    class is found.  Returns False when nothing matches.

    FIX: the original lost the result of the recursive call (no `return`),
    so any truncated match came back as None.
    """
    if not node:
        return False
    if node in G:
        return node
    return node_in_G(G, node[:-1])


def get_ddc_from_isbn(isbn):
    """Query the OCLC Classify service and return the most popular DDC
    number for `isbn`, or None when no recommendation exists.

    Prints the failing query string on any error (best-effort, as before).
    """
    base_path = 'http://classify.oclc.org/classify2/Classify?'
    isbn_path = 'isbn=' + str(isbn)
    full_path = base_path + isbn_path + '&summary=true'
    ns = '{http://classify.oclc.org}'
    try:
        with urllib.request.urlopen(full_path) as resp:
            root = ET.fromstring(resp.read())
        for child in root:
            if child.tag == ns + 'recommendations':
                for rec in child:
                    if rec.tag == ns + 'ddc':
                        for pop in rec:
                            if pop.tag == ns + 'mostPopular':
                                try:
                                    return pop.attrib['nsfa']
                                except KeyError:
                                    return pop.attrib['sfa']
    except Exception:
        print(isbn_path)


def search_caption(s_class):
    """Return the $j caption of DDC class `s_class` from the 153 fields.

    FIX: moved above its first callers -- the original defined it after the
    module-level most_between_category(...) calls, which raised NameError.
    """
    for record in completedb:
        for field in record.fields:
            if field.tag == '153':
                a_subfields = field.get_subfields('a')
                if a_subfields[0] == s_class:
                    j = field.get_subfields('j')
                    try:
                        return j[0]
                    except IndexError:
                        print(j)
    return None


def get_ddcs_from_list(articles):
    """Map Wikipedia articles -> cited ISBNs -> DDC numbers.

    Collects every template parameter whose name starts with 'isbn',
    keeps only well-formed 10/13 character ISBNs, and resolves each
    distinct ISBN through OCLC Classify.
    """
    isbns = []
    for article in articles:
        try:
            wikitext = article.get()
        except pywikibot.IsRedirectPage:
            wikitext = article.getRedirectTarget().get()
        for template in pfh.parse(wikitext).filter_templates():
            for param in template.params:
                if str(param.name).lower().startswith('isbn'):
                    isbns.append(param.value)
    isbn_set = set()
    for isbn in map(str, isbns):
        justnums = ''.join(ch for ch in isbn if ch in '1234567890Xx')
        if len(justnums) in (10, 13):
            isbn_set.add(justnums)
    return [get_ddc_from_isbn(isbn) for isbn in isbn_set]


def get_ddcs_from_categorgy(category_name):
    """DDC numbers for every article in a Wikipedia category.

    (Name keeps the original's spelling for caller compatibility.)
    """
    cat = pywikibot.Category(pywikibot.Site('en', 'wikipedia'), category_name)
    return get_ddcs_from_list(list(cat.articles(total=None)))


def _print_top_between(potential_ddcs):
    """Shared tail of most_between_*: keep known nodes, rank, print top 10."""
    valid_ddcs = [node for node in (node_in_G(W, ddc) for ddc in potential_ddcs) if node]
    multi_between = multiple_betweenness(W, valid_ddcs)
    for node, count in multi_between[-10:]:
        print(count, ' ', node, ' ', search_caption(node))


def most_between_gen(articles):
    """Print the 10 most-between DDC classes for an iterable of pages."""
    _print_top_between(get_ddcs_from_list(articles))


def most_between_category(category_name):
    """Print the 10 most-between DDC classes for a Wikipedia category."""
    _print_top_between(get_ddcs_from_categorgy(category_name))


most_between_category('Military_history_of_Germany')
most_between_category('Cooking_techniques')

# --- Featured articles: fetch once, cache the DDC list to disk. ---
cat = pywikibot.Category(pywikibot.Site('en', 'wikipedia'), 'Featured_articles')
articles = list(cat.articles(total=None))
# FIX: this cell previously duplicated get_ddcs_from_list inline (and
# contained a stray `ddcs?` token); it now reuses the helper.
ddcs = get_ddcs_from_list(articles)
# FIX: the dump/load filenames disagreed ('featuredddcs,json' vs
# 'featureddccs.json'), so the cache could never be read back.
with open('featured_ddcs.json', 'w') as fh:
    json.dump(ddcs, fh)
with open('featured_ddcs.json', 'r') as fh:
    ddcs = json.load(fh)
print(len(ddcs))

uniqddcs = list(set(ddcs))
valid_ddcs = [node for node in (node_in_G(W, ddc) for ddc in uniqddcs) if node]
with open('valid_ddcs.json', 'w') as fh:  # FIX: was 'valid_ddcs,json'
    json.dump(valid_ddcs, fh)

smalluniqddcs = uniqddcs[-500:]
# %%timeit -n1 -r1  -- notebook magic; the timed statement follows.
valid_ddcs = [node for node in (node_in_G(W, ddc) for ddc in smalluniqddcs) if node]

multi_between = multiple_betweenness(W, valid_ddcs)
for node, count in multi_between[-10:]:
    print(count, ' ', node, ' ', search_caption(node))

most_between_category('Featured_articles')
most_between_category('Feminism')

maxs_contribs = list(pg.UserContributionsGenerator(username='Maximilianklein', namespaces=0))
print(len(maxs_contribs))
most_between_gen(maxs_contribs)


def neighbors_n(G, root, n):
    """Return the sub-DiGraph of everything within n directed hops of root."""
    E = nx.DiGraph()

    def n_tree(tree, n_remain):
        # Copy each outgoing edge, then recurse one level shallower.
        neighbors_dict = G[tree]
        for neighbor, relations in neighbors_dict.items():
            E.add_edge(tree, neighbor, rel=relations['rel'])
        n_forest(list(neighbors_dict), n_remain - 1)

    def n_forest(forest, n_remain):
        if n_remain <= 0:
            return
        for tree in forest:
            n_tree(tree, n_remain)

    n_forest([root], n)
    return E


def draw_n_hops(G, root, n, figsizex=20, figsizey=20):
    """Draw the n-hop neighbourhood of root, colouring edges by relation
    and scaling/colouring nodes by log-degree."""
    E = neighbors_n(G, root, n)
    rel_colors = {'notational_hiearchy': 'red',
                  'see_reference': 'green',
                  'class_elsewhere': 'blue'}
    edge_colors = [rel_colors[data['rel']] for _, _, data in E.edges(data=True)]
    max_node_size = max(nx.degree(E, node) for node in E.nodes())
    # NOTE: if every node has degree 1 this divides by log(1)=0, as in the
    # original; in practice the hub structure prevents that.
    node_lognorms = [math.log(nx.degree(E, node)) / math.log(max_node_size)
                     for node in E.nodes()]
    node_sizes = [norm * 1500 for norm in node_lognorms]
    node_colors = node_lognorms
    # Modern home of the graphviz layout (requires pygraphviz).
    pos = nx.nx_agraph.graphviz_layout(E, prog="twopi", root=root)
    plt.figure(1, figsize=(figsizex, figsizey))
    nx.draw(E, pos=pos, node_size=node_sizes,
            node_color=node_colors, edge_color=edge_colors)


draw_n_hops(G, '394.1', 2, figsizex=12, figsizey=12)
draw_n_hops(G, '394.1', 3, figsizex=12, figsizey=12)
draw_n_hops(G, '394.1', 4, figsizex=12, figsizey=12)
draw_n_hops(G, '394.1', 5, figsizex=12, figsizey=12)
draw_n_hops(G, '394.1', 6, figsizex=12, figsizey=12)
draw_n_hops(G, '394.1', 7, figsizex=12, figsizey=12)
draw_n_hops(G, '394.1', 10, figsizex=12, figsizey=12)


def shortest_relation_path(G, a, b, weights_dict):
    """Exhaustively find the simple paths a->b of minimal *weighted* length.

    Prints each path found and returns the list of minimal paths.

    FIX: the original compared the weighted length `wlen` against
    len(shortests[0]) (the node count of a path) and len(shortests) (the
    number of tied paths), so neither branch tested what it meant to; the
    minimal weighted length is now tracked explicitly.

    WARNING: enumerates *all* simple paths, which can be combinatorially
    expensive on a dense graph (true of the original as well).
    """
    shortests = []
    best_len = None
    for i, path in enumerate(nx.all_simple_paths(G, a, b)):
        print(i)
        wlen = weighted_length(G, path, weights_dict)
        if best_len is None or wlen < best_len:   # first or strictly shorter
            best_len = wlen
            shortests = [path]
        elif wlen == best_len:                    # tied for shortest
            shortests.append(path)
    print("there were ", len(shortests), " paths between ", a, " and ", b)
    for path in shortests:
        weighted_length(G, path, weights_dict, print_path=True)
    return shortests


# FIX: the original called this with the undefined names `graph` and
# `weights_dict_test`.
shortest_relation_path(G, '363.31', '070.41', weights_dict)

# Largest connected component of the undirected relation graph.
# (nx.connected_component_subgraphs was removed from networkx; select the
# biggest component explicitly, matching the G_biggest intent.)
undirected = G.to_undirected()
G_biggest = undirected.subgraph(max(nx.connected_components(undirected), key=len)).copy()