%pylab inline
import requests
import pylab as pl
from pyquery import PyQuery as pq
import numpy as np
import matplotlib as plt
import scipy
import pandas as pd
from lxml import etree

# Pages that crawl_seed scans for <a name="cm20..."> anchors, one anchor per meeting.
seed_pages = [
    'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1213.htm',
    'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1314.htm'
]
def crawl_seed(seed):
    """Collect the meeting anchor names found on one seed page.

    seed is handed straight to PyQuery, which loads the page. The result is
    the PyQuery collection of ``name`` attribute values, taken from the
    page's ``<a>`` elements, that start with 'cm20'.
    """
    def anchor_name(index, anchor):
        # Anchors without a name attribute yield None here; presumably
        # PyQuery.map drops None results, otherwise the startswith call
        # below would fail on them -- TODO confirm.
        return anchor.attrib.get('name')

    def is_meeting_anchor(index, name):
        return name.startswith('cm20')

    page = pq(seed)
    anchor_names = page('a').map(anchor_name)
    return anchor_names.filter(is_meeting_anchor)
# Concatenate the anchor names found on every seed page into one flat list.
meetings = reduce(list.__add__, map(crawl_seed, seed_pages), [])
print meetings

from IPython.core.display import clear_output
import sys

def crawl_xml(meeting, timeout=60):
    """Download the voting-result XML for one meeting.

    meeting -- anchor name such as 'cm20121017'; characters 4-9 are read as
               two-digit year, month and day.
    timeout -- seconds passed to requests.get so that a stalled connection
               raises instead of blocking the whole crawl forever.

    Returns the requests response whatever its HTTP status; the caller is
    expected to check ``.ok``. Raises ValueError when the date characters
    are not digits, and whatever requests raises on connection failure or
    timeout.
    """
    # This logic is translated from the official JS code
    yy, mm, dd = [int(meeting[pos:pos + 2]) for pos in (4, 6, 8)]
    # Meetings held in October or later are filed under yy/(yy+1);
    # earlier ones under (yy-1)/yy.
    first_year = yy if mm >= 10 else yy - 1
    yr = 'yr%02d-%02d' % (first_year, first_year + 1)
    prefix = 'http://www.legco.gov.hk'
    # Explicit arguments instead of `% locals()`, so the URL cannot silently
    # depend on whichever local names happen to exist.
    url = '%s/%s/chinese/counmtg/voting/cm_vote_20%02d%02d%02d.xml' % (
        prefix, yr, yy, mm, dd)
    return requests.get(url, timeout=timeout)

vote_xmls = []
for m in meetings:
    vote_xmls.append(crawl_xml(m))
    clear_output()
    print 'progress: %s/%s %s' % (len(vote_xmls), len(meetings), '#' * len(vote_xmls))
    sys.stdout.flush()

vote_xmls = filter(lambda r: r.ok, vote_xmls)
vote_xmls = map(lambda r: r.content, vote_xmls)
print len(vote_xmls)

# Information fields, useful for reviewing the result.
# Each entry is evaluated as an XPath relative to a <vote> element by xml_to_records;
# 'vote-date' has to stay first because xml_to_records takes the date from info[0].
info_fields = ['vote-date', 'vote-time', 'motion-en', 'mover-en', 'mover-type', 'vote-separate-mechanism']
def xml_to_records(xml):
    """Flatten one voting-result XML document into per-member records.

    xml is the raw document text. The result is a list holding one tuple per
    member per vote: (topic_id, member_id, vote) followed by the text of every
    field named in info_fields. topic_id is '<vote-date>-<number attribute>'.
    An IndexError is raised when an expected child element is missing.
    """
    root = etree.XML(xml)
    rows = []
    for vote_node in root.xpath('//legcohk-vote/meeting/vote'):
        details = tuple(vote_node.xpath(field)[0].text for field in info_fields)
        # details[0] is the vote date (first entry of info_fields)
        topic_key = '%s-%s' % (details[0], vote_node.attrib['number'])
        rows.extend(
            # The English name doubles as the member ID for simplicity
            (topic_key, member.attrib['name-en'], member.xpath('vote')[0].text) + details
            for member in vote_node.xpath('individual-votes/member')
        )
    return rows

# Parse every downloaded document and join the per-document record lists into one list.
records = reduce(list.__add__, map(xml_to_records, vote_xmls), [])

# More:
# http://nbviewer.ipython.org/urls/course.ie.cuhk.edu.hk/~engg4030/tutorial/tutorial7/Legco-Preprocessing.ipynb
def clean_record(t):
    """Return record t as a tuple with the member name (position 1) normalised.

    t is any iterable with at least two items; the other positions are
    passed through untouched.
    """
    fields = list(t)
    member = fields[1]
    # According to the numbers, these two names seem to be the same person
    if member == 'Dr Joseph LEE':
        member = 'Prof Joseph LEE'
    # Further normalisation rules, if any, would go here
    return tuple(fields[:1] + [member] + fields[2:])
# Normalise every record (map returns a list under the Python 2 this file targets).
records = map(clean_record, records)

# Full table: one row per (topic, member) vote, followed by the descriptive info fields.
df = pd.DataFrame(records, columns = ['topic_id', 'member_id', 'vote'] + info_fields)
df.to_csv('records-all-with-info.csv', encoding='utf-8')
df[:5]

# Keep only the three columns the analysis below uses and save them to a separate file.
df = df[['topic_id', 'member_id', 'vote']]
df.to_csv('records-all.csv', encoding='utf-8')
df[:5]

# Size of the data set: distinct topics, distinct members and total vote records.
print 'total # of topics:', len(df['topic_id'].unique())
print 'total # of members:',len(df['member_id'].unique())
print 'total # of records:', len(df)

# Distinct values of each column, for eyeballing the data.
print df['vote'].unique()

print df['member_id'].unique()

print df['topic_id'].unique()

# A leader board of voting types: for every vote value, the five members who
# cast it most often, stored as (member_id, count) pairs.
board_pos = pd.DataFrame(index=range(0, 5))
for v in df['vote'].unique():
    count = df[df['vote'] == v].groupby('member_id').count().sort('vote', ascending=False)['vote']
    count = count.reset_index()[:5]
    # No explicit index on the Series: forcing index=range(0, 5) raises when
    # fewer than five members ever cast this vote value. Column assignment
    # aligns on the board's index instead and leaves the missing ranks as NaN.
    # (zip returns a list under the Python 2 this file targets.)
    board_pos[v] = pd.Series(zip(count['member_id'], count['vote']))
board_pos

# The reverse leader board: for every vote value, the five members who cast
# it least often (among those who cast it at all), as (member_id, count) pairs.
board_neg = pd.DataFrame(index=range(0, 5))
for v in df['vote'].unique():
    count = df[df['vote'] == v].groupby('member_id').count().sort('vote', ascending=True)['vote']
    count = count.reset_index()[:5]
    # No explicit index on the Series: forcing index=range(0, 5) raises when
    # fewer than five members ever cast this vote value. Column assignment
    # aligns on the board's index instead and leaves the missing ranks as NaN.
    # (zip returns a list under the Python 2 this file targets.)
    board_neg[v] = pd.Series(zip(count['member_id'], count['vote']))
board_neg

# Member-by-topic table: one row per member, one column per topic, cells hold the raw vote string.
# Each column is assigned from a Series indexed by member_id, so members without a record
# for that topic should come out as NaN through index alignment.
df_matrix = pd.DataFrame(index=df['member_id'].unique())
for gn, g in df.groupby('topic_id'):
    df_matrix[gn] = g.set_index('member_id')['vote']
df_matrix[:5]

def to_numeric(x):
    """Recode a column of vote strings as numbers: 'Yes' -> 1, 'No' -> -1, anything else -> 0.

    x is a pandas Series. It is modified in place, as before, and is now also
    returned so the function works both for its side effect and as a regular
    transform (e.g. ``frame = frame.apply(to_numeric)``).
    """
    # Take both masks before the first write so every assignment is judged
    # against the original values.
    is_yes = x == 'Yes'
    is_no = x == 'No'
    x[~(is_yes | is_no)] = 0
    x[is_yes] = 1
    x[is_no] = -1
    return x
# Recode every column through to_numeric's in-place writes; the value apply returns is discarded.
# NOTE(review): this relies on apply handing to_numeric the frame's own column data
# rather than a copy -- confirm for the pandas version in use.
df_matrix.apply(to_numeric)
df_matrix[:5]

# matrix and mean are not defined in this file; presumably they come from the %pylab namespace.
# X has one row per member and one column per topic. Subtract each column's mean,
# then form the topic-by-topic product X^T X (a matrix product, since X is a matrix object).
X = matrix(df_matrix.as_matrix()).astype('float')
X = X - mean(X, 0)
C = X.T * X
print C.shape

import scipy.sparse.linalg
# Eigenvectors (element [1] of the result) for the two largest-magnitude eigenvalues of C.
# PA is computed again with k=3 further down before anything reads it.
PA = scipy.sparse.linalg.eigs(C, k=2, which='LM', return_eigenvectors=True)[1]

# Use the following one to pop up the 3D plot in a separate window.
# Good for interactive exploration.
# It may not be available, depending on your desktop environment.
#%pylab 
#matplotlib.use('TkAgg') 
# Use the following one to plot inline (embedded in this notebook).
#%pylab inline


# Try 3D
PA = scipy.sparse.linalg.eigs(C, k=3, which='LM', return_eigenvectors=True)[1]
# Project data points onto the principal axes (one row per axis, one column per member)
X_3D = PA.T * X.T
print X_3D.shape
# We intentionally add some disturbance for better visualization.
# Or else, some of the nodes are located in the same place.
# (Those who vote exactly the same)
X_3D = X_3D + randn(*tuple(X_3D.shape)) * 0.3
# Split into one coordinate array per principal axis.
# NOTE(review): eigs may return complex-typed vectors; astype('float') would keep only
# the real part -- confirm, or consider eigsh since C is symmetric by construction.
x = array(X_3D[0, :]).astype('float')
y = array(X_3D[1, :]).astype('float')
z = array(X_3D[2, :]).astype('float')

# Axes3D is not referenced by name below; presumably the import is what makes
# projection='3d' available -- TODO confirm for the matplotlib version in use.
from mpl_toolkits.mplot3d import Axes3D
# 3D scatter of the jittered projections, one point per member.
fig = figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, picker=True, s=100)

# Order the members along the first projected coordinate (x still carries the random jitter added above).
df_pc1 = pd.DataFrame(x.flatten(), index=df_matrix.index, columns=['PC1'])
df_pc1 = df_pc1.sort('PC1')
# figsize is not defined in this file; presumably it comes from the %pylab namespace -- TODO confirm
figure(figsize(12, 20))
# Horizontal position is the PC1 value, vertical position is the member's rank by PC1.
plot(df_pc1['PC1'], df_pc1.rank()['PC1'], 'd', markersize=10)
#yticks(df_pc1.rank()['PC1'], df_pc1.index)
# Label each marker with the member's name, slightly offset from the point.
for (_x, _y, _s) in zip(df_pc1['PC1'], df_pc1.rank()['PC1'], df_pc1.index):
    annotate(_s, (_x, _y), xytext=(_x + 0.5, _y - 0.2))
title('Spectrum from Principal Component 1')
# Hard-coded axis limits
axis([-32, 28, 0, 71])