%pylab inline # Import required libraries import os import csv import segeval as se import numpy as np import matplotlib.pyplot as plt import itertools as it from collections import defaultdict from decimal import Decimal from hcluster import linkage, dendrogram, fcluster # Document to analyse item_name = u'kublakhan' number_of_lines = 54 # Ordered list of coders (and numeric list of coders) used to relate # numbered cluster coders to other graphs coders = ['AWRAXV1RIYR0M', 'A23S6QOSZH9TMT', 'A21IFZJ0EDKM4E', 'AO3XB5I5QNNUI', 'A3RLCGRXA34GC0', 'A21SF3IKIZB0VN', 'APXNY64HXO08K', 'AM155T4U3RE1A', 'A2YBGZ2H2KSO5T'] labels = ['%i' % i for i in range(0, len(coders))] # Load segmentation dataset filepath = os.path.join('data', 'kubla_khan_fournier_2013.json') dataset = se.input_linear_mass_json(filepath) # Load labels segment_labels = dict() filepath = os.path.join('data', 'kubla_khan_fournier_2013', 'labels.csv') with open(filepath) as csv_file: reader = csv.reader(csv_file, delimiter=',') for row in reader: segment_labels[row[0]] = [item.strip() for item in row[1:]] # Compute boundaries boundaries = dict([(key, len(mass) - 1) for key, mass in dataset[item_name].items()]) coder_boundaries = [boundaries[coder] for coder in coders] # Compute similarities (1-B) similarities = se.boundary_similarity(dataset, one_minus=True) # Expand segment labels using the mass of each segment to create # a one to one mapping between line and segment label expanded_segment_labels = defaultdict(list) for coder in coders: masses = dataset[item_name][coder] coder_segment_labels = segment_labels[coder] expanded_segment = list() for mass, coder_segment_label in zip(masses, coder_segment_labels): expanded_segment.extend(list([coder_segment_label]) * mass) expanded_segment_labels[coder] = expanded_segment # Define label similarity function def jaccard(a, b): return float(len(a & b)) / float(len(a | b)) # Compute overall label Jaccard similarities per position total_similarities = list() 
# Per-line label agreement: for every line of the poem, compare each pair
# of coders' label sets with the Jaccard coefficient defined above.
row_similarities = list()
for i in xrange(0, number_of_lines):
    parts = list()
    for coder in coders:
        # Labels may be compound ("a/b"); split into a set of atomic labels
        parts.append(set(expanded_segment_labels[coder][i].split('/')))
    part_combinations = it.combinations(parts, 2)
    position_similarities = [jaccard(a, b) for a, b in part_combinations]
    total_similarities.extend(position_similarities)
    row_similarities.append(position_similarities)


def autolabel(rects, rotation=0, xpad=0):
    """Write each bar's height as text just above the bar.

    NOTE(review): writes onto whatever module-level ``ax`` is bound to at
    call time, not the axes the rects belong to -- callers must set ``ax``
    first.
    """
    # attach some text labels
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x()+rect.get_width()/2.+xpad, 1.05*height,
                '%.2f'%float(height),
                ha='center', va='bottom', rotation=rotation)


# Overall agreement statistics: mean 1-B over coder pairs, mean label
# Jaccard over all line/pair comparisons, and Fleiss' multi-coder pi.
similarity_values = [float(value) for value in similarities.values()]
mean_b = np.mean(similarity_values)
std_b = np.std(similarity_values)
mean_j = np.mean(total_similarities)
std_j = np.std(total_similarities)
print 'Mean B \t\t {0:.4f} +/- {1:.4f}, n={2}'.format(mean_b, std_b, len(similarity_values))
print 'Mean J \t\t {0:.4f} +/- {1:.4f}, n={2}'.format(mean_j, std_j, len(total_similarities))
print 'Fleiss\' Pi \t {0:.4f}'.format(se.fleiss_pi_linear(dataset))

# Order distances for clustering
coder_combinations = [list(a) for a in it.combinations(coders, 2)]
# NOTE(review): this loop reverses the list once per element; with
# C(9, 2) = 36 (even) iterations the net effect is a no-op, so it looks
# like leftover/debug code -- confirm before removing.
for coder_combination in coder_combinations:
    coder_combinations.reverse()
# similarities is keyed by 'item,coderA,coderB'; the coder order within a
# key is not guaranteed, so try both orders for each pair.
keys = list()
for a in coder_combinations:
    a = list(a)
    key = ','.join([item_name] + a)
    if key not in similarities:
        a.reverse()
        key = ','.join([item_name] + a)
    keys.append(key)
distances = [similarities[key] for key in keys]

# Cluster: complete-linkage agglomerative clustering on 1-B distances
aglomerative_clusters = linkage(distances, method='complete')
dendro = dendrogram(aglomerative_clusters, labels=labels)
plt.ylabel('Mean Distance (1-B)')
plt.xlabel('Coder')
plt.show(dendro)

# Clusters read manually off the dendrogram above; each key is the
# comma-joined numeric labels of the cluster's member coders.
cluster_members = {
    '0,2' : [coders[0], coders[2]],
    '1,0,2' : [coders[1], coders[0], coders[2]],
    '4,7' : [coders[4], coders[7]],
    '1,0,2,4,7' : [coders[1], coders[0], coders[2], coders[4], coders[7]],
    '6,8' : [coders[6], coders[8]],
    '5,6,8' : [coders[5], coders[6], coders[8]],
    '3,5,6,8' : [coders[3], coders[5], coders[6], coders[8]]
}

# Per-cluster agreement: Fleiss' pi, boundary similarity B (n_t=2), and
# per-line pairwise label Jaccard restricted to each cluster's members.
cluster_pi = dict()
cluster_b = dict()
cluster_j = dict()
for cluster, members in cluster_members.items():
    # Restrict the dataset to this cluster's coders only
    data = {coder : dataset[item_name][coder] for coder in members}
    dataset_subset = se.Dataset({item_name : data})
    cluster_b[cluster] = [float(value) for value in
                          se.boundary_similarity(dataset_subset, n_t=2).values()]
    cluster_pi[cluster] = float(se.fleiss_pi_linear(dataset_subset, n_t=2))
    # Same per-line pairwise Jaccard computation as above, members only
    position_j = list()
    for i in xrange(0, number_of_lines):
        parts = list()
        for coder in members:
            parts.append(set(expanded_segment_labels[coder][i].split('/')))
        part_combinations = it.combinations(parts, 2)
        position_similarities = [jaccard(a, b) for a, b in part_combinations]
        position_j.extend(position_similarities)
    cluster_j[cluster] = position_j

# Tab-separated summary table (short cluster names get an extra pad tab)
print 'Cluster\t\tPi\tB\t\t\tJ'
for cluster in cluster_members.keys():
    print '{0}\t{1:.4f}\t{2:.4f} +/- {3:.4f}, n={4}\t{5:.4f} +/- {6:.4f}, n={7}'.format(
        cluster if len(cluster) > 7 else cluster+'\t',
        np.mean(cluster_pi[cluster]),
        np.mean(cluster_b[cluster]), np.std(cluster_b[cluster]),
        len(cluster_b[cluster]),
        np.mean(cluster_j[cluster]), np.std(cluster_j[cluster]),
        len(cluster_j[cluster]))

# Collect per-cluster series for the grouped bar chart (same dict key
# order as the table above)
y = list()
y2 = list()
y2err = list()
y3 = list()
y3err = list()
for cluster in cluster_members.keys():
    y.append(float(cluster_pi[cluster]))
    y2.append(np.mean(cluster_b[cluster]))
    y2err.append(np.std(cluster_b[cluster]))
    y3.append(np.mean(cluster_j[cluster]))
    y3err.append(np.std(cluster_j[cluster]))

ind = np.arange(len(cluster_members))  # the x locations for the groups
width = 0.26  # the width of the bars
fig = plt.figure()
ax = fig.add_subplot(111)
# Three bars per cluster: pi, mean B (+/- std), mean J (+/- std)
rects1 = ax.bar(ind, y, width, color='0.25', ecolor='k')
rects2 = ax.bar(ind+width, y2, width, yerr=y2err, color='0.5', ecolor='k')
rects3 = ax.bar(ind+width*2, y3, width, yerr=y3err, color='0.75', ecolor='k')
# add some
ax.set_ylabel('Cluster similarity')
ax.set_xticks(ind + ((width * 3) / 2))
# NOTE(review): ``labels`` still holds the 9 coder numbers, not the 7
# cluster names; only the first 7 entries end up as tick text -- confirm
# this is the intended labelling.
ax.set_xticklabels(labels)
ax.set_xlim([-0.25,6.95])
ax.set_ylim([0,1])
ax.legend( (rects1[0], rects2[0], rects3[0]),
           ('$\kappa_{\mathrm{B}}$', 'E(B)', 'E(J)') )
autolabel(rects1, rotation=90, xpad=.03)
autolabel(rects2, rotation=90, xpad=.03)
autolabel(rects3, rotation=90, xpad=.03)

# Plot boundaries per coder
y = coder_boundaries
x = np.arange(len(y))
# Set up
width = 0.75
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
# Plot
rects = ax.bar(x, y, width, color='0.75')
# Add xticks
ax.set_xticks(x + (width / 2))
ax.set_xticklabels([str(val) for val in labels])
# Draw mean lines
xmin, xmax, ymean, ystd = -0.25, len(labels), np.mean(y), np.std(y)
ax.plot([xmin, xmax], [ymean] * 2, color='k')  # Draw mean
ax.plot([xmin, xmax], [ymean + ystd] * 2, color='0.5')  # Draw +std
ax.plot([xmin, xmax], [ymean - ystd] * 2, color='0.5')  # Draw -std
# Add numbers to bars
format_str='%d'
fnc_value=int
for rect in rects:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height,
            format_str%fnc_value(height),
            ha='center', va='bottom')
# Format
ax.set_xlim([-0.25, 9])
ax.set_ylim([0, 30])
ax.set_xlabel('Coder')
ax.set_ylabel('Boundaries placed (quantity)')
plt.show()

# Create heat map
# Mean/std of the pairwise label Jaccard similarities per poem line
y_sim = list()
y_sim_err = list()
for row_similarity in row_similarities:
    y_sim.append(np.mean(row_similarity))
    y_sim_err.append(np.std(row_similarity))

# Plot mean label similarity
# NOTE(review): ``labels`` is rebound here and no longer holds coder labels
labels = ['$%i$' % i for i in range(0, number_of_lines)]
y = list(y_sim)
x = range(0, number_of_lines)
plt.errorbar(x, y, color='k', )
# xlim/ylim are bare names from the %pylab inline namespace
xlim([0, number_of_lines - 1])
ylim([0, 1.05])
plt.ylabel('Mean Label Jaccard Similarity')
plt.xlabel('Line')
plt.show()

# Boundary frequency: count how many coders placed a boundary at each
# possible position between the poem's 54 lines.
# (Python 2: dict.values() returns a list, hence the [0] index.)
position_frequency = [0] * (sum(dataset['kublakhan'].values()[0]) - 1)
for segmentation in dataset['kublakhan'].values():
    position = 0
    # Running total over all but the last segment; each partial sum is a
    # boundary position.
    # NOTE(review): position can reach 54 - (final segment mass); this
    # would IndexError if any coder's final segment had mass 1 -- verify
    # against the data.
    for segment in segmentation[0:-1]:
        position += segment
        position_frequency[position] += 1
# Fraction of the 9 coders agreeing on each boundary (unused below)
position_boundary_sim = [float(value) / 9 for value in position_frequency]

# Create heat map
y = position_frequency
# Plot mean label similarity
labels = ['$%i$' % i for i in range(0, number_of_lines)]
x = range(0, number_of_lines - 1)
plt.errorbar(x, y, color='k', )
xlim([0, 52.0])
ylim([0, 10])
plt.ylabel('Boundary Frequency')
plt.xlabel('Line')
plt.show()