%pylab inline # Import required libraries import os import csv import segeval as se import numpy as np import matplotlib.pyplot as plt import itertools as it from collections import defaultdict from decimal import Decimal from hcluster import linkage, dendrogram, fcluster # Document to analyse item_name = u'kublakhan' number_of_lines = 54 # Ordered list of coders (and numeric list of coders) used to relate # numbered cluster coders to other graphs coders = ['AWRAXV1RIYR0M', 'A23S6QOSZH9TMT', 'A21IFZJ0EDKM4E', 'AO3XB5I5QNNUI', 'A3RLCGRXA34GC0', 'A21SF3IKIZB0VN', 'APXNY64HXO08K', 'AM155T4U3RE1A', 'A2YBGZ2H2KSO5T'] labels = ['%i' % i for i in range(0, len(coders))] # Load segmentation dataset filepath = os.path.join('data', 'kubla_khan_fournier_2013.json') dataset = se.input_linear_mass_json(filepath) # Load labels segment_labels = dict() filepath = os.path.join('data', 'kubla_khan_fournier_2013', 'labels.csv') with open(filepath) as csv_file: reader = csv.reader(csv_file, delimiter=',') for row in reader: segment_labels[row[0]] = [item.strip() for item in row[1:]] # Compute boundaries boundaries = dict([(key, len(mass) - 1) for key, mass in dataset[item_name].items()]) coder_boundaries = [boundaries[coder] for coder in coders] # Compute similarities (1-B) similarities = se.boundary_similarity(dataset, one_minus=True) # Expand segment labels using the mass of each segment to create # a one to one mapping between line and segment label expanded_segment_labels = defaultdict(list) for coder in coders: masses = dataset[item_name][coder] coder_segment_labels = segment_labels[coder] expanded_segment = list() for mass, coder_segment_label in zip(masses, coder_segment_labels): expanded_segment.extend(list([coder_segment_label]) * mass) expanded_segment_labels[coder] = expanded_segment # Define label similarity function def jaccard(a, b): return float(len(a & b)) / float(len(a | b)) # Compute overall label Jaccard similarities per position total_similarities = list() 
# Per-line label agreement: for every line of the poem, compare each pair
# of coders' label sets with the Jaccard coefficient defined above.
row_similarities = list()
for i in xrange(0, number_of_lines):
    parts = list()
    for coder in coders:
        # Labels may be compound ("a/b"); split into a set of atomic labels
        parts.append(set(expanded_segment_labels[coder][i].split('/')))
    part_combinations = it.combinations(parts, 2)
    position_similarities = [jaccard(a, b) for a, b in part_combinations]
    total_similarities.extend(position_similarities)
    row_similarities.append(position_similarities)


def autolabel(rects, rotation=0, xpad=0):
    """Write each bar's height as text just above the bar.

    NOTE(review): writes onto whatever module-level ``ax`` is bound to at
    call time, not the axes the rects belong to -- callers must set ``ax``
    first.
    """
    # attach some text labels
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x()+rect.get_width()/2.+xpad, 1.05*height,
                '%.2f'%float(height),
                ha='center', va='bottom', rotation=rotation)


# Overall agreement statistics: mean 1-B over coder pairs, mean label
# Jaccard over all line/pair comparisons, and Fleiss' multi-coder pi.
similarity_values = [float(value) for value in similarities.values()]
mean_b = np.mean(similarity_values)
std_b = np.std(similarity_values)
mean_j = np.mean(total_similarities)
std_j = np.std(total_similarities)
print 'Mean B \t\t {0:.4f} +/- {1:.4f}, n={2}'.format(mean_b, std_b, len(similarity_values))
print 'Mean J \t\t {0:.4f} +/- {1:.4f}, n={2}'.format(mean_j, std_j, len(total_similarities))
print 'Fleiss\' Pi \t {0:.4f}'.format(se.fleiss_pi_linear(dataset))

# Order distances for clustering
coder_combinations = [list(a) for a in it.combinations(coders, 2)]
# NOTE(review): this loop reverses the list once per element; with
# C(9, 2) = 36 (even) iterations the net effect is a no-op, so it looks
# like leftover/debug code -- confirm before removing.
for coder_combination in coder_combinations:
    coder_combinations.reverse()
# similarities is keyed by 'item,coderA,coderB'; the coder order within a
# key is not guaranteed, so try both orders for each pair.
keys = list()
for a in coder_combinations:
    a = list(a)
    key = ','.join([item_name] + a)
    if key not in similarities:
        a.reverse()
        key = ','.join([item_name] + a)
    keys.append(key)
distances = [similarities[key] for key in keys]

# Cluster: complete-linkage agglomerative clustering on 1-B distances
aglomerative_clusters = linkage(distances, method='complete')
dendro = dendrogram(aglomerative_clusters, labels=labels)
plt.ylabel('Mean Distance (1-B)')
plt.xlabel('Coder')
plt.show(dendro)

# Clusters read manually off the dendrogram above; each key is the
# comma-joined numeric labels of the cluster's member coders.
cluster_members = {
    '0,2' : [coders[0], coders[2]],
    '1,0,2' : [coders[1], coders[0], coders[2]],
    '4,7' : [coders[4], coders[7]],
    '1,0,2,4,7' : [coders[1], coders[0], coders[2], coders[4], coders[7]],
    '6,8' : [coders[6], coders[8]],
    '5,6,8' : [coders[5], coders[6], coders[8]],
    '3,5,6,8' : [coders[3], coders[5], coders[6], coders[8]]
}

# Per-cluster agreement: Fleiss' pi, boundary similarity B (n_t=2), and
# per-line pairwise label Jaccard restricted to each cluster's members.
cluster_pi = dict()
cluster_b = dict()
cluster_j = dict()
for cluster, members in cluster_members.items():
    # Restrict the dataset to this cluster's coders only
    data = {coder : dataset[item_name][coder] for coder in members}
    dataset_subset = se.Dataset({item_name : data})
    cluster_b[cluster] = [float(value) for value in
                          se.boundary_similarity(dataset_subset, n_t=2).values()]
    cluster_pi[cluster] = float(se.fleiss_pi_linear(dataset_subset, n_t=2))
    # Same per-line pairwise Jaccard computation as above, members only
    position_j = list()
    for i in xrange(0, number_of_lines):
        parts = list()
        for coder in members:
            parts.append(set(expanded_segment_labels[coder][i].split('/')))
        part_combinations = it.combinations(parts, 2)
        position_similarities = [jaccard(a, b) for a, b in part_combinations]
        position_j.extend(position_similarities)
    cluster_j[cluster] = position_j

# Tab-separated summary table (short cluster names get an extra pad tab)
print 'Cluster\t\tPi\tB\t\t\tJ'
for cluster in cluster_members.keys():
    print '{0}\t{1:.4f}\t{2:.4f} +/- {3:.4f}, n={4}\t{5:.4f} +/- {6:.4f}, n={7}'.format(
        cluster if len(cluster) > 7 else cluster+'\t',
        np.mean(cluster_pi[cluster]),
        np.mean(cluster_b[cluster]), np.std(cluster_b[cluster]),
        len(cluster_b[cluster]),
        np.mean(cluster_j[cluster]), np.std(cluster_j[cluster]),
        len(cluster_j[cluster]))

# Collect per-cluster series for the grouped bar chart (same dict key
# order as the table above)
y = list()
y2 = list()
y2err = list()
y3 = list()
y3err = list()
for cluster in cluster_members.keys():
    y.append(float(cluster_pi[cluster]))
    y2.append(np.mean(cluster_b[cluster]))
    y2err.append(np.std(cluster_b[cluster]))
    y3.append(np.mean(cluster_j[cluster]))
    y3err.append(np.std(cluster_j[cluster]))

ind = np.arange(len(cluster_members))  # the x locations for the groups
width = 0.26  # the width of the bars
fig = plt.figure()
ax = fig.add_subplot(111)
# Three bars per cluster: pi, mean B (+/- std), mean J (+/- std)
rects1 = ax.bar(ind, y, width, color='0.25', ecolor='k')
rects2 = ax.bar(ind+width, y2, width, yerr=y2err, color='0.5', ecolor='k')
rects3 = ax.bar(ind+width*2, y3, width, yerr=y3err, color='0.75', ecolor='k')
# add some
ax.set_ylabel('Cluster similarity')
ax.set_xticks(ind + ((width * 3) / 2))
# NOTE(review): ``labels`` still holds the 9 coder numbers, not the 7
# cluster names; only the first 7 entries end up as tick text -- confirm
# this is the intended labelling.
ax.set_xticklabels(labels)
ax.set_xlim([-0.25,6.95])
ax.set_ylim([0,1])
ax.legend( (rects1[0], rects2[0], rects3[0]),
           ('$\kappa_{\mathrm{B}}$', 'E(B)', 'E(J)') )
autolabel(rects1, rotation=90, xpad=.03)
autolabel(rects2, rotation=90, xpad=.03)
autolabel(rects3, rotation=90, xpad=.03)

# Plot boundaries per coder
y = coder_boundaries
x = np.arange(len(y))
# Set up
width = 0.75
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
# Plot
rects = ax.bar(x, y, width, color='0.75')
# Add xticks
ax.set_xticks(x + (width / 2))
ax.set_xticklabels([str(val) for val in labels])
# Draw mean lines
xmin, xmax, ymean, ystd = -0.25, len(labels), np.mean(y), np.std(y)
ax.plot([xmin, xmax], [ymean] * 2, color='k')  # Draw mean
ax.plot([xmin, xmax], [ymean + ystd] * 2, color='0.5')  # Draw +std
ax.plot([xmin, xmax], [ymean - ystd] * 2, color='0.5')  # Draw -std
# Add numbers to bars
format_str='%d'
fnc_value=int
for rect in rects:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height,
            format_str%fnc_value(height),
            ha='center', va='bottom')
# Format
ax.set_xlim([-0.25, 9])
ax.set_ylim([0, 30])
ax.set_xlabel('Coder')
ax.set_ylabel('Boundaries placed (quantity)')
plt.show()

# Create heat map
# Mean/std of the pairwise label Jaccard similarities per poem line
y_sim = list()
y_sim_err = list()
for row_similarity in row_similarities:
    y_sim.append(np.mean(row_similarity))
    y_sim_err.append(np.std(row_similarity))

# Plot mean label similarity
# NOTE(review): ``labels`` is rebound here and no longer holds coder labels
labels = ['$%i$' % i for i in range(0, number_of_lines)]
y = list(y_sim)
x = range(0, number_of_lines)
plt.errorbar(x, y, color='k', )
# xlim/ylim are bare names from the %pylab inline namespace
xlim([0, number_of_lines - 1])
ylim([0, 1.05])
plt.ylabel('Mean Label Jaccard Similarity')
plt.xlabel('Line')
plt.show()

# Boundary frequency: count how many coders placed a boundary at each
# possible position between the poem's 54 lines.
# (Python 2: dict.values() returns a list, hence the [0] index.)
position_frequency = [0] * (sum(dataset['kublakhan'].values()[0]) - 1)
for segmentation in dataset['kublakhan'].values():
    position = 0
    # Running total over all but the last segment; each partial sum is a
    # boundary position.
    # NOTE(review): position can reach 54 - (final segment mass); this
    # would IndexError if any coder's final segment had mass 1 -- verify
    # against the data.
    for segment in segmentation[0:-1]:
        position += segment
        position_frequency[position] += 1
# Fraction of the 9 coders agreeing on each boundary (unused below)
position_boundary_sim = [float(value) / 9 for value in position_frequency]

# Create heat map
y = position_frequency
# Plot mean label similarity
labels = ['$%i$' % i for i in range(0, number_of_lines)]
x = range(0, number_of_lines - 1)
plt.errorbar(x, y, color='k', )
xlim([0, 52.0])
ylim([0, 10])
plt.ylabel('Boundary Frequency')
plt.xlabel('Line')
plt.show()