%pylab inline
%reset
Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.zmq.pylab.backend_inline]. For more information, type 'help(pylab)'.
import datetime
import numpy as np
import pandas as pd
import matplotlib as mpl
import sys, os, re
# Locations: charts are written to a "charts" folder beneath the working directory.
BASE_DIR = os.getcwd()
CHART_DIR = os.path.join(BASE_DIR, "charts")

# Settings specific to this particular SMS backup and its owner.
SMS_FILE = "../backups/sms-20130317190417.csv"
MY_NAME = "Jeremy"

# A "day" is considered to end at 3am rather than at midnight.
HOUR_SHIFT = 3

# Neutral shades.
WHITE = "#FFFFFF"
LIGHT_GRAY = "#E5E5E5"
GRAY = "#777777"
BLACK = "#000000"

# Accent palette via http://colorbrewer2.com/
ORANGE = "#FF6600"
RED = "#D7191C"
BLUE = "#2C7BB6"
YELLOW = "#FFFFBF"
LIGHT_BLUE = "#ABD9E9"
COLORS = [ORANGE, RED, BLUE, YELLOW, LIGHT_BLUE]

# Opacity presets shared by the charts.
HIGH_ALPHA = 0.9
MEDIUM_ALPHA = 0.5
LOW_ALPHA = 0.1

# Shorthand for converting a color (plus optional alpha) to an RGBA tuple.
rgba = mpl.colors.colorConverter.to_rgba
def set_styles(style_dict):
    """Apply matplotlib rc settings from a nested dictionary.

    Each top-level key names an rc group and maps to a dictionary of
    keyword settings that is forwarded to ``mpl.rc`` for that group.
    """
    for group in style_dict:
        options = style_dict[group]
        mpl.rc(group, **options)
# Global chart styling, applied once through set_styles().
# Styles mostly derived from https://github.com/tonysyu/mpltools/blob/master/mpltools/style/ggplot.rc
# NOTE(review): "color_cycle" is an rc key from older matplotlib releases; newer
# releases are believed to use "prop_cycle" instead -- confirm against the installed version.
set_styles({
"figure": { "figsize": [ 12, 8 ], "facecolor": WHITE },
"savefig": { "dpi": 100, "bbox": "tight" },
"patch": { "linewidth": 0.5, "facecolor": ORANGE, "edgecolor": WHITE, "antialiased": True },
"font": { "size": 12 },
"legend": { "fontsize": 10 },
"axes": {
"facecolor": LIGHT_GRAY,
"edgecolor": WHITE,
"linewidth": 1,
"grid": True,
"titlesize": "large",
"labelsize": "large",
"labelcolor": GRAY,
"axisbelow": True,
"color_cycle": COLORS
},
"xtick": { "color": GRAY, "direction": "out" },
"ytick": { "color": GRAY, "direction": "out" },
"grid": { "color": WHITE, "linestyle": "-" }
})
def savechart(filename):
    """Save the currently open chart as a PNG inside the chart directory.

    ``filename`` is given without extension and may contain a sub-folder
    (for example ``"personal/foo"``). The destination folder is created
    when it does not exist yet, so saving never fails merely because the
    chart directory tree has not been prepared by hand.
    """
    target = os.path.join(CHART_DIR, filename + ".png")
    target_dir = os.path.dirname(target)
    # isdir/makedirs rather than exist_ok= so this also runs on Python 2.
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    mpl.pyplot.savefig(target, bbox_inches='tight')
# Module-level shorthand for matplotlib's pyplot.close, called before each new chart is drawn.
close = mpl.pyplot.close
def humancount(x):
    """Abbreviate a large number into something more readable.

    Values are scaled by powers of one thousand and suffixed with "k"
    (thousands) or "m" (millions); e.g. 1500 -> "1.5k", 20000 -> "20k".
    Whole results are printed without decimals; fractional results get as
    many decimals as the power of one thousand used (one for "k", two for "m").

    Values of a billion or more stay expressed in millions instead of
    overrunning the suffix table, negative values are abbreviated by
    magnitude and prefixed with "-", and zero returns "0".
    """
    if x == 0:
        return "0"
    if x < 0:
        return "-" + humancount(-x)
    abbvs = ["", "k", "m"]
    magnitude = int(np.log10(x))
    # Floor division keeps this an integer on both Python 2 and 3; the clamp
    # keeps the index inside the suffix table for tiny and huge values alike.
    thousands = min(max(magnitude // 3, 0), len(abbvs) - 1)
    divisor = pow(1000, thousands)
    divided = float(x) / divisor
    template = ("%." + str(thousands) + "f") if divided % 1 > 0 else "%d"
    return (template % divided) + abbvs[thousands]
def get_step(max_val):
    """Given a max value, return a good step size for y-axis tick marks.

    The step is one order of magnitude below ``max_val``; when that would
    yield 25 or more ticks it is multiplied by five. For ``max_val`` below
    10 the step is fractional (0.1 or 0.5). A non-positive ``max_val`` has
    no logarithm, so a step of 1 is returned instead of raising.
    """
    if max_val <= 0:
        return 1
    magnitude = int(np.log10(max_val))
    step = pow(10, magnitude - 1)
    return step if max_val / step < 25 else step * 5
def pad_time_axis(axis):
    """Widen the view interval of an x-axis by half a unit on each side."""
    padding = 0.5
    interval = axis.get_view_interval()
    lower = interval[0] - padding
    upper = interval[1] + padding
    axis.set_view_interval(lower, upper)
# Load the SMS backup, keeping only the columns used by the analysis.
COLS = ["contact_name", "date", "body", "type" ]
msgs = pd.read_csv(os.path.join(BASE_DIR, SMS_FILE))[COLS]
# Drop messages whose contact could not be resolved to a name.
msgs = msgs[msgs["contact_name"] != "(Unknown)"]
# Keep only message types below 3 (type 1 is later labelled "received", everything else "sent").
msgs = msgs[msgs["type"] < 3]
# "date" is divided by 1000 before conversion -- presumably epoch milliseconds; TODO confirm.
msgs["datetime"] = msgs["date"].apply(lambda x: datetime.datetime.fromtimestamp(x / 1000))
# Restrict the analysis to the 365 days starting at the first row's timestamp.
# NOTE(review): assumes the rows are in chronological order -- confirm.
FIRST_TIMESTAMP = msgs["datetime"].iget(0)
LAST_TIMESTAMP = FIRST_TIMESTAMP + datetime.timedelta(days=365)
msgs = msgs[msgs["datetime"] < LAST_TIMESTAMP]
# Hour of day -- 12am as 0, 1am as 1, ..., 1pm as 13, ..., 11pm as 23
msgs["hour"] = msgs["datetime"].apply(lambda x: x.hour)
# Day of week -- with Monday as 0, Tuesday as 1...
msgs["weekday"] = msgs["datetime"].apply(lambda x: x.weekday())
# Most recent Monday
msgs["week"] = pd.DatetimeIndex(msgs["datetime"].apply(lambda x: x.date() - datetime.timedelta(days=x.weekday())))
# First day of month
msgs["month"] = pd.DatetimeIndex(msgs["datetime"].apply(lambda x: pd.Period(x, freq="M").to_timestamp()))
# Daily, weekly (W-MON frequency) and monthly period ranges spanning the analysis window.
DATE_RANGE, WEEK_RANGE, MONTH_RANGE = (pd.period_range(FIRST_TIMESTAMP, LAST_TIMESTAMP, freq=x) for x in ("D", "W-MON", "M"))
I add a message multiplier ("160s") because my phone does not split longer messages to or from other Verizon accounts into separate 160-character messages. In some analyses this matters: we need a common metric for all messages, regardless of carrier.
# Direction of each message: type 1 is "received", any other remaining type is "sent".
msgs["direction"] = msgs["type"].apply(lambda x: "received" if x == 1 else "sent")
# Length of each message body in characters.
msgs["chars"] = msgs["body"].apply(len).astype(int)
# Number of 160-character SMS segments needed for each message. Ceiling
# division (written with // so it behaves the same on Python 2 and 3) makes
# a message of exactly 160 characters count as one segment, not two; an
# empty body still counts as one message.
msgs["160s"] = msgs["chars"].apply(lambda x: max(1, -(-x // 160)))
# Capitalised display names for each metric column (chart titles and labels).
THINGS = { "chars": "Characters", "160s": "Messages" }
# Per-direction subsets and per-contact groupings reused by the charts below.
sent, received = (msgs[msgs["direction"] == x] for x in ("sent", "received"))
people = msgs.groupby("contact_name")
people_directions = msgs.groupby([ "contact_name", "direction" ])
# Lower-case metric names used in the printed summary.
METRICS = { "160s": "messages", "chars": "characters" }
GROUPINGS = { "all": msgs, "sent": sent, "received": received }
# TOTALS[metric][grouping] -> sum of that metric over that grouping.
TOTALS = dict((m, dict((key, GROUPINGS[key][m].sum()) for key in GROUPINGS)) for m in METRICS)
def print_basic_stats(per_page=2000):
    """Print headline statistics about the analysed messages.

    Covers the date window, number of contacts, sent/received/total counts
    for every metric in METRICS (with per-day averages), the equivalent
    number of printed pages at ``per_page`` characters each, average
    characters per message, and the busiest sending days.
    """
    date_format = "%B %d, %Y"
    n_days = (LAST_TIMESTAMP - FIRST_TIMESTAMP).days

    def summarize(m):
        # One paragraph of sent/received/total lines for a single metric.
        totals = TOTALS[m]
        lines = [
            "I sent %d *%s*, or %0.1f per day." % (totals["sent"], METRICS[m], 1.0*totals["sent"]/n_days),
            "I received %d, or %0.1f per day." % (totals["received"], 1.0*totals["received"]/n_days),
            "That's %d in total, or %0.1f per day." % (totals["all"], 1.0*totals["all"]/n_days),
            "For every 100 %s I sent, I received about %d." % (METRICS[m], round(100.0 * totals["received"] / totals["sent"], 0)),
        ]
        return "\n".join(lines)

    def busiest_day(grouped, m):
        # (date, total) row for the day with the largest sum of metric m.
        return grouped[m].sum().order(ascending=False).reset_index().values[0]

    window = (FIRST_TIMESTAMP.strftime(date_format), LAST_TIMESTAMP.strftime(date_format))
    print("Between %s and %s ...\n" % window)
    print("I exchanged text messages with %d known contacts.\n" % len(msgs["contact_name"].unique()))
    print("\n\n".join(summarize(m) for m in METRICS) + "\n")
    pages = TOTALS["chars"]["sent"]/per_page
    print("Given ~%d characters per printed page, I wrote ~%d pages' worth of text messages during this time.\n" % (per_page, pages))
    chars_per_msg = tuple(round(TOTALS["chars"][x] * 1.0 / TOTALS["160s"][x], 0) for x in ("sent", "received"))
    print("I sent %d chars/msg. I received %d chars/msg.\n" % chars_per_msg)

    by_date = pd.DataFrame({
        "date": sent["datetime"].apply(lambda x: x.date()),
        "160s": sent["160s"],
        "chars": sent["chars"],
    }).groupby("date")
    top_msgs = busiest_day(by_date, "160s")
    top_chars = busiest_day(by_date, "chars")
    print("The most messages I ever sent in a day was %d, on %s.\n" % (top_msgs[1], top_msgs[0].strftime(date_format)))
    print("The most total characters I ever sent was %d, on %s.\n" % (top_chars[1], top_chars[0].strftime(date_format)))
# Print the summary statistics using the default page size.
print_basic_stats()
Between March 13, 2012 and March 13, 2013 ... I exchanged text messages with 95 known contacts. I sent 1514 *messages*, or 4.1 per day. I received 1779, or 4.9 per day. That's 3293 in total, or 9.0 per day. For every 100 messages I sent, I received about 118. I sent 104406 *characters*, or 286.0 per day. I received 92610, or 253.7 per day. That's 197016 in total, or 539.8 per day. For every 100 characters I sent, I received about 89. Given ~2000 characters per printed page, I wrote ~52 pages' worth of text messages during this time. I sent 69 chars/msg. I received 52 chars/msg. The most messages I ever sent in a day was 48, on March 10, 2013. The most total characters I ever sent was 3539, on March 10, 2013.
def scatter_all(contact_name=False, color_direction=False):
    """Scatter every message: calendar day on the x-axis, time of day on the y-axis.

    The day is shifted by HOUR_SHIFT hours, so the vertical axis runs from
    the shifted start of the day to the same hour on the next day.

    contact_name    -- when given, only that contact's messages are emphasised.
    color_direction -- when true, messages are coloured by direction (sent in
                       blue, received in red) and other contacts are hidden.
    """
    shift = datetime.timedelta(hours=HOUR_SHIFT)

    def get_color(name, direction):
        # Colour scheme depends on whether this message belongs to the highlighted contact.
        is_focus = not contact_name or contact_name == name
        if not color_direction:
            return rgba(ORANGE, MEDIUM_ALPHA) if is_focus else rgba(GRAY, LOW_ALPHA)
        if not is_focus:
            return rgba(WHITE, 0)
        return rgba((BLUE if direction == "sent" else RED), MEDIUM_ALPHA)

    def to_seconds(x):
        # Seconds elapsed since the (shifted) day "began".
        shifted = x - shift
        morn = datetime.datetime.combine(shifted, datetime.time(0))
        return (shifted - morn).seconds

    colors = []
    for name, direction in msgs[["contact_name", "direction"]].values:
        colors.append(get_color(name, direction))

    days = msgs["datetime"].apply(lambda x: (x - shift).date())
    seconds = msgs["datetime"].apply(to_seconds)

    # Create and label plot
    close()
    points = mpl.pyplot.scatter(
        days,
        seconds,
        c = colors,
        marker = "x",
        linewidth = 1.25 if contact_name else 1,
        norm = True,
        s = 20
    )
    plot = points.axes
    if contact_name:
        t1 = "Every Message To/From %s" % contact_name
    else:
        t1 = "Every Message"
    t2 = "From %s in Blue, To %s in Red" % (MY_NAME, MY_NAME) if color_direction else None
    plot.set_title(u" • ".join(x for x in [ t1, t2 ] if x) + "\n")

    # X-axis: one tick per month in the analysis window.
    month_starts = MONTH_RANGE.to_datetime()
    plot.set_xticks(month_starts)
    plot.set_xticklabels([ x.strftime("%b\n%Y") for x in MONTH_RANGE.to_datetime() ])

    # Y-axis: one tick per hour, labelled with the un-shifted clock hour.
    TOTAL_SECONDS = 60 * 60 * 24
    SECONDS_TO_SHOW = range(0, TOTAL_SECONDS + 1, 60 * 60)
    HOURS_TO_SHOW = [ datetime.time(((s // 3600) + HOUR_SHIFT) % 24) for s in SECONDS_TO_SHOW ]
    plot.set_ylim(TOTAL_SECONDS * -0.01, TOTAL_SECONDS * 1.01)
    plot.set_yticks(SECONDS_TO_SHOW)
    plot.set_yticklabels([ hour.strftime("%I %p").lstrip("0") for hour in HOURS_TO_SHOW ])
# Render the all-messages scatter twice: single-colour, then coloured by direction.
scatter_all()
savechart("all-messages")
scatter_all(color_direction=True)
savechart("all-messages-directional")
def save_contact_scatters(threshold=20):
    """Save two scatter charts for every contact with more than ``threshold`` rows.

    One chart shows the contact highlighted among all messages ("-AMONG"),
    the other shows only that contact coloured by direction ("-SOLO").
    File names are the lower-cased contact name with every run of characters
    other than ".", a-z and 0-9 replaced by "-".
    """
    replacer = re.compile("[^\.a-z0-9]+")
    above_threshold = [ name for name, group in people if len(group) > threshold ]
    print("Generating charts for %d people." % len(above_threshold))
    for name in above_threshold:
        filename = replacer.sub("-", name.lower())
        scatter_all(name)
        savechart("personal/%s-AMONG" % filename)
        scatter_all(name, color_direction=True)
        savechart("personal/%s-SOLO" % filename)
    close()
# Generate the per-contact charts with the default threshold.
save_contact_scatters()
Generating charts for 35 people.
def plot_over_time(col, metric="160s", annotations=None):
    """Plot the number of messages sent + received for each week or month.

    col         -- grouping column of the message table ("week" or "month").
    metric      -- metric column to sum; must be a key of THINGS.
    annotations -- optional iterable of (week, text[, adjust_x[, adjust_y]])
                   tuples; each adds an arrow annotation at that week.
    """
    # None instead of a shared mutable list as the default argument.
    if annotations is None:
        annotations = []

    # Note: slicing off first and last (partial) weeks.
    all_counts, sent_counts, received_counts = (
        selection.groupby(col)[metric].sum()[1:-1]
        for selection in (msgs, sent, received))
    MAX_COUNT = int(max(all_counts))

    # Create and label plot
    close()
    plot = all_counts.plot(lw=2, label="All")
    sent_counts.plot(color=rgba(BLUE, 0.5), label="Sent")
    received_counts.plot(color=rgba(RED, 0.5), label="Received")
    thing = THINGS[metric]
    suffix = " (Begins on Monday)" if col == "week" else ""
    plot.set_title("%s per %s" % (thing, col.capitalize()) + suffix + "\n")
    plot.set_xlabel("")
    plot.set_ylabel(thing + " Sent + Received\n")
    leg = plot.legend(loc='upper left')
    leg.get_frame().set_alpha(0)

    # Set ticks. The tick range is derived from the same upper bound as the
    # y-limit so that the top of the axis is never left without tick marks.
    step = get_step(MAX_COUNT)
    y_top = int(MAX_COUNT * 1.1)
    yticks = list(range(0, y_top + step, step))
    plot.set_yticks(yticks)
    plot.set_yticklabels([ humancount(y) for y in yticks ])
    plot.set_ylim(0, y_top)
    pad_time_axis(plot.xaxis)

    def annotate_week(week, text, adjust_x=0, adjust_y=0):
        """Add an annotation to any week.

        adjust_x shifts the label by that many weeks; adjust_y shifts it
        vertically by that many hundredths of MAX_COUNT.
        """
        timestamp = pd.Timestamp(week)
        plot.annotate(text,
            xy=(timestamp, all_counts[timestamp]),
            xytext=(timestamp + datetime.timedelta(weeks=adjust_x), all_counts[timestamp] + adjust_y * MAX_COUNT / 100),
            horizontalalignment="center",
            arrowprops=dict(arrowstyle="-|>", color="black"))

    for a in annotations:
        annotate_week(*a)
# Monthly charts for both metrics.
plot_over_time("month", metric="160s")
savechart("monthly-messages")
plot_over_time("month", metric="chars")
savechart("monthly-characters")
# Weekly charts, annotated. Each tuple is (week start date, label, x shift in weeks, y shift in % of max).
annotations = [
("2012-10-29", "Hurricane Sandy", -6, 0),
("2012-12-24", "Home for\nthe holidays.", 0, 10),
("2013-01-28", "Advised\nfriend on a\n relationship.", 0, 10)
]
plot_over_time("week", metric="160s", annotations=annotations)
savechart("weekly-messages-labeled")
plot_over_time("week", metric="chars", annotations=annotations)
savechart("weekly-characters-labeled")
def plot_weekdaily(metric="160s"):
    """Plot the average number of messages sent + received per day of the week.

    metric -- metric column to average; must be a key of THINGS.
    """
    # Calculate the number of each weekday in our date range, in order to calculate the averages.
    weekday_counts = pd.Series(DATE_RANGE.to_datetime()).apply(lambda x: x.weekday()).value_counts()
    all_per_day, sent_per_day, received_per_day = (
        1.0 * selection.groupby("weekday")[metric].sum() / weekday_counts
        for selection in (msgs, sent, received))

    # Create and label plot
    close()
    plot = all_per_day.plot(lw=2, marker=".", markersize=10, label="All")
    sent_per_day.plot(marker=".", markersize=5, label="Sent", c=rgba(BLUE, 0.5))
    received_per_day.plot(marker=".", markersize=5, label="Received", c=rgba(RED, 0.5))
    plot.margins(0.1, 0.1)
    thing = THINGS[metric]
    plot.set_title("Average # of " + thing + " by Day of Week\n")
    plot.set_xlabel("\nDay of Week")
    plot.set_ylabel("Average # of " + thing + "\n")
    leg = plot.legend(loc='upper left')
    leg.get_frame().set_alpha(0)

    # Set x-axis details
    plot.set_xticks(range(0, len(pd.datetools.DAYS)))
    plot.set_xticklabels(pd.Series(pd.datetools.DAYS[x] for x in range(7)))

    # Set y-axis details. Both the limit and the ticks leave 10% headroom
    # above the tallest point, consistent with the other time-series charts.
    MAX_COUNT = all_per_day.max()
    step = get_step(MAX_COUNT)
    plot.set_ylim(0, MAX_COUNT * 1.1)
    # np.arange rather than range(): get_step returns a fractional step for
    # averages below 10, which range() cannot accept.
    plot.set_yticks(np.arange(0, int(MAX_COUNT * 1.1) + 1, step))
# Day-of-week averages for both metrics.
plot_weekdaily()
savechart("weekdaily-messages")
plot_weekdaily("chars")
savechart("weekdaily-characters")
def plot_hourly(metric="160s", secondary_metric=False):
    """Plot the share of all traffic that falls into each hour of the day.

    metric           -- metric column drawn as the main line.
    secondary_metric -- optional second metric column drawn as a faint line;
                        omitted when falsy (the default).

    Hours are ordered so that the day starts at HOUR_SHIFT.
    """
    def get_pcts(column):
        # Fraction of the column's grand total contributed by each hour.
        hourly_counts = msgs.groupby("hour")[column].sum()
        return 1.0 * hourly_counts / hourly_counts.sum()

    def fill_hours(data):
        # Fill in 0s for hour(s) with no SMSes
        return pd.Series([ data[x] if x in data else 0 for x in all_hours ], index=all_hours)

    def label_for(column):
        # Legend label: display name of the metric, or the column name itself.
        return THINGS.get(column, column)

    # list(...) so the concatenation also works where range() is lazy.
    all_hours = list(range(HOUR_SHIFT, 24)) + list(range(0, HOUR_SHIFT))
    hourly_pcts = get_pcts(metric)
    all_hourly_pcts = fill_hours(hourly_pcts)

    # Create and label plot
    close()
    plot = all_hourly_pcts.plot(lw=2, marker=".", markersize=10, label=label_for(metric))
    # Only draw the secondary line when one was requested; the default
    # (False) is not a column and used to raise on the groupby lookup.
    if secondary_metric:
        fill_hours(get_pcts(secondary_metric)).plot(c=rgba(BLACK, 0.25), label=label_for(secondary_metric))
    plot.set_title("Hourly Distribution\n")
    plot.set_xlabel("Hour Beginning at...")
    plot.set_ylabel("Percentage of Total Sent + Received")
    leg = plot.legend(loc='upper left')
    leg.get_frame().set_alpha(0)

    # Set x-axis details
    plot.set_xlim(min(all_hours) - 0.5, max(all_hours) + 0.5)
    xticks = plot.set_xticks(all_hours)
    for x in xticks:
        x.label.set_fontsize(10)
    plot.set_xticklabels([
        datetime.datetime.strptime(str((x + HOUR_SHIFT) % 24), "%H").strftime("%I%p").lstrip("0").lower().rstrip("m")
        for x in all_hours ])

    # Set y-axis details
    max_pct = hourly_pcts.max()
    plot.set_ylim(-max_pct * 0.05, max_pct * 1.05)
    yticks = np.arange(0, max_pct * 1.1, 0.01)
    plot.set_yticks(yticks)
    plot.set_yticklabels([ "%d%%" % (y * 100) for y in yticks ])
# Hourly distribution of messages, with characters as the secondary line.
plot_hourly(metric="160s", secondary_metric="chars")
savechart("hourly")
def plot_monthly_uniques():
    """Plot how many distinct contacts were sent a message in each month.

    The first and last months of the grouped series are sliced off.
    """
    def count_distinct(names):
        return len(names.unique())

    outgoing = msgs[msgs.direction == "sent"]
    recipients_by_month = outgoing.groupby("month")["contact_name"]
    monthly_uniques = recipients_by_month.agg(count_distinct)[1:-1]

    close()
    axes = monthly_uniques.plot(marker=".", markersize=10, lw=2)
    axes.set_title("Unique SMS Recipients per Month\n")
    axes.set_xlabel("")
    axes.set_ylabel("Number of Unique SMS Recipients\n")
    axes.set_ylim(0, monthly_uniques.max() * 1.1)
    # Pad the x-axis by half a unit on both sides.
    left = axes.get_xlim()[0]
    right = axes.get_xlim()[1]
    axes.set_xlim(left - 0.5, right + 0.5)
# Chart of unique recipients per month.
plot_monthly_uniques()
savechart("monthly-uniques")
def plot_diversity():
    """Plot the monthly Simpson diversity index of SMS recipients.

    For each month's sent messages the index is 1 minus the sum of squared
    per-recipient counts divided by the squared number of messages. The
    first and last months of the grouped series are sliced off.
    """
    def simpson(x):
        squared_counts = sum(pow(y, 2) for y in x.value_counts()) * 1.0
        squared_total = pow(len(x), 2)
        return 1 - squared_counts / squared_total

    outgoing = msgs[msgs.direction == "sent"]
    monthly_diversity = outgoing.groupby("month")["contact_name"].agg(simpson)[1:-1]

    close()
    axes = monthly_diversity.plot(marker=".", markersize=10, lw=2)
    axes.set_title("Simpson Diversity Index of SMS Recipients by Month\n")
    axes.set_xlabel("")
    axes.set_ylabel("Diversity (Higher = Greater Diversity of Recipients)\n")
    axes.set_ylim(0, 1)
    # Pad the x-axis by half a unit on both sides.
    left = axes.get_xlim()[0]
    right = axes.get_xlim()[1]
    axes.set_xlim(left - 0.5, right + 0.5)
    # Ticks at 0.0, 0.1, ..., 1.0.
    axes.set_yticks([ float(tenth) / 10 for tenth in range(0, 11) ])
# Chart of recipient diversity per month.
plot_diversity()
savechart("diversity")
def scatterplot_contacts(metric="160s", size=50, thing="Total Messages"):
    """Scatter sent-vs-received totals of ``metric``, one point per contact.

    metric -- metric column summed per contact and direction.
    size   -- marker size(s) forwarded to matplotlib's scatter.
    thing  -- human-readable name of the metric, used in title and labels.
    """
    totals = people_directions[metric].sum()
    by_direction = totals.unstack("direction").fillna(0)
    max_val = totals.max()
    step = get_step(max_val)
    max_val_bleed = max_val * 1.05

    close()
    points = mpl.pyplot.scatter(by_direction.sent, by_direction.received,
                                s=size, c=ORANGE, alpha=0.9, zorder=2)
    # Diagonal reference line drawn beneath the points.
    diagonal = [-max_val_bleed, max_val_bleed]
    mpl.pyplot.plot(diagonal, [-max_val_bleed, max_val_bleed], c=LIGHT_GRAY, zorder=1)
    points.figure.set_size_inches(8, 8)

    axes = points.axes
    axes.set_title(thing + " Sent/Received per Contact\n")
    axes.set_xlabel("\n" + thing + " Sent to Contact")
    axes.set_ylabel(thing + " Received from Contact\n")
    lower_bound = -max_val_bleed / 20
    axes.set_xlim(lower_bound, max_val_bleed)
    axes.set_ylim(lower_bound, max_val_bleed)

    # Identical ticks and labels on both axes.
    ticks = np.arange(0, max_val_bleed, step)
    labels = [ humancount(tick) for tick in ticks ]
    axes.set_xticks(ticks)
    axes.set_yticks(ticks)
    axes.set_xticklabels(labels)
    axes.set_yticklabels(labels)
def save_sent_vs_received_scatterplots():
    """Save the per-contact sent-vs-received scatter charts for both metrics.

    Marker sizes are derived from each contact's average characters per
    message, squared and divided by 30 to exaggerate the differences.
    """
    chars_per_contact = people["chars"].sum()
    msgs_per_contact = people["160s"].sum()
    avg_msg_len = pow(chars_per_contact / msgs_per_contact, 2) / 30

    charts = (
        ("160s", "Total Messages", "personal/sent-vs-received-msgs"),
        ("chars", "Total Characters", "personal/sent-vs-received-chars"),
    )
    for metric, thing, chart_name in charts:
        scatterplot_contacts(metric, size=avg_msg_len, thing=thing)
        savechart(chart_name)
    close()
# Generate both sent-vs-received scatter charts.
save_sent_vs_received_scatterplots()
def plot_message_hist (df, title, ything):
    """Plot a histogram of total messages ("160s") exchanged per contact.

    df     -- message table to analyse; needs "contact_name" and "160s" columns.
    title  -- chart title.
    ything -- noun used in the y-axis label ("# Of <ything>").

    Counts above the 95th percentile (rounded down to a multiple of ten)
    are folded into the last bin, which is labelled "<n>+". A secondary
    y-axis expresses the bar heights as a percentage of the contacts in ``df``.
    """
    counts = df.groupby("contact_name")["160s"].sum()
    # Count the people in the table that was passed in, not in the global one.
    n_people = len(df["contact_name"].unique())
    # Floor to a multiple of ten (// behaves the same on Python 2 and 3);
    # never go below ten so that the bin width stays a positive integer.
    pct95 = max((int(np.percentile(counts, 95)) // 10) * 10, 10)
    trimmed = counts.apply(lambda x: min([x, pct95]))
    step = get_step(pct95)
    bins = list(range(0, pct95 + step, step))

    # Create and label histogram
    close()
    plot = trimmed.hist(bins=bins)
    plot.set_title(title + "\n")
    plot.set_xlabel("# Of Messages")
    plot.set_ylabel("# Of %s" % (ything))

    # Set x-axis and labels
    plot.set_xticks(bins)
    plot.set_xticklabels(bins[:-1] + ["%d+" % (bins[-1])])

    # Set secondary y-axis and labels
    yticks = plot.get_yticks()
    y2 = plot.twinx()
    y2.grid(False)
    y2.set_yticks(yticks)
    y2.set_yticklabels([ "%.1f%%" % (100.0 * y / n_people) for y in yticks ])
    y2.set_ylabel("Percentage of All Contacts", rotation=-90)
# Histogram of messages exchanged per person, over all messages.
plot_message_hist(msgs, "Distribution of Total Messages Exchanged per Person", "People")
savechart("messages-per-person")
Caveat: It seems that my phone allowed messages longer than 160 characters when they were sent to, or received from, another Verizon customer.
def plot_message_lengths(direction=False):
    """Plot a histogram of message lengths in characters.

    direction -- "sent" or "received" to restrict the chart to that
                 direction; falsy (the default) analyses all messages.

    Messages longer than 160 characters are broken into chunks of at most
    160 characters, and every chunk is counted as its own message.
    """
    to_analyze = msgs[msgs["direction"] == direction] if direction else msgs

    def chunk_into_160s(n_chars):
        # Break up messages longer than 160 characters into chunks of 160 characters or less
        n_chars = int(n_chars)  # plain int so that list repetition below is well defined
        if n_chars <= 160:
            return [ n_chars ]
        full_chunks, remainder = divmod(n_chars, 160)
        chunks = [ 160 ] * full_chunks
        # Only keep a trailing chunk when something is left over; an exact
        # multiple of 160 must not contribute a phantom 0-character message.
        if remainder:
            chunks.append(remainder)
        return chunks

    # Flatten in a single pass rather than summing (re-copying) lists.
    adjusted_chars = pd.Series([
        chunk
        for n_chars in to_analyze["chars"]
        for chunk in chunk_into_160s(n_chars) ])

    # Define histogram bins and calculate max bar height
    bins = list(range(0, 170, 10))
    counts, divisions = np.histogram(adjusted_chars, bins=bins)
    total = len(adjusted_chars)
    max_val = counts.max() * 1.0 / total

    # Create plot and set axes
    close()
    hist = adjusted_chars.hist(bins=bins)
    hist.set_xticks(bins)
    # Ticks are placed at whole percentages of the total number of chunks.
    yticks = np.arange(0, max_val + 0.01, 0.01)
    hist.set_yticks(yticks * total)
    hist.set_yticklabels([ "%d%%" % (y * 100) for y in yticks ])

    # Set labels
    dir_string = str(direction or "all").capitalize()
    hist.set_title(u"Message Length by Number of Characters • %s Messages\n" % dir_string)
    hist.set_xlabel("\nMessage Length (# of Characters)")
    hist.set_ylabel("Percentage of %s Messages\n" % dir_string)
# Message-length histograms: all messages, then sent only, then received only.
plot_message_lengths()
savechart("message-lengths-all")
plot_message_lengths("sent")
savechart("message-lengths-sent")
plot_message_lengths("received")
savechart("message-lengths-received")
def plot_words(word_dict, exp=2):
    """Scatter how often each pattern occurs in sent vs. received messages.

    word_dict -- mapping of display label -> compiled regular expression.
    exp       -- root applied to the frequencies before plotting (2 = square
                 root); tick labels are converted back to occurrences per
                 10,000 characters.

    Each point is annotated with its label; labels of patterns sent more
    often are stacked along the right edge, the others along the top edge.
    """
    def scaler(x, rev=False):
        # Forward: x ** (1/exp). Reverse: x ** exp.
        return pow(x, pow(1.0/exp, -1 if rev else 1))

    def count_matches(body):
        # Number of matches of every pattern within one concatenated body.
        return words.apply(lambda regex: len(regex.findall(body)))

    words = pd.Series(word_dict)
    bodies_by_direction = msgs.groupby("direction")["body"]
    bodies = bodies_by_direction.agg(lambda x: " /// ".join(x))
    chars_received, chars_sent = msgs.groupby("direction")["body"].agg(sum).apply(len)
    count_received, count_sent = [ count_matches(body) for body in bodies ]
    freq_received = scaler(1.0 * count_received / chars_received)
    freq_sent = scaler(1.0 * count_sent / chars_sent)
    # 1 where the pattern is relatively more frequent in sent messages, else 0.
    sent_more = (freq_sent > freq_received) * 1
    word_df = pd.DataFrame({
        "regex": words,
        "freq_sent": freq_sent,
        "freq_received": freq_received,
        "sent_more": sent_more,
        "rank_received": (1 * (sent_more * (freq_received + 1))).rank(method="first"),
        "rank_sent": (1 * ((sent_more^1) * (freq_sent + 1))).rank(method="first")
    }).reset_index()

    close()
    points = mpl.pyplot.scatter(
        word_df["freq_sent"],
        word_df["freq_received"],
        color=ORANGE,
        s=50)
    plot = points.axes
    plot.figure.set_size_inches(10, 10)
    lim = max(max(freq_received), max(freq_sent))
    plot.set_xlim(-lim*0.01, lim*1.05)
    plot.set_ylim(-lim*0.01, lim*1.05)
    # Diagonal reference line.
    mpl.pyplot.plot([ -1, 1 ], [ -1, 1 ], color=rgba(BLACK, 0.5))

    top = lim
    n = len(sent_more)
    nright = sum(sent_more)
    ntop = n - nright

    def budge(row):
        # Label position: spread along the right edge (sent more) or the top edge.
        if row["sent_more"]:
            x = top
            y = (row["rank_received"] - (ntop + 1)) * top / nright
        else:
            x = (row["rank_sent"] - (nright + 1)) * top / ntop
            y = top
        return (x, y)

    def annotate(row):
        bbox_props = dict(boxstyle="round,pad=0.3", fc=WHITE, ec=rgba(BLACK, 0.25))
        plot.annotate(row["index"],
            xy=(row["freq_sent"], row["freq_received"]),
            xytext=budge(row),
            arrowprops=(dict(arrowstyle="-|>", color=rgba(BLACK, 0.5))),
            bbox=bbox_props)

    for label in word_df.index:
        annotate(word_df.ix[label])

    # Ticks at scaled frequencies of 1e-2 ... 1e-5, labelled per 10,000 characters.
    ticks = [ 0 ] + [ scaler(1 * pow(10, -x)) for x in range(2, 6) ]
    per_10k = [ scaler(x, rev=True) * 10000 for x in ticks ]
    labels = [ int(round(x)) if round(x, 3)%1 == 0 else x for x in per_10k ]
    plot.set_xticks(ticks)
    plot.set_xticklabels(labels)
    plot.set_yticks(ticks)
    plot.set_yticklabels(labels)
    plot.set_title("Frequency of Specific Words/Characters\n")
    plot.set_xlabel("\n# Sent per 10,000 Characters")
    plot.set_ylabel("# Received per 10,000 Characters")
# Word/character frequency chart. Keys are the labels shown on the chart,
# values are the patterns counted in the message bodies. Patterns are raw
# strings so that regex escapes are not read as string escapes.
plot_words({
    "!": re.compile(r"\!"),
    "?": re.compile(r"\?"),
    "; (semicolon)": re.compile("; "),
    ", (comma)": re.compile(", "),
    # ". (period)": re.compile("\."),
    "oops": re.compile("oops", re.IGNORECASE),
    "yikes": re.compile("yikes", re.IGNORECASE),
    "no prob*": re.compile("no prob*", re.IGNORECASE),
    ":)": re.compile(r":.?\)"),
    ":(": re.compile(r":.?\("),
    "I": re.compile(" I(?![A-Za-z])"),
    "i": re.compile(" i(?![A-Za-z])"),
    "no/nope": re.compile("(no|nope)(?![a-z])", re.IGNORECASE),
    # The lookahead needs a character class: "(?!a-z])" only rejected the
    # literal text "a-z]", so "yes" also matched "yesterday".
    "yes": re.compile("yes(?![a-z])", re.IGNORECASE),
    "u": re.compile(" u(?![A-Za-z])"),
    # Same lookahead fix as for "yes": "you" no longer matches "your".
    "you": re.compile("you(?![a-z])", re.IGNORECASE),
    "(s)he": re.compile(" s?he ", re.IGNORECASE),
    # "my": re.compile(" my ", re.IGNORECASE),
    "thank*": re.compile("thanks?(?![a-z])", re.IGNORECASE),
    "please": re.compile("please", re.IGNORECASE),
    "sorry": re.compile("sorry", re.IGNORECASE),
    "congrat*": re.compile("congrat", re.IGNORECASE)
}, exp=2)
savechart("word-frequency")