Notebook

Setup!¶

In [1]:

%pylab inline
%reset

Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.zmq.pylab.backend_inline].
For more information, type 'help(pylab)'.

In [2]:

import datetime
import numpy as np
import pandas as pd
import matplotlib as mpl
import sys, os, re

In [3]:

# Directory helpers
# BASE_DIR is the working directory at the moment this cell runs;
# CHART_DIR is the "charts" folder directly beneath it.
BASE_DIR = os.getcwd()
CHART_DIR = os.path.join(BASE_DIR, "charts")

In [4]:

# Some me-specific constants
# SMS_FILE is joined onto BASE_DIR when the CSV is read; MY_NAME is used in chart titles.
SMS_FILE = "../backups/sms-20130317190417.csv"
MY_NAME = "Jeremy"

In [5]:

# Ultimately, let's consider 3am, instead of midnight, to be the end of the day.
# The scatter and hourly charts shift their time axes by this many hours.
HOUR_SHIFT = 3

In [6]:

# Colors via http://colorbrewer2.com/
# Neutrals
WHITE = "#FFFFFF"
LIGHT_GRAY = "#E5E5E5"
GRAY = "#777777"
BLACK = "#000000"

# Accent colors
ORANGE = "#FF6600"
RED = "#D7191C"
BLUE = "#2C7BB6"
YELLOW = "#FFFFBF"
LIGHT_BLUE = "#ABD9E9"
# The order matters: this list is also handed to matplotlib as the axes color cycle.
COLORS = [ ORANGE, RED, BLUE, YELLOW, LIGHT_BLUE ]

# Opacity presets
HIGH_ALPHA, MEDIUM_ALPHA, LOW_ALPHA = 0.9, 0.5, 0.1

# Shorthand for turning a color plus an alpha value into an RGBA tuple
rgba = mpl.colors.colorConverter.to_rgba

In [7]:

def set_styles(style_dict):
    """Apply matplotlib rc settings from a nested dictionary.

    Each top-level key names an rc group (for example "axes") and maps to a
    dictionary of settings, which is forwarded to mpl.rc as keyword arguments.
    """
    for group, settings in style_dict.items():
        mpl.rc(group, **settings)

# Styles mostly derived from https://github.com/tonysyu/mpltools/blob/master/mpltools/style/ggplot.rc
# Notebook-wide chart theme: light-gray axes with a white grid drawn below the data,
# gray tick and axis labels, and COLORS as the line color cycle.
set_styles({
    "figure": { "figsize": [ 12, 8 ], "facecolor": WHITE },
    "savefig": { "dpi": 100, "bbox": "tight" },
    "patch": { "linewidth": 0.5, "facecolor": ORANGE, "edgecolor": WHITE, "antialiased": True },
    "font": { "size": 12 },
    "legend": { "fontsize": 10 },
    "axes": { 
        "facecolor": LIGHT_GRAY, 
        "edgecolor": WHITE, 
        "linewidth": 1, 
        "grid": True, 
        "titlesize": "large", 
        "labelsize": "large", 
        "labelcolor": GRAY,
        "axisbelow": True,
        "color_cycle": COLORS
    },
    "xtick": { "color": GRAY, "direction": "out" },
    "ytick": { "color": GRAY, "direction": "out" },
    "grid": { "color": WHITE, "linestyle": "-" }
})

In [8]:

def savechart(filename):
    """Save currently open chart as a PNG to the chart directory.

    filename -- path relative to CHART_DIR, without the ".png" extension.
                It may include a sub-directory, e.g. "personal/some-name".
    """
    path = os.path.join(CHART_DIR, filename + ".png")
    # savefig does not create missing folders, so make sure the target
    # directory (CHART_DIR or one of its sub-directories) exists first.
    folder = os.path.dirname(path)
    if not os.path.isdir(folder):
        os.makedirs(folder)
    mpl.pyplot.savefig(path, bbox_inches='tight')

# Shorthand: the plotting functions call this to close the current figure before drawing a new one.
close = mpl.pyplot.close

def humancount(x):
    """Abbreviate a large number into something more readable.

    Examples: 500 -> "500", 1500 -> "1.5k", 2000 -> "2k", 1250000 -> "1.25m".
    Values of a billion or more stay in millions (e.g. "3000m") because "m"
    is the largest suffix available. Negative values are abbreviated by
    magnitude and prefixed with "-".
    """
    if x == 0: return "0"
    if x < 0: return "-" + humancount(-x)
    abbvs = [ "", "k", "m" ]
    magnitude = int(np.log10(x))
    # Floor division keeps this an integer (usable as a list index) under both
    # Python 2 and true division; the clamp keeps it within the known suffixes,
    # including for values below 1 whose magnitude is negative.
    thousands = min(max(magnitude // 3, 0), len(abbvs) - 1)
    divisor = pow(1000, thousands)
    divided = float(x) / divisor
    # Show decimals only when the abbreviated value is not a whole number
    template = ("%." + str(thousands) + "f") if divided % 1 > 0 else "%d"
    return (template % divided) + abbvs[thousands]

def get_step(max_val):
    """Given a max value, return a good step size for y-axis tick marks.

    The step is a power of ten one order of magnitude below max_val, or five
    times that when the smaller step would need 25 or more ticks. The result
    is always a whole number of at least 1, so it can be passed to range().
    Non-positive max values, which have no logarithm, fall back to a step of 1.
    """
    if max_val <= 0:
        return 1
    magnitude = int(np.log10(max_val))
    # Never go below 10**0: a fractional step would break range()-based callers
    step = pow(10, max(magnitude - 1, 0))
    return step if max_val / step < 25 else step * 5

def pad_time_axis(axis):
    """Widen the axis' view interval by half a unit on each side."""
    lower, upper = axis.get_view_interval()
    axis.set_view_interval(lower - 0.5, upper + 0.5)

The Data!¶

Read in CSV of all messages¶

In [9]:

# Only these columns of the CSV are kept for the analysis.
COLS = ["contact_name", "date", "body", "type" ]
# SMS_FILE is resolved relative to BASE_DIR.
msgs = pd.read_csv(os.path.join(BASE_DIR, SMS_FILE))[COLS]

Filter out messages to/from people not in my contacts¶

In [10]:

# Drop rows whose contact name is the literal placeholder "(Unknown)".
msgs = msgs[msgs["contact_name"] != "(Unknown)"]

Filter out unsent messages¶

"type" values:

1: received
2: sent
3: draft
4: ???
5: failed/error

In [11]:

# Keep only types 1 (received) and 2 (sent); drafts and failed messages are dropped.
msgs = msgs[msgs["type"] < 3]

Add Pythonic datetime, and restrict data to one year from start¶

In [12]:

# NOTE(review): "date" is presumably epoch milliseconds; dividing by 1000 yields the
# seconds that fromtimestamp() expects (it returns local time) -- confirm against the export.
msgs["datetime"] = msgs["date"].apply(lambda x: datetime.datetime.fromtimestamp(x / 1000))
 
# The window starts at the first row's timestamp and lasts 365 days.
# NOTE(review): assumes the rows are in chronological order -- confirm.
FIRST_TIMESTAMP = msgs["datetime"].iget(0)
LAST_TIMESTAMP = FIRST_TIMESTAMP + datetime.timedelta(days=365)
msgs = msgs[msgs["datetime"] < LAST_TIMESTAMP]

Add columns for different measures of time¶

In [13]:

# Each column below is derived from the "datetime" column.

# Hour of day -- 12am as 0, 1am as 1, ..., 1pm as 13, ..., 11pm as 23
msgs["hour"] = msgs["datetime"].apply(lambda x: x.hour)

# Day of week -- with Monday as 0, Tuesday as 1...
msgs["weekday"] = msgs["datetime"].apply(lambda x: x.weekday())

# Most recent Monday (the message's own date when it falls on a Monday)
msgs["week"] = pd.DatetimeIndex(msgs["datetime"].apply(lambda x: x.date() - datetime.timedelta(days=x.weekday())))

# First day of month
msgs["month"] = pd.DatetimeIndex(msgs["datetime"].apply(lambda x: pd.Period(x, freq="M").to_timestamp()))

Calculate some basic things about the range of dates included¶

In [14]:

# Period ranges covering the one-year window at daily, weekly (W-MON) and monthly frequency.
# Further down, MONTH_RANGE supplies x-axis ticks and DATE_RANGE the per-weekday day counts.
DATE_RANGE, WEEK_RANGE, MONTH_RANGE = (pd.period_range(FIRST_TIMESTAMP, LAST_TIMESTAMP, freq=x) for x in ("D", "W-MON", "M"))

Add columns for sent/received, character count, and message multiplier¶

Adding the message multiplier ("160s") because my phone does not separate out longer messages to/from other Verizon accounts as separate 160-character messages. In some analyses, this will matter; we need a common metric for all messages, regardless of carrier.

In [15]:

# Type 1 is a received message; every other remaining type is treated as sent.
msgs["direction"] = msgs["type"].apply(lambda x: "received" if x == 1 else "sent")
msgs["chars"] = msgs["body"].apply(len).astype(int)
# Number of 160-character messages needed to carry the body. Ceiling division, so a
# body of exactly 160 characters counts as one message rather than two; an empty body
# still counts as one message.
msgs["160s"] = msgs["chars"].apply(lambda x: max(1, (x + 159) // 160))

# Display names for the two metric columns, used in chart titles and labels.
THINGS = { "chars": "Characters", "160s": "Messages" }

Group messages by person and direction¶

In [16]:

# Per-direction subsets of msgs
sent, received = (msgs[msgs["direction"] == x] for x in ("sent", "received"))
# Groupings reused by later cells: per contact, and per contact and direction
people = msgs.groupby("contact_name")
people_directions = msgs.groupby([ "contact_name", "direction" ])

Basic Stats!¶

In [17]:

# Metric column -> noun used in the printed sentences
METRICS = { "160s": "messages", "chars": "characters" }
# Label -> subset of messages to total
GROUPINGS = { "all": msgs, "sent": sent, "received": received }
# TOTALS[metric][grouping] is the column sum, e.g. TOTALS["chars"]["sent"]
TOTALS = dict((m, dict((key, GROUPINGS[key][m].sum()) for key in GROUPINGS)) for m in METRICS)

def print_basic_stats(per_page=2000):
    """Print headline totals, per-day averages and record days for the window.

    per_page -- rough number of characters on a printed page, used to express
                the characters sent as a page count.
    """
    n_days = (LAST_TIMESTAMP - FIRST_TIMESTAMP).days
    date_format = "%B %d, %Y"

    def per_day(total):
        return 1.0 * total / n_days

    print("Between %s and %s ...\n" % (FIRST_TIMESTAMP.strftime(date_format), LAST_TIMESTAMP.strftime(date_format)))
    print("I exchanged text messages with %d known contacts.\n" % len(msgs["contact_name"].unique()))

    # One paragraph per metric (messages, characters)
    paragraphs = []
    for m in METRICS:
        totals = TOTALS[m]
        paragraphs.append("\n".join([
            "I sent %d *%s*, or %0.1f per day." % (totals["sent"], METRICS[m], per_day(totals["sent"])),
            "I received %d, or %0.1f per day." % (totals["received"], per_day(totals["received"])),
            "That's %d in total, or %0.1f per day." % (totals["all"], per_day(totals["all"])),
            "For every 100 %s I sent, I received about %d." % (METRICS[m], round(100.0 * totals["received"] / totals["sent"], 0)),
        ]))
    print("\n\n".join(paragraphs) + "\n")

    print("Given ~%d characters per printed page, I wrote ~%d pages' worth of text messages during this time.\n" %
        (per_page, TOTALS["chars"]["sent"]/per_page))

    chars_per_msg = tuple(round(TOTALS["chars"][x] * 1.0 / TOTALS["160s"][x], 0) for x in ("sent", "received"))
    print("I sent %d chars/msg. I received %d chars/msg.\n" % chars_per_msg)

    # Sent totals per calendar date, to find the busiest days
    sent_by_date = pd.DataFrame({ "date": sent["datetime"].apply(lambda x: x.date()), "160s": sent["160s"], "chars": sent["chars"] }).groupby("date")

    def busiest_day(column):
        # (date, total) of the date with the largest total in the given column
        return sent_by_date[column].sum().order(ascending=False).reset_index().values[0]

    top_msgs = busiest_day("160s")
    top_chars = busiest_day("chars")

    print("The most messages I ever sent in a day was %d, on %s.\n" % (top_msgs[1], top_msgs[0].strftime(date_format)))
    print("The most total characters I ever sent was %d, on %s.\n" % (top_chars[1], top_chars[0].strftime(date_format)))

# Print the summary using the default of 2000 characters per page.
print_basic_stats()

Between March 13, 2012 and March 13, 2013 ...

I exchanged text messages with 95 known contacts.

I sent 1514 *messages*, or 4.1 per day.
I received 1779, or 4.9 per day.
That's 3293 in total, or 9.0 per day.
For every 100 messages I sent, I received about 118.

I sent 104406 *characters*, or 286.0 per day.
I received 92610, or 253.7 per day.
That's 197016 in total, or 539.8 per day.
For every 100 characters I sent, I received about 89.

Given ~2000 characters per printed page, I wrote ~52 pages' worth of text messages during this time.

I sent 69 chars/msg. I received 52 chars/msg.

The most messages I ever sent in a day was 48, on March 10, 2013.

The most total characters I ever sent was 3539, on March 10, 2013.

Charts!¶

Plot every message on a single chart¶

In [18]:

def scatter_all(contact_name=False, color_direction=False):
    """Scatter-plot every message: x is the day, y is the time of day.

    contact_name    -- if given, only this contact's messages are highlighted;
                       all others are drawn in faint gray (or fully
                       transparent when color_direction is set).
    color_direction -- if true, highlighted messages are blue when sent and
                       red when received, instead of orange.

    Days are treated as ending at HOUR_SHIFT o'clock instead of midnight.
    """
    # Alter color scheme if highlighting a particular contact
    def get_color(name, direction):
        is_focus = not contact_name or contact_name == name
        if color_direction:
            if is_focus: return rgba((BLUE if direction == "sent" else RED), MEDIUM_ALPHA)
            else: return rgba(WHITE, 0)
        else:
            if is_focus: return rgba(ORANGE, MEDIUM_ALPHA)
            else: return rgba(GRAY, LOW_ALPHA)

    colors = [  get_color(name, direction)
                for name, direction in msgs[["contact_name", "direction"]].values ]

    # Measure vertical axis in seconds since the day "began"
    def to_seconds(x):
        shifted = x - datetime.timedelta(hours=HOUR_SHIFT)
        morn = datetime.datetime.combine(shifted, datetime.time(0))
        return (shifted - morn).seconds

    days = msgs["datetime"].apply(lambda x: (x - datetime.timedelta(hours=HOUR_SHIFT)).date())
    seconds = msgs["datetime"].apply(to_seconds)

    # Create and label plot
    close()
    # No `norm` argument: the colors are explicit RGBA tuples, so there is
    # nothing to normalize, and `norm` must be a Normalize instance when given.
    plot = mpl.pyplot.scatter(
        days,
        seconds,
        c = colors,
        marker = "x",
        linewidth = 1.25 if contact_name else 1,
        s = 20
    ).axes

    t1 = "Every Message" if not contact_name else "Every Message To/From %s" % contact_name
    t2 = "From %s in Blue, To %s in Red" % (MY_NAME, MY_NAME) if color_direction else None
    plot.set_title(u" • ".join(x for x in [ t1, t2 ] if x) + "\n")

    # Set x-axis details
    plot.set_xticks(MONTH_RANGE.to_datetime())
    plot.set_xticklabels(list(x.strftime("%b\n%Y") for x in MONTH_RANGE.to_datetime()))

    # Set y-axis details: one tick per hour, labeled with the real clock hour
    TOTAL_SECONDS = 60 * 60 * 24
    SECONDS_TO_SHOW = range(0, TOTAL_SECONDS + 1, 60 * 60)
    # Floor division keeps the hour an integer regardless of division semantics
    HOURS_TO_SHOW = list(datetime.time(((x // 3600) + HOUR_SHIFT) % 24) for x in SECONDS_TO_SHOW)
    plot.set_ylim(TOTAL_SECONDS * -0.01, TOTAL_SECONDS * 1.01)
    plot.set_yticks(SECONDS_TO_SHOW)
    plot.set_yticklabels(list(x.strftime("%I %p").lstrip("0") for x in HOURS_TO_SHOW))

In [19]:

# Every message, with no contact singled out and a single highlight color
scatter_all()
savechart("all-messages")

In [20]:

# Same chart, colored by whether each message was sent or received
scatter_all(color_direction=True)
savechart("all-messages-directional")

Save personal charts for my most frequent contacts¶

In [21]:

def save_contact_scatters(threshold=20):
    """Save two scatter charts for each contact with more than `threshold` messages.

    For every such contact, one chart highlights them among all messages
    ("-AMONG") and one colors their messages by direction ("-SOLO"). Both are
    saved under "personal/", named after the lower-cased contact name with
    every run of characters other than a-z, 0-9 and "." replaced by "-".
    """
    unsafe_chars = re.compile("[^\.a-z0-9]+")
    frequent = [ contact for contact, contact_msgs in people if len(contact_msgs) > threshold ]
    print("Generating charts for %d people." % len(frequent))
    for contact in frequent:
        slug = unsafe_chars.sub("-", contact.lower())
        scatter_all(contact)
        savechart("personal/%s-AMONG" % slug)
        scatter_all(contact, color_direction=True)
        savechart("personal/%s-SOLO" % slug)
    close()
# Generate the per-contact charts with the default threshold of 20 messages.
save_contact_scatters()

Generating charts for 35 people.

Which months/weeks did I text-message the most?¶

In [22]:

def plot_over_time(col, metric="160s", annotations=()):
    """Plot the number of messages sent + received for each week or month.

    col         -- grouping column of msgs: "week" or "month".
    metric      -- column to total: "160s" (messages) or "chars" (characters).
    annotations -- iterable of (week, text, adjust_x, adjust_y) tuples passed
                   to annotate_week; each week must be present in the plotted
                   index.
    """
    # Note: slicing off first and last (partial) weeks.
    all_counts, sent_counts, received_counts = (selection.groupby(col)[metric].sum()[1:-1]
        for selection in (msgs, sent, received))

    MAX_COUNT = int(max(all_counts))

    # Create and label plot
    close()
    plot = all_counts.plot(lw=2, label="All")
    sent_counts.plot(color=rgba(BLUE, 0.5), label="Sent")
    received_counts.plot(color=rgba(RED, 0.5), label="Received")

    ything = THINGS[metric]
    plot.set_title("%s per %s" % (ything, col.capitalize()) + (" (Begins on Monday)" if col == "week" else "") + "\n")
    plot.set_xlabel("")
    plot.set_ylabel(ything + " Sent + Received\n")
    leg = plot.legend(loc='upper left')
    leg.get_frame().set_alpha(0)

    # Set ticks. The ticks must reach as far as the y-limit (1.1 x the maximum),
    # otherwise the top of the axis is left without tick marks.
    step = get_step(MAX_COUNT)
    yticks = range(0, int(MAX_COUNT * 1.1) + step, step)
    plot.set_yticks(yticks)
    plot.set_yticklabels(list(humancount(y) for y in yticks))
    plot.set_ylim(0, int(MAX_COUNT * 1.1))
    pad_time_axis(plot.xaxis)


    def annotate_week (week, text, adjust_x = 0, adjust_y = 0):
        """Add an annotation to any week.

        adjust_x moves the text by that many weeks; adjust_y moves it by that
        many percent of MAX_COUNT. The arrow points at the week's total.
        """
        timestamp = pd.Timestamp(week)
        plot.annotate(text,
            xy=(timestamp, all_counts[timestamp]),
            xytext=(timestamp + datetime.timedelta(weeks=adjust_x), all_counts[timestamp] + adjust_y * MAX_COUNT / 100),
            horizontalalignment="center",
            arrowprops=dict(arrowstyle="-|>", color="black"))

    for a in annotations: annotate_week(*a)

In [23]:

# Messages per month
plot_over_time("month", metric="160s")
savechart("monthly-messages")

In [24]:

# Characters per month
plot_over_time("month", metric="chars")
savechart("monthly-characters")

In [25]:

# Each tuple is (week start date, label text, x offset in weeks, y offset in % of the tallest point).
annotations = [
    ("2012-10-29", "Hurricane Sandy", -6, 0),
    ("2012-12-24", "Home for\nthe holidays.", 0, 10),
    ("2013-01-28", "Advised\nfriend on a\n relationship.", 0, 10)
]

# Messages per week, with the annotations above
plot_over_time("week", metric="160s", annotations=annotations)
savechart("weekly-messages-labeled")

In [26]:

# Characters per week, reusing the same annotations
plot_over_time("week", metric="chars", annotations=annotations)
savechart("weekly-characters-labeled")

Which days of the week?¶

In [27]:

def plot_weekdaily(metric="160s"):
    """Plot the average number of messages sent + received per day of the week.

    metric -- column to average: "160s" (messages) or "chars" (characters).
    """
    # Calculate the number of each weekday in our date range, in order to calculate the averages.
    weekday_counts = pd.Series(DATE_RANGE.to_datetime()).apply(lambda x: x.weekday()).value_counts()
    all_per_day, sent_per_day, received_per_day = (1.0 * selection.groupby("weekday")[metric].sum() / weekday_counts 
        for selection in (msgs, sent, received))
    
    # Create and label plot
    close()
    plot = all_per_day.plot(lw=2, marker=".", markersize=10, label="All")
    sent_per_day.plot(marker=".", markersize=5, label="Sent", c=rgba(BLUE, 0.5))
    received_per_day.plot(marker=".", markersize=5, label="Received", c=rgba(RED, 0.5))
    plot.margins(0.1, 0.1)
    thing = THINGS[metric]
    plot.set_title("Average # of " + thing + " by Day of Week\n")
    plot.set_xlabel("\nDay of Week")
    plot.set_ylabel("Average # of " + thing + "\n")
    leg = plot.legend(loc='upper left')
    leg.get_frame().set_alpha(0)

    
    # Set x-axis details
    plot.set_xticks(range(0, len(pd.datetools.DAYS)))
    plot.set_xticklabels(pd.Series(pd.datetools.DAYS[x] for x in range(7)))
    
    # Set y-axis details
    MAX_COUNT = all_per_day.max()
    # range() needs a whole-number step of at least 1
    step = max(1, int(get_step(MAX_COUNT)))
    # Leave 10% headroom above the tallest point, matching the tick range below
    plot.set_ylim(0, MAX_COUNT * 1.1)
    plot.set_yticks(range(0, int(MAX_COUNT * 1.1) + 1, step))

In [28]:

# Average messages per weekday
plot_weekdaily()
savechart("weekdaily-messages")

In [29]:

# Average characters per weekday
plot_weekdaily("chars")
savechart("weekdaily-characters")

What times of day?¶

In [30]:

def plot_hourly(metric="160s", secondary_metric=False):
    """Plot what share of the total falls in each hour of the day.

    metric           -- column totaled per hour for the main line.
    secondary_metric -- optional second column, drawn as a faint comparison
                        line; skipped when false.

    The x-axis starts at HOUR_SHIFT o'clock instead of midnight.
    """
    def get_pcts(metric):
        # Fraction of the metric's grand total that falls in each hour
        hourly_counts = msgs.groupby("hour")[metric].sum()
        total = hourly_counts.sum()
        hourly_pcts = 1.0 * hourly_counts / total
        return hourly_pcts

    # Hours in display order (HOUR_SHIFT, ..., 23, 0, ..., HOUR_SHIFT - 1) and
    # the x position of each. The data is plotted against the positions, so the
    # tick labels line up with it however Series.plot treats an unsorted
    # integer index.
    all_hours = list(range(HOUR_SHIFT, 24)) + list(range(0, HOUR_SHIFT))
    positions = list(range(len(all_hours)))

    def fill_hours(data):
        # Fill in 0s for hour(s) with no SMSes
        return pd.Series(list(data[h] if h in data else 0 for h in all_hours), index = positions)

    hourly_pcts = get_pcts(metric)
    all_hourly_pcts = fill_hours(hourly_pcts)

    # Create and label plot
    close()
    plot = all_hourly_pcts.plot(lw=2, marker=".", markersize=10, label=THINGS.get(metric, metric))
    if secondary_metric:
        fill_hours(get_pcts(secondary_metric)).plot(
            c=rgba(BLACK, 0.25), label=THINGS.get(secondary_metric, secondary_metric))

    plot.set_title("Hourly Distribution\n")
    plot.set_xlabel("Hour Beginning at...")
    plot.set_ylabel("Percentage of Total Sent + Received")

    leg = plot.legend(loc='upper left')
    leg.get_frame().set_alpha(0)


    # Set x-axis details: one tick per position, labeled with its clock hour ("3a", "12p", ...)
    plot.set_xlim(positions[0] - 0.5, positions[-1] + 0.5)
    xticks = plot.set_xticks(positions)
    for x in xticks: x.label.set_fontsize(10)
    plot.set_xticklabels(list(datetime.datetime.strptime(str(h), "%H").strftime("%I%p").lstrip("0").lower().rstrip("m")
        for h in all_hours))

    # Set y-axis details
    max_pct = hourly_pcts.max()
    plot.set_ylim(-max_pct * 0.05, max_pct * 1.05)
    yticks = np.arange(0, max_pct * 1.1, 0.01)
    plot.set_yticks(yticks)
    # Round before formatting: "%d" truncates, and e.g. 0.29 * 100 is 28.999...
    plot.set_yticklabels(list("%d%%" % int(round(y * 100)) for y in yticks))

In [31]:

# Hourly share of messages, with characters as the comparison line
plot_hourly(metric="160s", secondary_metric="chars")
savechart("hourly")

How diverse are my contacts each month?¶

In [32]:

def plot_monthly_uniques():
    """Plot how many distinct contacts I sent a message to in each month."""
    outgoing = msgs[msgs.direction == "sent"]
    recipients_by_month = outgoing.groupby("month")["contact_name"]
    # The first and last months are sliced off
    monthly_uniques = recipients_by_month.agg(lambda names: len(names.unique()))[1:-1]

    close()
    monthly_plot = monthly_uniques.plot(marker=".", markersize=10, lw=2)

    monthly_plot.set_title("Unique SMS Recipients per Month\n")
    monthly_plot.set_xlabel("")
    monthly_plot.set_ylabel("Number of Unique SMS Recipients\n")
    monthly_plot.set_ylim(0, monthly_uniques.max() * 1.1)
    # Pad the x-axis by half a unit on each side
    left, right = monthly_plot.get_xlim()
    monthly_plot.set_xlim(left - 0.5, right + 0.5)

In [33]:

# Unique recipients per month
plot_monthly_uniques()
savechart("monthly-uniques")

Just for fun, let's plot the Simpson Index of SMS recipients¶

Takes into account the number of messages sent to each person, and the distribution of those numbers.

In [34]:

def plot_diversity():
    """Plot the Simpson diversity index of my SMS recipients, month by month."""
    def simpson(names):
        # 1 - sum(n_i ** 2) / N ** 2, where n_i is the number of messages sent
        # to contact i and N is the number of messages in the group
        group_size_squared = pow(len(names), 2)
        sum_of_squares = sum(pow(n, 2) for n in names.value_counts()) * 1.0
        return 1 - sum_of_squares / group_size_squared

    outgoing = msgs[msgs.direction == "sent"]
    # The first and last months are sliced off
    monthly_diversity = outgoing.groupby("month")["contact_name"].agg(simpson)[1:-1]

    close()
    monthly_plot = monthly_diversity.plot(marker=".", markersize=10, lw=2)

    monthly_plot.set_title("Simpson Diversity Index of SMS Recipients by Month\n")
    monthly_plot.set_xlabel("")
    monthly_plot.set_ylabel("Diversity (Higher = Greater Diversity of Recipients)\n")
    monthly_plot.set_ylim(0, 1)
    # Pad the x-axis by half a unit on each side
    left, right = monthly_plot.get_xlim()
    monthly_plot.set_xlim(left - 0.5, right + 0.5)
    # Ticks at 0.0, 0.1, ..., 1.0
    monthly_plot.set_yticks([ tenth / 10.0 for tenth in range(0, 11) ])

In [35]:

# Simpson diversity index per month
plot_diversity()
savechart("diversity")

How often do I send and receive messages from specific people?¶

Though the contacts aren't named in the plots, the outliers are potentially re-identifiable, so saving these to the "personal" directory.

In [36]:

def scatterplot_contacts(metric="160s", size=50, thing="Total Messages"):
    """Scatter each contact by the amount sent to them (x) vs. received from them (y).

    metric -- column totaled per contact and direction.
    size   -- marker size(s) handed to scatter.
    thing  -- wording used in the title and axis labels.
    """
    totals = people_directions[metric].sum()
    per_contact = totals.unstack("direction").fillna(0)
    peak = totals.max()
    axis_limit = peak * 1.05
    tick_step = get_step(peak)

    close()
    points = mpl.pyplot.scatter(per_contact.sent, per_contact.received, s=size, c=ORANGE, alpha=0.9, zorder=2)
    # Diagonal along which sent equals received, drawn beneath the points
    mpl.pyplot.plot([-axis_limit, axis_limit], [-axis_limit, axis_limit], c=LIGHT_GRAY, zorder=1)
    points.figure.set_size_inches(8, 8)
    axes = points.axes

    axes.set_title(thing + " Sent/Received per Contact\n")
    axes.set_xlabel("\n" + thing + " Sent to Contact")
    axes.set_ylabel(thing + " Received from Contact\n")

    # Both axes share the same limits, ticks and labels
    lower = -axis_limit / 20
    axes.set_xlim(lower, axis_limit)
    axes.set_ylim(lower, axis_limit)
    ticks = np.arange(0, axis_limit, tick_step)
    labels = [ humancount(tick) for tick in ticks ]
    axes.set_xticks(ticks)
    axes.set_yticks(ticks)
    axes.set_xticklabels(labels)
    axes.set_yticklabels(labels)

In [37]:

def save_sent_vs_received_scatterplots():
    """Save the sent-vs-received scatterplots for messages and for characters.

    Marker sizes come from each contact's average message length, squared and
    divided by 30 to exaggerate the differences between contacts.
    """
    chars_per_msg = people["chars"].sum() / people["160s"].sum()
    marker_sizes = pow(chars_per_msg, 2) / 30

    charts = (
        ("160s", "Total Messages", "personal/sent-vs-received-msgs"),
        ("chars", "Total Characters", "personal/sent-vs-received-chars"),
    )
    for metric, thing, filename in charts:
        scatterplot_contacts(metric, size=marker_sizes, thing=thing)
        savechart(filename)
    close()
# Generate both sent-vs-received scatterplots.
save_sent_vs_received_scatterplots()

Plot message-volume per contact¶

In [38]:

def plot_message_hist (df, title, ything):
    """Histogram of total messages exchanged per contact.

    df     -- messages to count; grouped by "contact_name" and totaled over "160s".
    title  -- chart title.
    ything -- noun for the y-axis label (e.g. "People").

    Totals above the 95th percentile (rounded down to a multiple of 10) are
    clamped to it, so they all land in the last bar, whose tick is labeled "N+".
    """
    counts = df.groupby("contact_name")["160s"].sum()
    # NOTE: the percentage axis is relative to every contact in msgs, not only those in df
    n_people = len(msgs["contact_name"].unique())
    # Never let the cap drop to 0: get_step needs a positive value and the bins a positive width
    pct95 = max(10, (int(np.percentile(counts, 95)) // 10) * 10)
    trimmed = counts.apply(lambda x: min([x, pct95]))
    step = get_step(pct95)
    bins = list(range(0, pct95 + step, step))
    
    # Create and label histogram
    close()
    plot = trimmed.hist(bins=bins)
    plot.set_title(title + "\n")
    plot.set_xlabel("# Of Messages")
    plot.set_ylabel("# Of %s" % (ything))

    # Set x-axis and labels
    plot.set_xticks(bins)
    plot.set_xticklabels(bins[:-1] + ["%d+" % (bins[-1])])
    
    # Set secondary y-axis and labels
    yticks = plot.get_yticks()
    y2 = plot.twinx()
    y2.grid(False)
    y2.set_yticks(yticks)
    # Share the primary axis' limits so each percentage sits level with its count
    y2.set_ylim(plot.get_ylim())
    y2.set_yticklabels(list("%.1f%%" % (100.0 * y / n_people) for y in yticks))
    y2.set_ylabel("Percentage of All Contacts", rotation=-90)

In [39]:

# Distribution of total messages per contact, over all messages
plot_message_hist(msgs, "Distribution of Total Messages Exchanged per Person", "People")
savechart("messages-per-person")

Plot the character count per message.¶

Caveat: Seems that my phone allowed for messages longer than 160 characters if sent/received to/from another Verizon customer.

In [40]:

def plot_message_lengths(direction=False):
    """Histogram of message lengths in characters, in bins of 10 up to 160.

    direction -- "sent" or "received" to restrict the messages; any false
                 value includes all of them.

    Bodies longer than 160 characters are first split into chunks of at most
    160 characters, and each chunk is counted as its own message.
    """
    to_analyze = msgs[msgs["direction"] == direction] if direction else msgs
    
    # Break up messages longer than 160 characters into chunks of 160 characters or less
    def chunk_into_160s(n_chars):
        n_chars = int(n_chars)
        full, rest = divmod(n_chars, 160)
        if full == 0:
            return [ n_chars ]
        # No remainder chunk for exact multiples of 160: it would be a phantom 0-length message
        return [ 160 ] * full + ([ rest ] if rest else [])

    # Flatten in one pass rather than summing lists, which re-copies the result for every message
    adjusted_chars = pd.Series([ chunk for n in to_analyze["chars"] for chunk in chunk_into_160s(n) ])

    # Define histogram bins and calculate max bar height
    bins = range(0, 170, 10)
    counts, divisions = np.histogram(adjusted_chars, bins=bins)
    total = len(adjusted_chars)
    max_val = counts.max() * 1.0 / total

    # Create plot and set axes: ticks every 1% of all chunks
    close()
    hist = adjusted_chars.hist(bins=bins)
    hist.set_xticks(bins)    
    yticks = np.arange(0, max_val + 0.01, 0.01)
    hist.set_yticks(yticks * total)
    # Round before formatting: "%d" truncates, and e.g. 0.29 * 100 is 28.999...
    hist.set_yticklabels(list("%d%%" % int(round(y * 100)) for y in yticks))
    
    # Set labels
    dir_string = str(direction or "all").capitalize()
    hist.set_title(u"Message Length by Number of Characters • %s Messages\n" % dir_string)
    hist.set_xlabel("\nMessage Length (# of Characters)")
    hist.set_ylabel("Percentage of %s Messages\n" % dir_string)

In [41]:

# Length distribution over all messages
plot_message_lengths()
savechart("message-lengths-all")

In [42]:

# Length distribution over sent messages only
plot_message_lengths("sent")
savechart("message-lengths-sent")

In [43]:

# Length distribution over received messages only
plot_message_lengths("received")
savechart("message-lengths-received")

In [44]:

def plot_words(word_dict, exp=2):
    """Scatter-plot how often each pattern occurs in sent vs. received text.

    word_dict -- maps a display label to a compiled regex; each regex is
                 counted with findall over the combined bodies per direction.
    exp       -- root applied to the frequencies before plotting
                 (exp=2 plots square roots).
    """
    def scaler(x, rev=False):
        # Forward: x ** (1/exp). With rev=True: x ** exp, which undoes the forward scaling.
        return pow(x, pow(1.0/exp, -1 if rev else 1))
    
    words = pd.Series(word_dict)
    # One long string per direction, bodies separated by " /// "
    bodies = msgs.groupby("direction")["body"].agg(lambda x: " /// ".join(x))
    # Total characters per direction (bodies concatenated without a separator).
    # NOTE(review): this unpacking and the two below presumably rely on the groupby
    # yielding "received" before "sent" (sorted keys) -- confirm.
    chars_received, chars_sent = msgs.groupby("direction")["body"].agg(sum).apply(len)
    # Number of matches of every regex within each direction's text
    count_received, count_sent = (words.apply(lambda regex: len(regex.findall(body))) for body in bodies)
    # Matches per character, passed through the scaler
    freq_received, freq_sent = (scaler(x) for x in (1.0 * count_received / chars_received, 1.0 * count_sent / chars_sent))
    # 1 where the pattern is relatively more frequent in sent text, else 0
    sent_more = (freq_sent > freq_received) * 1
    
    # The rank columns are only used by budge() to spread the labels out. Rows on
    # the other side of the diagonal get a product of 0 and therefore the lowest ranks.
    word_df = pd.DataFrame({ 
        "regex": words, 
        "freq_sent": freq_sent, 
        "freq_received": freq_received,
        "sent_more": sent_more,
        "rank_received": (1 * (sent_more * (freq_received + 1))).rank(method="first"),
        "rank_sent": (1 * ((sent_more^1) * (freq_sent + 1))).rank(method="first")
    }).reset_index()
    close()
    plot = mpl.pyplot.scatter(
        word_df["freq_sent"], 
        word_df["freq_received"], 
        color=ORANGE,
        s=50).axes
    plot.figure.set_size_inches(10, 10)
    
    # Same limits on both axes, based on the largest scaled frequency
    lim = max(max(freq_received), max(freq_sent))
    plot.set_xlim(-lim*0.01, lim*1.05)
    plot.set_ylim(-lim*0.01, lim*1.05)
    
    # Diagonal along which the sent and received frequencies are equal
    mpl.pyplot.plot([ -1, 1 ], [ -1, 1 ], color=rgba(BLACK, 0.5))
    
    top = lim
    n = len(sent_more)
    nright = sum(sent_more)  # labels placed along the right edge
    ntop = n - nright        # labels placed along the top edge
    def budge(row):
        # Label position: patterns sent more often are stacked along the right edge
        # (x = top), the others are spread along the top edge (y = top), spaced by rank.
        x = top if row["sent_more"] else (row["rank_sent"] - (nright + 1)) * top / ntop
        y = (row["rank_received"] - (ntop + 1)) * top / nright if row["sent_more"] else top
        return (x, y)
    
    def annotate(row):
        # Boxed label at the budge() position, with an arrow to the data point
        bbox_props = dict(boxstyle="round,pad=0.3", fc=WHITE, ec=rgba(BLACK, 0.25))
        plot.annotate(row["index"], 
            xy=(row["freq_sent"], row["freq_received"]),
            xytext=budge(row),
            arrowprops=(dict(arrowstyle="-|>", color=rgba(BLACK, 0.5))),
            bbox=bbox_props)

    for x in word_df.index:
        row = word_df.ix[x]
        annotate(row)
        
    # Ticks at the scaled positions of 1e-2 ... 1e-5 matches per character (plus 0).
    # Labels undo the scaling and express the value per 10,000 characters,
    # shown as an int when it is a whole number.
    ticks = [ 0 ] + list(scaler(1 * pow(10, -x)) for x in range(2, 6))
    labels = list(int(round(x)) if round(x, 3)%1 == 0 else x 
        for x in list(scaler(x, rev=True) * 10000 for x in ticks))
    plot.set_xticks(ticks)
    plot.set_xticklabels(labels)
    plot.set_yticks(ticks)
    plot.set_yticklabels(labels)
    
    plot.set_title("Frequency of Specific Words/Characters\n")
    plot.set_xlabel("\n# Sent per 10,000 Characters")
    plot.set_ylabel("# Received per 10,000 Characters")  
    

In [45]:

# Each key is the label shown on the chart; each value is the pattern counted for it.
# A trailing (?![a-z]) stops a word from matching at the start of a longer one,
# e.g. "yes" inside "yesterday" or "you" inside "your".
plot_words({ 
    "!": re.compile("\!"),
    "?": re.compile("\?"),
    "; (semicolon)": re.compile("; "),
    ", (comma)": re.compile(", "),
  #  ". (period)": re.compile("\."),
    "oops": re.compile("oops", re.IGNORECASE),
    "yikes": re.compile("yikes", re.IGNORECASE),
    "no prob*": re.compile("no prob*", re.IGNORECASE),
    ":)": re.compile(":.?\)"),
    ":(": re.compile(":.?\("),
    "I": re.compile(" I(?![A-Za-z])"),
    "i": re.compile(" i(?![A-Za-z])"),
    "no/nope": re.compile("(no|nope)(?![a-z])", re.IGNORECASE),
    "yes": re.compile("yes(?![a-z])", re.IGNORECASE),    
    "u": re.compile(" u(?![A-Za-z])"),
    "you": re.compile("you(?![a-z])", re.IGNORECASE),
    "(s)he": re.compile(" s?he ", re.IGNORECASE),
 #   "my": re.compile(" my ", re.IGNORECASE),
    "thank*": re.compile("thanks?(?![a-z])", re.IGNORECASE),
    "please": re.compile("please", re.IGNORECASE),
    "sorry": re.compile("sorry", re.IGNORECASE),
    "congrat*": re.compile("congrat", re.IGNORECASE)
}, exp=2)
savechart("word-frequency")