Notebook <style type="text/css"> .input_prompt, .input_area, .output_prompt { display:none !important; } .reveal h1, .reveal h2 { font-family:times } </style>

China300.1x1T2016 metric report¶

In [1]:

%matplotlib inline
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict

import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls

py.init_notebook_mode() # graphs charts inline (IPython).

In [2]:

# Identifier of the course this report is generated for; change as needed.
# It is substituted into the table references of the BigQuery queries in this notebook.
course_id = 'UBCx__China300_1x__1T2016'
# Courses with graded problems other than multiple choice; keep this list up to date.
not_mc_list = [
    'UBCx__CW1_1x__1T2016',
#     'UBCx__CW1_2x__1T2016',
    'UBCx__Phot1x__1T2016',
    'UBCx__ITSx__2T2015',
    'UBCx__SPD1x__2T2015'
]
# Professional education courses; keep this list up to date.
# plot_pls leaves the "Verified" count out of its query for courses listed here.
pe_list = [
    'UBCx__CW1_1x__1T2016',
#     'UBCx__CW1_2x__1T2016',
    'UBCx__Phot1x__1T2016',
    'UBCx__ITSx__2T2015'
]

In [3]:

def query_moduleActivity(course_id=course_id):
    """
    Build the per-chapter activity table for the given course_id.

    Three BigQuery queries count, for each chapter, the distinct students who
    attempted any problem (tried_problem), watched any video (watched_video)
    and had any activity at all (nactive). Rows whose count is not above 20 are
    dropped from each result, the three results are outer-merged and counts that
    are missing after the merge are set to 0.

    Returns a DataFrame indexed by chapter_name, ordered by the chapter's index.
    """
    def fetch_counts(sql_template, count_column):
        # run the query for this course and keep only the rows with more than 20 students
        counts = pd.io.gbq.read_gbq(sql_template.format(course_id), project_id='ubcxdata', verbose=False)
        return counts[counts[count_column] > 20]

    # students who attempted any problem, per chapter
    problem_sql = """
    Select course_id, sub.index As index, module_id, chapter_name, exact_count_distinct(user_id) As tried_problem
    From
    (SELECT p.course_id As course_id, p.user_id As user_id, c2.index As index, 
    c2.module_id As module_id, c2.name As chapter_name
    FROM [{0}.problem_analysis] p
    Left Join [{0}.course_axis] c1
    on p.problem_url_name = c1.url_name
    Left Join [{0}.course_axis] c2
    On c1.chapter_mid = c2.module_id) sub
    Group By course_id, index, module_id, chapter_name
    Order By index"""
    problem_counts = fetch_counts(problem_sql, 'tried_problem')

    # students who watched any video, per chapter
    video_sql = """
    Select course_id, index, module_id, chapter_name, exact_count_distinct(username) As watched_video
    From
    (SELECT c1.course_id As course_id, v.username As username, c2.index As index, 
    c2.module_id As module_id, c2.name As chapter_name
    FROM [{0}.video_stats_day] v
    Left Join [{0}.course_axis] c1
    on v.video_id = c1.url_name
    Left Join [{0}.course_axis] c2
    On c1.chapter_mid = c2.module_id) sub
    Group By course_id, index, module_id, chapter_name
    Order By index"""
    video_counts = fetch_counts(video_sql, 'watched_video')

    # students with any activity, per chapter (studentmodule rows of type 'chapter')
    active_sql = """
    Select sub.course_id As course_id, sub.module_id As module_id, 
    c.name As chapter_name, c.index As index, sub.nactive As nactive
    From [{0}.course_axis] c
    Join 
    (Select course_id As course_id, Regexp_replace(module_id,'i4x://', '') As module_id, 
    exact_count_distinct(student_id) As nactive
    From [{0}.studentmodule]
    Where module_type = 'chapter' 
    Group By course_id, module_id) sub
    On sub.module_id = c.module_id
    Order By index"""
    active_counts = fetch_counts(active_sql, 'nactive')

    # outer-merge the three tables on their shared columns; a missing count means 0 students
    merged = video_counts.merge(problem_counts, how='outer')
    merged = merged.merge(active_counts, how='outer')
    merged = merged.fillna(0)
    # a chapter_name of 0 is a filled-in missing name, i.e. an item without a chapter
    with_chapter = merged[merged.chapter_name != 0]
    return with_chapter.sort_values('index').set_index('chapter_name')
    
# per-chapter activity counts for the configured course
module_activity = query_moduleActivity()
# chapter names in course order (the index of module_activity); reused by later cells
# to filter the course structure and to keep chapters ordered in the visualizations
indices = module_activity.index

In [4]:

def query_cs(course_id = course_id):
    """
    Given course_id (e.g. 'UBCx__Climate1x__1T2016'),
    return the course items (graded_problem, self_test, video, openassessment, chapter)
    from the course_axis table, ordered by index.

    Only chapters listed in the module-level `indices`, and items belonging to those
    chapters, are kept. The returned DataFrame has the columns category, index, name,
    url_name, chapter and num, where num is the number of distinct answer ids recorded
    for a problem in problem_analysis (1 for every item without such a record).

    IMPORTANT: Need to update course_axis in SPD1x first (delete items that belong to SPD2x and SPD3x)
    """
    # NOTE: the table reference must use single brackets; a doubled '[' is not a valid table name
    query = """
    SELECT
    Case 
         When c1.category='problem' And c1.graded='true' Then 'graded_problem'
         When c1.category='problem' And c1.graded!='true' Then 'self_test' 
         Else c1.category
    End As category, c1.index As index, c1.name As name,
    c1.url_name As url_name, c2.name As chapter
    FROM [{0}.course_axis] c1
    Left Join [{0}.course_axis] c2
    On c1.chapter_mid = c2.module_id
    Where c1.category in ('video', 'problem', 'openassessment', 'chapter')
    Order By c1.index""".format(course_id)

    structure = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)
    # keep a row when it is one of the reported chapters or an item inside one of them
    structure = structure[(structure.name.isin(indices)) | (structure.chapter.isin(indices))]

    # number of distinct answer ids (questions) per problem
    query = """
    Select problem_url_name, exact_count_distinct(item.answer_id) As num
    From [{0}.problem_analysis]
    Group By problem_url_name""".format(course_id)
    nQuestions = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)
    structure = (structure
                 .merge(nQuestions, left_on='url_name', right_on='problem_url_name', how='left')
                 .drop('problem_url_name', axis=1))
    # items without a problem_analysis record (videos, chapters, ...) count as one item
    structure['num'] = structure['num'].fillna(1)
    return structure

# course items (videos, problems, open assessments, chapters) of the configured course
course_structure = query_cs()

In [5]:

# count the videos, graded problems, self tests and assignments of each chapter,
# then order the chapters like `indices` and drop chapters without any item
category_counts = course_structure.groupby(['chapter', 'category']).num.sum().unstack('category')
cs_chapter = category_counts.reindex(indices).dropna(how='all')

# some courses don't have all the items, so keep only the categories that are present
cols = [col for col in ['video', 'graded_problem', 'self_test', 'openassessment']
        if col in cs_chapter.columns.values]
# append a placeholder 'chapter' column, then replace every missing value (placeholder included) with 0
cs_chapter = cs_chapter[cols].assign(chapter=np.nan).fillna(0)

Course structure and activity¶

In [6]:

# align the activity table with the chapters shown in the structure chart
module_activity = module_activity.reindex(cs_chapter.index)
fig = tls.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles=('Course structure', 'Module activity'))

# one color per item category; also used by later cells
colors = {'video': 'rgb(202,178,214)', 'graded_problem': 'rgb(66,146,198)', 
          'self_test': 'rgb(166,206,227)', 'openassessment': 'rgb(116,196,118)', 
          'chapter': 'rgb(0, 0, 0)'}

# traces for module activity (right panel): (column, legend name, fill mode, line color)
activity_traces = [
    ('watched_video', 'watched a video', 'tozerox', 'rgb(152,78,163)'),
    ('tried_problem', 'tried a problem', 'tonextx', 'rgb(66,146,198)'),
    ('nactive', 'with any activity', 'tonextx', 'rgb(255,127,0)'),
]
for column, label, fill_mode, line_color in activity_traces:
    fig.append_trace(go.Scatter(y=module_activity.index, x=module_activity[column],
                                name=label, fill=fill_mode, mode='lines',
                                line=dict(color=line_color)), 1, 2)

# traces for course structure (left panel): one stacked horizontal bar series per category.
# Columns are selected by label instead of the deprecated `.ix` positional indexer.
for category in cs_chapter.columns:
    fig.append_trace(go.Bar(y=cs_chapter.index, x=cs_chapter[category], orientation='h',
                            marker=dict(color=colors[category]), name=category), 1, 1)

# both panels list the chapters top-down; only the left panel shows the chapter labels
fig['layout']['yaxis1'].update(tickfont=dict(size=8), showgrid=False, autorange='reversed')
fig['layout']['yaxis2'].update(showticklabels=False, showgrid=False, autorange='reversed')
fig['layout']['xaxis1'].update(showgrid=False)
fig['layout']['xaxis2'].update(showgrid=False)
fig['layout']['legend'].update(x=1, y=0, traceorder='normal')
# the figure height grows with the number of chapters
fig['layout'].update(height=50+30*len(cs_chapter), width=850, margin=go.Margin(l=185, t=25, b=20), barmode='stack')
py.iplot(fig)

In [7]:

# def rolling_count(df):
#     df['block'] = (df['category'] != df['category'].shift(1)).astype(int).cumsum()
#     df['count'] = df.groupby('block').num.cumsum()
#     return df
# # count # of times an item (graded_problem, self_test, video) appears consecutively
# df = course_structure.fillna(method='bfill')
# df = df.groupby('chapter').apply(rolling_count)
# idx = df.groupby(['chapter', 'block'])['count'].transform(max) == df['count']
# df = df.ix[idx]

# # plotting    
# data = [go.Bar(x=df['count'], y=[course_id.replace('__', '/').replace('_', '.')]*len(df), 
#                orientation='h', hoverinfo='y',
#               marker=dict(color=df.category.apply(lambda x: colors[x]).values))]
# layout = go.Layout(
#     xaxis=dict(tickfont=dict(size=8), showgrid=False),
#     yaxis=dict(showticklabels=False),
#     barmode='stack', 
#     width=850,
#     height=50,
#     margin=go.Margin(b=15, t=0, l=100)
# )
# fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)

In [8]:

def course_item(course_id=course_id):
    """
    Given course_id, count the students who interacted with each course item.

    Chapters and openassessments are counted from the studentmodule table, problems
    (graded_problem / self_test) from problem_analysis and videos from video_stats
    => the numbers are slightly different from those queried from person_item and video_stats_day

    Returns a DataFrame (item_id, index, name, category, nstudents) ordered by index
    that only contains items with more than 20 students.
    """
    def run(sql):
        return pd.io.gbq.read_gbq(sql, project_id='ubcxdata', verbose=False)

    # chapters and open assessments
    chapter_sql = """
    SELECT sub.module_id As item_id, c.index As index, name, category, nstudents
    FROM [ubcxdata:{0}.course_axis] c
    Join 
    (Select Regexp_replace(module_id,'i4x://', '') As module_id, exact_count_distinct(student_id) As nstudents
    From [ubcxdata:{0}.studentmodule]
    Where module_type In ('openassessment', 'chapter')
    Group By module_id) sub
    On sub.module_id = c.module_id
    Order By index""".format(course_id)
    chapters_and_assessments = run(chapter_sql)

    # problems, labelled graded_problem or self_test
    problem_sql = """
    Select problem_url_name as item_id, index, name, 
    Case When graded='true' Then 'graded_problem' Else 'self_test' End As category,
    exact_count_distinct(user_id) As nstudents
    From [{0}.problem_analysis] p
    Join [{0}.course_axis] c
    On p.problem_url_name= c.url_name
    Group By item_id, index, name, category
    Order By index""".format(course_id)
    problem_items = run(problem_sql)

    # videos; rows with missing values are discarded
    video_sql = """
    Select video_id as item_id, index_video as index, name, 'video' As category, videos_viewed As nstudents
    From [{0}.video_stats]
    Where videos_viewed > 20""".format(course_id)
    video_items = run(video_sql).dropna()

    all_items = pd.concat([chapters_and_assessments, problem_items, video_items])
    ordered = all_items.sort_values('index')
    popular = ordered[ordered.nstudents > 20]
    return popular.reset_index(drop=True)

# per-item student counts for the configured course, ordered by position in the course
courseItem = course_item()

In [9]:

# make it center in the middle
trace1 = go.Bar(x=courseItem.index+1, y=courseItem.nstudents, hoverinfo='text',
               text=['{0}:<br>nstudents: {1}'.format(name.encode('utf-8'), value) 
                      for name, value in zip(courseItem.name, courseItem.nstudents)],
               marker=dict(color=courseItem.category.apply(lambda x: colors[x]).values))
trace2 = go.Bar(x=courseItem.index+1, y=-courseItem.nstudents, hoverinfo='none',
               marker=dict(color=courseItem.category.apply(lambda x: colors[x]).values))
data = [trace1, trace2]
layout = go.Layout(barmode='relative', title='course structure vs. students activity', 
                   xaxis=dict(showticklabels=False, title='course_structure'), 
                   yaxis=dict(showticklabels=False, showgrid=False, title='nstudents', zeroline=False), 
                   height=300, width=850, margin=go.Margin(t=25, b=15), showlegend=False)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [10]:

def query_nstudents(cs = course_structure, course_id = course_id):
    """
    Query the number of students who viewed each video and attempted each graded problem.

    cs is the course structure table; its 'video' rows are joined to the video statistics.

    Returns (videos, graded_problems):
    - videos: video rows of cs with nstudents (distinct users with a non-zero position)
      and length (the 5th largest position recorded for the video), restricted to
      length < 1800 and nstudents > 10
    - graded_problems: one row per problem with nstudents (number of person_item rows)
      and ncorrect (sum of item_grade), without the problems whose ncorrect is 0
    """
    def run(sql):
        return pd.io.gbq.read_gbq(sql, project_id='ubcxdata', verbose=False)

    video_sql = """
    Select l.video_id As video_id, nstudents, position As length
    From (Select video_id, position, Row_number() Over (Partition By video_id Order By position Desc) As rn
    From [{0}.video_stats_day]) l
    Join 
    (Select video_id, exact_count_distinct(username) As nstudents
    From [{0}.video_stats_day]
    Where position != 0
    Group By video_id) n
    On l.video_id = n.video_id
    Where rn=5""".format(course_id)
    video_stats = run(video_sql)
    # attach the statistics to the videos of the course structure
    course_videos = cs[cs.category=='video']
    videos = course_videos.merge(video_stats, left_on='url_name', right_on='video_id')
    keep = (videos.length<1800) & (videos.nstudents>10)
    videos = videos[keep].reset_index(drop=True)

    problem_sql = """
    SELECT c.chapter_name As chapter, c.section_name As section_name,
    p.item_short_id As name, c.item_id As problem_id,
    count(*) As nstudents, Sum(item_grade) As ncorrect
    FROM [{0}.person_item] p
    Join [{0}.course_item] c
    On p.item_short_id = c.item_short_id
    Group By chapter, section_name, name, c.item_nid, problem_id
    Order By c.item_nid""".format(course_id)
    problem_stats = run(problem_sql)
    # drop the problems nobody got credit for
    graded_problems = problem_stats[problem_stats.ncorrect != 0].reset_index(drop=True)

    return videos, graded_problems

In [11]:

# per-video and per-graded-problem student counts for the configured course
videos, problems = query_nstudents(cs = course_structure)

Video activity¶

In [12]:

# semi-transparent palette used to tell the chapters apart in the charts below
choices = ['rgba(166, 206, 227, 0.8)', 'rgba(31, 120, 180, 0.8)', 'rgba(178, 223, 138, 0.8)', 'rgba(51, 160, 44, 0.8)', 
           'rgba(251, 154, 153, 0.8)', 'rgba(227, 26, 28, 0.8)', 'rgba(253, 191, 111, 0.8)', 'rgba(255, 127, 0, 0.8)', 
           'rgba(202, 178, 214, 0.8)', 'rgba(106,61,154, 0.8)']
# create a dictionary to map colors to chapters; the palette is cycled so that a course
# with more chapters than palette entries still gets a color for every chapter
colors_chapter = {chapter: choices[position % len(choices)]
                  for position, chapter in enumerate(cs_chapter.index)}

In [13]:

# x-axis needs to start from 1; videos.length and videos.nstudents are normalized so that maximum=100
# hoverinfo => video_name: actual value
video_positions = videos.index+1
video_colors = videos.chapter.map(colors_chapter)

# upper bars: video length
length_scaled = videos.length/(videos.length.max()/100)
length_labels = ['{0}: {1}s'.format(name.encode('utf-8'), value) 
                 for name, value in zip(videos.name, videos.length.round(2))]
trace1 = go.Bar(x=video_positions, y=length_scaled, text=length_labels, hoverinfo='text',
                marker=dict(color=video_colors), name='video_length')

# lower (negative) bars: number of students who watched the video
watched_scaled = -videos.nstudents/(videos.nstudents.max()/100)
watched_labels = ['{0}: {1}'.format(name.encode('utf-8'), value) 
                  for name, value in zip(videos.name, videos.nstudents)]
trace2 = go.Bar(x=video_positions, y=watched_scaled, text=watched_labels, hoverinfo='text',
                marker=dict(color=videos.chapter.map(colors_chapter)), name='nstudents_watched')

data = [trace1, trace2]
y_axis = dict(showticklabels=False, showgrid=False, title='nstudents_watched          video_length (s)')
layout = go.Layout(barmode='relative', xaxis=dict(showticklabels=False), yaxis=y_axis, 
                   height=360, width=850, margin=go.Margin(t=25, b=25), showlegend=False)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

Graded problem activity¶

In [14]:

# x-axis needs to start from 1, problems.nstudents is normalized so that maximum=100 (the same scale as pct_correct)
# hoverinfo: section<br>question_name: actual value
pct_correct = (problems.ncorrect/problems.nstudents*100).round(2)
problem_positions = problems.index+1

def problem_labels(values):
    # hover text for every problem: its section, its name and the given value
    return ['{0}<br>{1}: {2}'.format(section.encode('utf-8'), name.encode('utf-8'), value) 
            for section, name, value in zip(problems.section_name, problems.name, values)]

# upper bars: percentage of correct attempts
trace1 = go.Bar(x=problem_positions, y=pct_correct, text=problem_labels(pct_correct),
                hoverinfo='text', marker=dict(color=problems.chapter.map(colors_chapter)), name='pct_correct')
# lower (negative) bars: number of students who attempted the problem
attempted_scaled = -problems.nstudents/(problems.nstudents.max()/100)
trace2 = go.Bar(x=problem_positions, y=attempted_scaled, text=problem_labels(problems.nstudents),
                hoverinfo='text', marker=dict(color=problems.chapter.map(colors_chapter)), name='nstudents_attempted')

data = [trace1, trace2]
y_axis = dict(showticklabels=False, showgrid=False, title='nstudents_attempted          pct_correct')
layout = go.Layout(barmode='relative', xaxis=dict(showticklabels=False), yaxis=y_axis, 
                   height=360, width=850, margin=go.Margin(t=25, b=25), showlegend=False)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [15]:

# def query_least(course_id=course_id):
#     """
#     Given the course_id, return the distribution of answers for the 10 least successful graded_problems
#     """
#     least = "', '".join(problems.ix[pct_correct.argsort()[:10], 'problem_id'].values)
#     # query 10 least successful problems and corresponding responses by each student
#     query = """
#     Select item.answer_id As problem_id, user_id, item.response As response, item.correctness
#     From [{0}.problem_analysis]
#     Where item.answer_id in ('{1}')
#     """.format(course_id, least)

#     answers = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)
#     answers = answers.merge(problems[['name', 'problem_id']])
#     # the correct answers: most common responses
#     correct =  answers[answers.item_correctness=='correct'].groupby('name').response.first().to_dict()
#     # total # of responses
#     count = answers.groupby('name').response.count()
    
#     answers.response = answers.response.apply(lambda x: x.replace('[', '').replace(']', '').split(', '))
#     rows = []
#     # explode items in a list to multiple rows <= those with more than 1 answer
#     _ = answers.apply(lambda row: [rows.append([row['name'], row['user_id'], choice]) 
#                              for choice in row.response], axis=1)
#     answers_new = pd.DataFrame(rows, columns=['name', 'user_id', 'response'])#.set_index(['name', 'opponent'])
#     # aggregate and calculate % distribution of answers for each problem
#     answers_pct = answers_new.groupby('name').response.value_counts().unstack('name').divide(count, axis=1)
#     return correct, answers_pct#, answers

In [16]:

# if course_id in not_mc_list:
#     pass
# else:
#     correct, answers_pct = query_least()
#     def correct_color(col):
#         # green for correct answers, blue otherwise
#         return ['rgb(44,162,95)' if x in correct[answers_pct.columns[col]] else 'rgb(49,130,189)' for x in answers_pct.index]

#     fig = tls.make_subplots(rows=2, cols=5, print_grid=False, vertical_spacing=0.25)
#     # the first five problems
#     for i in range(5):
#         quesion = answers_pct.ix[:, i].dropna()
#         fig.append_trace(go.Bar(x=quesion.index, y=quesion, name=answers_pct.columns[i], 
#                                 marker=dict(color=correct_color(i)), showlegend=False), 1, i+1)
#     # the next five problems
#     for i in range(5):
#         quesion = answers_pct.ix[:, i+5].dropna()
#         fig.append_trace(go.Bar(x=quesion.index, y=quesion, name=answers_pct.columns[i],
#                                 marker=dict(color=correct_color(i+5)), showlegend=False), 2, i+1)  

#     for i in range(1, answers_pct.shape[1]+1):
#         fig['layout']['xaxis%s' % i].update(tickangle=45, tickfont=dict(size=8),
#                                             title=answers_pct.columns[i-1], titlefont=dict(size=10))
#         fig['layout']['yaxis%s' % i].update(showgrid=False, tickfont=dict(size=8))

#         fig['layout'].update(height=500, width=850, 
#                              title = 'Ten least successful graded problems')  
#     py.iplot(fig)

Overall engagement:¶

**Learner type**

Registered: learners registered in the course
Sampled: learners who accessed the course at least once
Involved: learners with sum_dt > 15 min
Passed: learners whose grade is at least 50%
Verified: learners who purchased the verified certificate

**sum_dt**: total elapsed time spent by the learner on this course, computed from event data as the time difference between consecutive events, with a 5 min max cutoff

In [17]:

def query_pc(course_id = course_id):
    """
    Given course_id(e.g. 'UBCx__Marketing1x__3T2015'), query and calculate ndays_act, sum_dt, nforum_posts, 
    nvideos_watched, nproblems_attempted, pct_video_watched, pct_problem_attempted for sampled students
    (those with sum_dt > 0), also return total_videos and total_problems.

    pct_video_watched and pct_problem_attempted are fractions of total_videos / total_problems;
    they are 0 when the corresponding total is 0.
    Item categories that are missing from the module-level cs_chapter table count as 0 items.

    Returns the tuple (total_videos, total_problems, df).
    """
    query = """
    Select pc.user_id As user_id, pc.course_id As course_id, pc.mode As mode, pc.grade As grade, 
    pc.ndays_act As ndays_act, pc.sum_dt As sum_dt, pc.nforum_posts As nforum_posts,
    v.videos_watched As nvideos_watched, p.problems_attempted As nproblems_attempted
    From [{0}.person_course] pc
    Left Join
    (SELECT username, exact_count_distinct(video_id) As videos_watched 
    FROM [{0}.video_stats_day]
    Group By username) v
    on pc.username = v.username
    Left Join 
    (Select user_id, exact_count_distinct(item.answer_id) As problems_attempted
    From [{0}.problem_analysis]
    Group By user_id) p
    On pc.user_id = p.user_id
    Where pc.sum_dt > 0""".format(course_id)
    # students without any video/problem record get 0 instead of a missing value
    df = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False).fillna(0)

    def axis_total(categories):
        # number of course_axis items in the given categories; some courses don't have all
        # the item categories, so only the columns that exist in cs_chapter are summed
        present = [category for category in categories if category in cs_chapter.columns]
        return cs_chapter[present].sum().sum() if present else 0

    def share(counts, total):
        # fraction of `total` reached by every student; 0 when there is nothing to reach
        return counts / total if total else counts * 0.0

    # course_axis includes items not accessible to the students, so each total is the smaller of
    # the largest number of videos/problems any student accessed and the number in course_axis
    total_videos = min(df.nvideos_watched.max(), axis_total(['video']))
    df['pct_video_watched'] = share(df.nvideos_watched, total_videos)

    total_problems = min(df.nproblems_attempted.max(), axis_total(['graded_problem', 'self_test']))
    df['pct_problem_attempted'] = share(df.nproblems_attempted, total_problems)

    return  total_videos, total_problems, df

In [18]:

def compute_srp(pc):
    """
    Summarise engagement for the three learner types.

    pc holds one row per sampled learner. Every learner is labelled 'Sampled', those
    with sum_dt > 900 are additionally labelled 'Involved' and those with grade >= 0.5
    'Passed', so a learner can contribute to several groups.

    Returns a DataFrame indexed by ['Sampled', 'Involved', 'Passed'] holding the median of
    nvideos_watched, nproblems_attempted, ndays_act and sum_dt, and in nforum_posts the
    number of learners with at least one forum post. pc itself is not modified.
    """
    involved_mask = pc.sum_dt > 900
    passed_mask = pc.grade >= 0.5
    # one labelled copy of the relevant learners per learner type
    labelled = [
        pc.assign(category='Sampled'),
        pc[involved_mask].assign(category='Involved'),
        pc[passed_mask].assign(category='Passed'),
    ]
    srp = pd.concat(labelled)

    def count_posters(posts):
        # nforum_posts => number of students who posted
        return (posts > 0).sum()

    aggregations = {'nvideos_watched': np.median,
                    'nproblems_attempted': np.median,
                    'ndays_act': np.median,
                    'sum_dt': np.median,
                    'nforum_posts': count_posters}
    by_type = srp.groupby('category').agg(aggregations)
    # fixed row order; a learner type without members becomes a row of missing values
    return by_type.reindex(index=['Sampled', 'Involved', 'Passed'])

In [19]:

total_videos, total_problems, pc = query_pc()
# convert the fractions to percentages rounded to 2 decimals; the columns are kept numeric
# (not formatted into strings) so that they can be binned as numbers by the density map
pct_columns = ['pct_video_watched', 'pct_problem_attempted']
pc[pct_columns] = (pc[pct_columns] * 100).round(2)
# median engagement per learner type (Sampled / Involved / Passed)
srp_agg = compute_srp(pc)

In [20]:

def plot_pls(df, course_id=course_id, title=None):
    """
    Plot the students' engagement in five side-by-side panels:
    number of students per learner type (queried from person_course), median sum_dt in hours,
    median ndays_act, # students posted, and nvideos_watched / nproblems_attempted stacked;
    df holds the aggregated values per learner type (passed vs. involved vs. sampled).
    The "Verified" count is not queried for courses in pe_list.
    """
    if course_id in pe_list:
        query = \
        """SELECT Count(*) As Registered, 
        Sum(Case When sum_dt > 0 Then 1 Else 0 End) As Sampled,
        Sum(Case When sum_dt > 900 Then 1 Else 0 End) As Involved, 
        Sum(Case When grade >= 0.5 Then 1 Else 0 End) As Passed
        FROM [%s.person_course]""" % course_id
    else:
        query = \
        """SELECT Count(*) As Registered, 
        Sum(Case When sum_dt > 0 Then 1 Else 0 End) As Sampled,
        Sum(Case When sum_dt > 900 Then 1 Else 0 End) As Involved, 
        Sum(Case When grade >= 0.5 Then 1 Else 0 End) As Passed, 
        Sum(Case When mode='verified' Then 1 Else 0 End) As Verified
        FROM [%s.person_course]""" % course_id
    stats =  pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)

    fig = tls.make_subplots(rows=1, cols=5, shared_xaxes=True, print_grid=False)

    # panel 1: horizontal bars with the number of students per learner type
    conversion = go.Bar(x=stats.values[0], y=stats.columns, orientation='h',
                        showlegend=False, name='# of conversion')
    fig.append_trace(conversion, 1, 1)

    # remaining panels, in the order the traces are added: (values, name, show in legend, column)
    engagement_bars = [
        (df.nvideos_watched, 'nvideos watched', True, 5),
        (df.nproblems_attempted, 'nproblems attempted', True, 5),
        (df.sum_dt/3600, 'median sum_dt (H)', False, 2),
        (df.ndays_act, 'median days active', False, 3),
        (df.nforum_posts, '# students posted', False, 4),
    ]
    for values, label, in_legend, column in engagement_bars:
        fig.append_trace(go.Bar(x=df.index, y=values, showlegend=in_legend, name=label), 1, column)

    fig['layout'].update(barmode='stack', height=300, width=900, margin=go.Margin(t=40), title=title)
    fig['layout']['legend'].update(font=dict(size=10))

    # every panel gets the same axis styling and its own x-axis title
    axis_titles = ['# students', 'median sum_dt (H)', 'median days active',
                   '# students posted', 'median events']
    for number, axis_title in enumerate(axis_titles, 1):
        fig['layout']['xaxis%d' % number].update(title=axis_title, showgrid=False,
                                                 titlefont=dict(size=12), tickfont=dict(size=10))
        fig['layout']['yaxis%d' % number].update(showgrid=False, tickfont=dict(size=10))
    # list the learner types of the first panel top-down
    fig['layout']['yaxis1'].update(autorange='reversed')

    py.iplot(fig)

In [21]:

# engagement summary for the configured course, based on the aggregates in srp_agg
plot_pls(srp_agg, title="Students' engagement: Sampled vs. Involved vs. Passed")

In [22]:

# pc for the involved
pc_activity = pc[pc.sum_dt>900].copy()
# density map
trace2 = go.Histogram2d(x=pc_activity.pct_video_watched, y=pc_activity.pct_problem_attempted,
            histnorm='probability',
            autobinx=False,
            xbins=dict(start=0, end=100, size=10),
            autobiny=False,
            ybins=dict(start=0, end=100, size=10),
            colorscale=[[0, 'rgb(8,81,156)'], [1/1000, 'rgb(8,81,156)'], [1/100, 'rgb(242,211,56)'], 
                            [1/10, 'rgb(242,143,56)'], [1, 'rgb(217,30,30)']],
            zsmooth='fast')
# illustration on how to read the density map
z = [[1, 0.5, 0.5, 0.5], [0.5, 0.5, 0, 0], [0.5, 0, 0.5, 0], [0.5, 0, 0, 1]]
z_text = [['Early dropout', '', 'Videos only', ''], ['', 'Progress', '', ''],
         ['Problems only', '', 'Progress', ''], ['', '', '', 'Completed']]
annotations = []
for n, row in enumerate(z):
    for m, val in enumerate(row):
        text = z_text[n][m]
        annotations.append(
            dict(
                text=str(text),
                x=m, y=n,
                font=dict(color='black'),
                showarrow=False)
            )
colorscale=[[0, 'rgb(82,82,82)'], [0.5, 'rgb(150,150,150)'], [1, 'rgb(204,204,204)']]
trace1 = go.Heatmap(z=z, colorscale=colorscale, showscale=False, hoverinfo='none')

fig = tls.make_subplots(rows=1, cols=2, print_grid=False)
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)

fig['layout']['xaxis1'].update(ticks='', showticklabels=False, showgrid=False, title='% videos')
fig['layout']['xaxis2'].update(title='% videos (total:{0})'.format(int(total_videos)))
fig['layout']['yaxis1'].update(ticks='', showticklabels=False, showgrid=False, zeroline=False, title='% problems')
fig['layout']['yaxis2'].update(title='% problems (total:{0})'.format(int(total_problems)))
fig['layout'].update(
    width=850, height=400, annotations=annotations,
    title='Density map illustrating pattern of video and problem activity for the involved',
    margin=go.Margin(l=40, t=40)
)

py.iplot(fig)