%matplotlib inline
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import warnings
warnings.filterwarnings("ignore")
py.init_notebook_mode() # graphs charts inline (IPython).
# only change course_list (for all courses on edX), mooc_list (Moocs) and pe_list (professional education) if needed
# list for all the courses
course_list = [
'UBCx__Marketing1x__3T2015',
'UBCx__Climate1x__2T2016',
'UBCx__SPD1x__2T2016',
'UBCx__SPD2x__2T2016',
'UBCx__SPD3x__2T2016',
'UBCx__UseGen_1x__1T2016',
'UBCx__UseGen_2x__1T2016',
'UBCx__China300_1x__1T2016',
'UBCx__China300_2x__1T2016',
'UBCx__Forest222x__1T2015',
'UBCx__IndEdu200x__3T2015',
'UBCx__Water201x_2__2T2015',
'UBCx__CW1_1x__1T2016',
'UBCx__CW1_2x__1T2016',
'UBCx__Phot1x__1T2016',
'UBCx__ITSx__2T2015'
]
# list for the mooc course
mooc_list = [
'UBCx__Marketing1x__3T2015',
'UBCx__Climate1x__2T2016',
'UBCx__SPD1x__2T2016',
'UBCx__SPD2x__2T2016',
'UBCx__SPD3x__2T2016',
'UBCx__UseGen_1x__1T2016',
'UBCx__UseGen_2x__1T2016',
'UBCx__China300_1x__1T2016',
'UBCx__China300_2x__1T2016',
'UBCx__Forest222x__1T2015',
'UBCx__IndEdu200x__3T2015',
'UBCx__Water201x_2__2T2015',
]
# list for professional education courses
pe_list = [
'UBCx__CW1_1x__1T2016',
'UBCx__CW1_2x__1T2016',
'UBCx__Phot1x__1T2016',
'UBCx__ITSx__2T2015'
]
# make sure courses are ordered in the visualization
indices = [course.replace('__', '/').replace('_', '.') for course in course_list]
indices[indices.index('UBCx/Water201x.2/2T2015')] = 'UBCx/Water201x_2/2T2015'
mooc_indices = [course.replace('__', '/').replace('_', '.') for course in mooc_list]
mooc_indices[mooc_indices.index('UBCx/Water201x.2/2T2015')] = 'UBCx/Water201x_2/2T2015'
pe_indices = [course.replace('__', '/').replace('_', '.') for course in pe_list]
def query_moduleActivity(course_id):
"""
Give the course_id, query # students with any activity, # students attempted any problem
and # students watched any video for each chapter of the course
"""
# query # students attempted any problem for each chapter of the course, exclude those with less than 20 attempts
query = """
Select sub.course_id As course_id, sub.index As index, sub.module_id As module_id,
sub.chapter_name As chapter_name, Count(Distinct sub.user_id) As tried_problem
From
(SELECT p.course_id As course_id, p.user_id As user_id, c2.index As index,
c2.module_id As module_id, c2.name As chapter_name
FROM [{0}.problem_analysis] p
Left Join [{0}.course_axis] c1
on p.problem_url_name = c1.url_name
Left Join [{0}.course_axis] c2
On c1.chapter_mid = c2.module_id) sub
Group By course_id, index, module_id, chapter_name
Order By index""".format(course_id)
tried_problem = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)
tried_problem = tried_problem[tried_problem.tried_problem > 20]
# query # students watched any video for each chapter of the course, exclude those with less than 20 views
query = """
Select course_id, index, module_id, chapter_name, exact_count_distinct(username) As watched_video
From
(SELECT c1.course_id As course_id, v.username As username, c2.index As index,
c2.module_id As module_id, c2.name As chapter_name
FROM [{0}.video_stats_day] v
Left Join [{0}.course_axis] c1
on v.video_id = c1.url_name
Left Join [{0}.course_axis] c2
On c1.chapter_mid = c2.module_id) sub
Group By course_id, index, module_id, chapter_name
Order By index""".format(course_id)
watched_video = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)
watched_video = watched_video[watched_video.watched_video > 20]
# query # students with any activity for each chapter of the course, excluding those with less than 20 active students
query = """
Select sub.course_id As course_id, sub.module_id As module_id,
c.name As chapter_name, c.index As index, sub.nactive As nactive
From [{0}.course_axis] c
Join
(Select course_id As course_id, Regexp_replace(module_id,'i4x://', '') As module_id,
Count(Distinct student_id) As nactive
From [{0}.studentmodule]
Where module_type = 'chapter'
Group By course_id, module_id) sub
On sub.module_id = c.module_id
Order By index""".format(course_id)
nactive = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)
nactive = nactive[nactive.nactive > 20]
# merge watched_video, tried_problem, nactive
module_activity = watched_video.merge(tried_problem, how='outer').merge(nactive, how='outer').fillna(0)
return module_activity[module_activity.chapter_name != 0].sort_values('index').set_index('chapter_name')
modules = {}
for course_id in course_list:
modules[course_id] = query_moduleActivity(course_id)
def query_cs(course_id):
"""
Given course_id (e.g. 'UBCx__Climate1x__1T2016'),
return a list of all the course items (graded_problem, self_test, video, assignment, chapter)
from course_axis table ordered by index.
IMPORTANT: Need to update course_axis in SPD1x first (delete items that belong to SPD2x and SPD3x)
"""
query = """
SELECT c1.course_id As course_id,
Case
When c1.category='problem' And c1.graded='true' Then 'graded_problem'
When c1.category='problem' And c1.graded!='true' Then 'self_test'
Else c1.category
End As category, c1.index As index, c1.name As name,
c1.url_name As url_name, c2.name As chapter
FROM [[{0}.course_axis] c1
Left Join [{0}.course_axis] c2
On c1.chapter_mid = c2.module_id
Where c1.category in ('video', 'problem', 'openassessment', 'chapter')
Order By c1.index""".format(course_id)
indices = modules[course_id].index
structure = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)
structure = structure[(structure.name.isin(indices)) | (structure.chapter.isin(indices))]
query = """
Select problem_url_name, Count(Distinct item.answer_id) As num
From [{0}.problem_analysis]
Group By problem_url_name""".format(course_id)
nQuestions = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)
structure = structure.merge(nQuestions, left_on='url_name', right_on='problem_url_name', how='left')\
.drop('problem_url_name', axis=1)
structure.num = structure.num.fillna(1)
return structure
dfs = []
for course_id in course_list:
df = query_cs(course_id)
dfs.append(df)
course_structures = pd.concat(dfs)
css_agg = course_structures.groupby(['course_id', 'category']).num.sum().unstack('category')\
.reindex(indices)[['chapter', 'video', 'graded_problem', 'self_test', 'openassessment']].fillna(0)
# UseGen => different implementation for graded problems, need to query from course_item table
# for course in course_list:
# if 'UseGen' in course:
# value = pd.io.gbq.read_gbq("SELECT Count(*) FROM [%s.course_item]" % course,
# project_id='ubcxdata', verbose=False)
# css_agg.ix[course.replace('__', '/').replace('_', '.'), 'graded_problem'] = value.values[0][0]
# given the course_structures table, ploting the bar graph
data = []
# assign colors to different types of items
colors = {'video': 'rgb(202,178,214)', 'graded_problem': 'rgb(66,146,198)',
'self_test': 'rgb(166,206,227)', 'openassessment': 'rgb(116,196,118)', 'chapter': 'rgb(0, 0, 0)'}
for i in range(0, css_agg.shape[1]):
data.append(go.Bar(x=css_agg.ix[:, i], y=css_agg.index,
marker=dict(color=colors[css_agg.columns[i]]),
orientation='h', name=css_agg.columns[i]))
layout = go.Layout(
xaxis=dict(showgrid=False),
yaxis=dict(autorange='reversed'),
# adjusting height by # of courses in the course_list
height=25+30*len(css_agg),
width=600,
margin=go.Margin(l=180, b=25, t=0),
legend=dict(x=100, y=0),
barmode='stack')
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
# def rolling_count(df):
# df['block'] = (df['category'] != df['category'].shift(1)).astype(int).cumsum()
# df['count'] = df.groupby('block').num.cumsum()
# return df
# # iterate over courses in the course_list
# for course_id in indices:
# # query course structure (list of items in course_axis)
# # count # of times an item (graded_problem, self_test, video) appears consecutively
# df = course_structures[course_structures.course_id == course_id].copy()
# df.fillna(method='bfill', inplace=True)
# df = df.groupby('chapter').apply(rolling_count)
# idx = df.groupby(['chapter', 'block'])['count'].transform(max) == df['count']
# df = df.ix[idx]
# # plotting
# data = [go.Bar(x=df['count'], y=[course_id]*len(df),
# orientation='h', hoverinfo='y',
# marker=dict(color=df.category.apply(lambda x: colors[x]).values))]
# layout = go.Layout(
# xaxis=dict(tickfont=dict(size=8), showgrid=False),
# barmode='stack',
# width=750,
# height=45,
# margin=go.Margin(l=180, b=10, t=0)
# )
# fig = go.Figure(data=data, layout=layout)
# py.iplot(fig)
def course_item(course_id=course_id):
"""
Given course_id, query students' event for video, graded_problem,
self_test, openassessment and chapter from the studentmodule table
=> the numbers are slightly different from thosed queried from person_item and video_stats_day
"""
query = """
SELECT sub.module_id As item_id, c.index As index, name, category, nstudents
FROM [ubcxdata:{0}.course_axis] c
Join
(Select Regexp_replace(module_id,'i4x://', '') As module_id, exact_count_distinct(student_id) As nstudents
From [ubcxdata:{0}.studentmodule]
Where module_type In ('openassessment', 'chapter')
Group By module_id) sub
On sub.module_id = c.module_id
Order By index""".format(course_id)
chapter_assign = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)
query = """
Select problem_url_name as item_id, index, name,
Case When graded='true' Then 'graded_problem' Else 'self_test' End As category,
exact_count_distinct(user_id) As nstudents
From [{0}.problem_analysis] p
Join [{0}.course_axis] c
On p.problem_url_name= c.url_name
Group By item_id, index, name, category
Order By index""".format(course_id)
nproblems = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)
query = """
Select video_id as item_id, index_video as index, name, 'video' As category, videos_viewed As nstudents
From [{0}.video_stats]
Where videos_viewed > 20""".format(course_id)
nvideos = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False).dropna()
courseItem = pd.concat([chapter_assign, nproblems, nvideos]).sort_values('index')
courseItem = courseItem[courseItem.nstudents > 20].reset_index(drop=True)
return courseItem
for i in range(len(course_list)):
courseItem = course_item(course_list[i])
# make it center in the middle
trace1 = go.Bar(x=courseItem.index+1, y=courseItem.nstudents, hoverinfo='text',
text=['{0}:<br>nstudents: {1}'.format(name.encode('utf-8'), value)
for name, value in zip(courseItem.name, courseItem.nstudents)],
marker=dict(color=courseItem.category.apply(lambda x: colors[x]).values))
trace2 = go.Bar(x=courseItem.index+1, y=-courseItem.nstudents, hoverinfo='none',
marker=dict(color=courseItem.category.apply(lambda x: colors[x]).values))
data = [trace1, trace2]
layout = go.Layout(barmode='relative', xaxis=dict(showticklabels=False),
yaxis=dict(showticklabels=False, showgrid=False, title=indices[i], zeroline=False),
height=200, width=850, margin=go.Margin(t=0, b=0), showlegend=False)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
def query_convs(mooc_list):
"""
Query nregistered, nviewed, nexplored, npassing, nverified for all the courses
in the mooc_list(e.g. ['UBCx__Climate101x__3T2015', 'UBCx__Climate1x__1T2016', 'UBCx__Marketing1x__3T2015'])
"""
pc_tables = ',\n'.join(['[%s.person_course]' % x for x in course_list])
query = \
"""SELECT course_id, Count(*) As Registered,
Sum(Case When sum_dt > 0 Then 1 Else 0 End) As Sampled,
Sum(Case When sum_dt > 900 Then 1 Else 0 End) As Involved,
Sum(Case When grade >= 0.5 Then 1 Else 0 End) As Passed,
Sum(Case When mode='verified' Then 1 Else 0 End) As Verified
FROM %s
Group By course_id""" % pc_tables
convs = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)
convs.set_index('course_id', inplace=True)
return convs
def query_convs_pe(course_list):
"""
Query nregistered, nviewed, nexplored, npassing for all the courses
in the pe_list(e.g. ['UBCx__Phot1x__1T2016', 'UBCx__ITSx__2T2015'])
"""
pc_tables = ',\n'.join(['[%s.person_course]' % x for x in course_list])
query = \
"""SELECT course_id, Count(*) As Registered,
Sum(Case When sum_dt > 0 Then 1 Else 0 End) As Sampled,
Sum(Case When sum_dt > 900 Then 1 Else 0 End) As Involved,
Sum(Case When grade >= 0.5 Then 1 Else 0 End) As Passed
FROM %s
Group By course_id""" % pc_tables
convs = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)
convs.set_index('course_id', inplace=True)
return convs
# mooc_list
convs = query_convs(mooc_list).reindex(mooc_indices)
# % involved, passed, verified out of sampled students
convs_pct = convs.drop('Registered', axis=1).divide(convs.drop('Registered', axis=1).max(axis=1), axis=0)#.round(3)
# hover text
convs_txt = convs_pct.copy()
# normalize along the column for plotting in heatmap (different scales across columns)
convs_df = convs_txt.divide(convs_txt.max(axis=0), axis=1).ix[:, 1:]
# convert to %
convs_pct = convs_pct.applymap(lambda x: "{0:.2f}".format(x * 100))
convs_txt = convs_txt.applymap(lambda x: "{0:.2f}".format(x * 100)).ix[:, 1:]
** sum_dt **: Total elapsed time spent by learner on this course, based on time difference between consecutive events, with a 5 min max cutoff, based on event data
names = convs.columns
colors = ['rgb(55, 126, 184)', 'rgb(255, 127, 0)', 'rgb(77, 175, 74)', 'rgb(228, 26, 28)', 'rgb(152, 78, 163)']
fig = tls.make_subplots(rows=1, cols=3, print_grid=False,
subplot_titles=('# students', '% of sampled', 'Compare % of sampled'))
# plotting # students
for i in range(0, convs.shape[1]):
fig.append_trace(go.Bar(x=convs.ix[:, i], y=convs.index, orientation='h',
marker=dict(color=colors[i]), name=names[i]), 1, 1)
# plotting # of sampled
for i in range(0, convs_pct.shape[1]):
fig.append_trace(go.Bar(x=convs_pct.ix[:, i], y=convs_pct.index, orientation='h',
marker=dict(color=colors[i+1]), name=names[i+1], showlegend=False), 1, 2)
# plotting heatmap
fig.append_trace(go.Heatmap(z=convs_df.values, x=convs_df.columns, y=convs_df.index,
text=convs_txt.values, hoverinfo='y+text',
colorscale=[[0.0, 'rgb(224,243,248)'], [1.0, 'rgb(43,130,189)']], showscale=False), 1, 3)
fig['layout']['xaxis1'].update(showgrid=False)
fig['layout']['yaxis1'].update(autorange='reversed')
fig['layout']['xaxis2'].update(showgrid=False)
fig['layout']['yaxis2'].update(showticklabels=False, autorange='reversed')
fig['layout']['yaxis3'].update(showticklabels=False, autorange='reversed')
fig['layout']['xaxis3'].update(autorange='reversed')
# fig['layout']['legend'].update(x=0.13, y=0.15)
fig['layout'].update(barmode='overlay', height=100+30*len(convs), width=900,
title='MOOC', margin=go.Margin(l=180, t=60, b=40))
py.iplot(fig)
# pe_list
convs_pe = query_convs_pe(pe_list).reindex(pe_indices)
convs_pct = convs_pe.drop('Registered', axis=1).divide(convs_pe.drop('Registered', axis=1).max(axis=1), axis=0)#.round(3)
convs_txt = convs_pct.copy()
convs_df = convs_txt.divide(convs_txt.max(axis=0), axis=1).ix[:, 1:]
convs_pct = convs_pct.applymap(lambda x: "{0:.2f}".format(x * 100))
convs_txt = convs_txt.applymap(lambda x: "{0:.2f}".format(x * 100)).ix[:, 1:]
names = convs.columns
colors = ['rgb(55, 126, 184)', 'rgb(255, 127, 0)', 'rgb(77, 175, 74)', 'rgb(228, 26, 28)', 'rgb(152, 78, 163)']
fig = tls.make_subplots(rows=1, cols=3, print_grid=False,
subplot_titles=('# students', '% of sampled', 'Compare % of sampled'))
for i in range(0, convs_pe.shape[1]):
fig.append_trace(go.Bar(x=convs_pe.ix[:, i], y=convs_pe.index, orientation='h',
marker=dict(color=colors[i]), name=names[i]), 1, 1)
for i in range(0, convs_pct.shape[1]):
fig.append_trace(go.Bar(x=convs_pct.ix[:, i], y=convs_pct.index, orientation='h',
marker=dict(color=colors[i+1]), name=names[i+1], showlegend=False), 1, 2)
fig.append_trace(go.Heatmap(z=convs_df.values, x=convs_df.columns, y=convs_df.index,
text=convs_txt.values, hoverinfo='y+text',
colorscale=[[0.0, 'rgb(224,243,248)'], [1.0, 'rgb(43,130,189)']], showscale=False), 1, 3)
fig['layout']['xaxis1'].update(showgrid=False)
fig['layout']['yaxis1'].update(autorange='reversed')
fig['layout']['xaxis2'].update(showgrid=False)
fig['layout']['yaxis2'].update(showticklabels=False, autorange='reversed')
fig['layout']['yaxis3'].update(showticklabels=False, autorange='reversed')
fig['layout']['xaxis3'].update(autorange='reversed')
# fig['layout']['legend'].update(x=0.13, y=0.15)
fig['layout'].update(barmode='overlay', height=100+30*len(convs_pe), width=900,
title='PE', margin=go.Margin(l=180, t=60, b=40))
py.iplot(fig)
# dictionaries to store total # of videos/problems for each course in the course_list
total_videos = {}
total_problems = {}
def query_pcs_learned(course_list = course_list):
"""
Iterate over each course in the course_list, query and calculate ndays_act, sum_dt, nforum_posts,
nvideos_watched, nproblems_attempted, pct_video_watched, pct_problem_attempted for involved students.
Update total_videos and total_problems
"""
dfs = []
for i in range(len(course_list)):
query = """
Select pc.user_id As user_id, pc.course_id As course_id, pc.mode As mode, pc.grade As grade,
pc.ndays_act As ndays_act, pc.sum_dt / 3600 As sum_dt, pc.nforum_posts As nforum_posts,
v.videos_watched As nvideos_watched, p.problems_attempted As nproblems_attempted
From [{0}.person_course] pc
Left Join
(SELECT username, Count(Distinct video_id) As videos_watched
FROM [{0}.video_stats_day]
Group By username) v
on pc.username = v.username
Left Join
(Select user_id, Count(Distinct item.answer_id) As problems_attempted
From [{0}.problem_analysis]
Group By user_id) p
On pc.user_id = p.user_id
Where pc.sum_dt > 900""".format(course_list[i])
df = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False).fillna(0)
course_id = indices[i]
total_videos[course_id] = min(df.nvideos_watched.max(), css_agg.ix[course_id, 'video'])
df['pct_video_watched'] = df.nvideos_watched / total_videos[course_id]
total_problems[course_id] = min(df.nproblems_attempted.max(),
css_agg.ix[course_id, 'graded_problem'] +
css_agg.ix[course_id, 'self_test'])
df['pct_problem_attempted'] = df.nproblems_attempted / total_problems[course_id]
dfs.append(df)
pcs_learned = pd.concat(dfs)
return pcs_learned
cols = ['ndays_act', 'sum_dt', 'pct_video_watched', 'pct_problem_attempted', 'nforum_posts']
# for the involved
pcs_learned = query_pcs_learned()
# aggregate nforum_posts => % students posted, others by median
pcs_learned_agg = pcs_learned.groupby('course_id').agg({'ndays_act': np.median, 'sum_dt': np.median,
'pct_video_watched': np.median, 'pct_problem_attempted': np.median,
'nforum_posts': lambda s: (s > 0).sum() / len(s)})
# order the courses as needed
pcs_learned_agg = pcs_learned_agg[cols].reindex(indices)
# for the passed
pcs_passed = pcs_learned[pcs_learned.grade >= 0.5]
pcs_passed_agg = pcs_passed.groupby('course_id').agg({'ndays_act': np.median, 'sum_dt': np.median,
'pct_video_watched': np.median, 'pct_problem_attempted': np.median,
'nforum_posts': lambda s: (s > 0).sum() / len(s)})
pcs_passed_agg = pcs_passed_agg[cols].reindex(indices)
# normalize along the columns for plotting heatmap
pcs_passed_norm = pcs_passed_agg.divide(pcs_passed_agg.max(axis=0), axis=1)
pcs_learned_norm = pcs_learned_agg.divide(pcs_learned_agg.max(axis=0), axis=1)
# formatting
pcs_passed_agg.sum_dt = pcs_passed_agg.sum_dt.round(2)
pcs_passed_agg[['pct_video_watched', 'pct_problem_attempted', 'nforum_posts']] = \
pcs_passed_agg[['pct_video_watched', 'pct_problem_attempted', 'nforum_posts']].applymap(lambda x: "{0:.2f}".format(x * 100))
pcs_learned_agg.sum_dt = pcs_learned_agg.sum_dt.round(2)
pcs_learned_agg[['pct_video_watched', 'pct_problem_attempted', 'nforum_posts']] = \
pcs_learned_agg[['pct_video_watched', 'pct_problem_attempted', 'nforum_posts']].applymap(lambda x: "{0:.2f}".format(x * 100))
# normalized value for plotting heatmap, unnormalized for hoverinfo
names = ['median days active', 'median sum_dt (H)',
'median % videos', 'median % problems', '% students posted']
trace1 = go.Heatmap(
z=pcs_learned_norm.values,
x=names,
y=pcs_learned_norm.index,
text = pcs_learned_agg.values,
hoverinfo='x+text',
showscale=False,
colorscale=[[0.0, 'rgb(224,243,248)'], [1.0, 'rgb(51,160,44)']])
trace2 = go.Heatmap(
z=pcs_passed_norm.values,
x=names,
y=pcs_passed_norm.index,
text = pcs_passed_agg.values,
hoverinfo='x+text',
showscale=False,
colorscale=[[0.0, 'rgb(224,243,248)'], [1.0, 'rgb(43,130,189)']])
fig = tls.make_subplots(rows=1, cols=2, print_grid=False,
subplot_titles=('Engagement for the involved',
'Engagement for the passed'))
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig['layout']['yaxis1'].update(autorange='reversed')
fig['layout']['yaxis2'].update(showticklabels=False, autorange='reversed')
fig['layout'].update(
width=750, height=140+30*len(pcs_learned_agg),
margin=go.Margin(l=180, b=120, t=20)
)
py.iplot(fig)
dfs = []
for course_id in course_list:
query = """
Select pcd.course_id As course_id, pcd.username as username,
pcd.date As date, pc.grade As grade, pc.sum_dt As sum_dt
From [{0}.person_course_day] pcd
Join [{0}.person_course] pc
On pcd.username = pc.username
Where pcd.sum_dt > 0""".format(course_id)
dfs.append(pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False))
dates = pd.concat(dfs).reset_index(drop=True)
dates.date = pd.to_datetime(dates.date, format="%Y/%m/%d")
dates = dates[dates.grade >= 0.5]
def cal_days(df):
df['days'] = [int(i.days) for i in (df.date - df.date.min())]
return df
dates_days = dates.groupby(['course_id', 'username']).apply(cal_days)
dates_days['days_bin'] = pd.cut(dates_days.days, bins=[1, 7, 30, 60, 90, 180, 365],
labels=['1W', '1M', '2M', '3M', '6M', '>6M'], include_lowest=True, right=False)
days_agg = dates_days.groupby(['course_id', 'days_bin']).username.nunique().unstack('days_bin').fillna(0)\
.divide(dates_days.groupby('course_id').username.nunique(), axis=0).reindex(indices).applymap(lambda x: "{0:.2f}".format(x * 100))
data = [
go.Heatmap(
z=days_agg.values,
colorscale=[[0, 'rgb(240,249,232)'], [0.25, 'rgb(186,228,188)'], [0.5, 'rgb(123,204,196)'],
[0.75, 'rgb(67,162,202)'], [1, 'rgb(8,104,172)']],
x=days_agg.columns,
y=days_agg.index
)
]
layout = go.Layout(width=600, height=60+30*len(days_agg), margin=go.Margin(l=180, b=40, t=20),
yaxis=dict(autorange='reversed'), xaxis=dict(title='Time elapsed'))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
# iterate over courses in the course_list
for i in range(len(indices)):
showlegend = True if i==0 else False
fig = tls.make_subplots(rows=1, cols=2, print_grid=False)
# query # students with any activity, # students attempted any problem and # students watched any video
module_activity = modules[course_list[i]]
# plotting
fig.append_trace(go.Scatter(x=module_activity.index, y=module_activity.tried_problem,
name='tried a problem', fill='tozeroy', mode='lines',
line = dict(color = ('rgb(255, 127, 0)')), showlegend=showlegend), 1, 1)
fig.append_trace(go.Scatter(x=module_activity.index, y=module_activity.watched_video,
name='watched a video', fill='tonexty', mode='lines',
line = dict(color = ('rgb(77, 175, 74)')), showlegend=showlegend), 1, 1)
fig.append_trace(go.Scatter(x=module_activity.index, y=module_activity.nactive,
name='with any activity', fill='tonexty', mode='lines',
line = dict(color = ('rgb(55, 126, 184)')), showlegend=showlegend), 1, 1)
# select corresponding person_course
pc_activity = pcs_learned[pcs_learned.course_id==indices[i]].copy()
# convert to %
pc_activity[['pct_video_watched', 'pct_problem_attempted']] = \
pc_activity[['pct_video_watched', 'pct_problem_attempted']].applymap(lambda x: "{0:.2f}".format(x * 100))
# plotting density map
fig.append_trace(
go.Histogram2d(x=pc_activity.pct_video_watched, y=pc_activity.pct_problem_attempted,
histnorm='probability',
autobinx=False,
xbins=dict(start=0, end=100, size=10),
autobiny=False,
ybins=dict(start=0, end=100, size=10),
colorscale=[[0, 'rgb(12,51,131)'], [1/1000, 'rgb(12,51,131)'], [1/100, 'rgb(242,211,56)'],
[1/10, 'rgb(242,143,56)'], [1, 'rgb(217,30,30)']],
zmin=0, zmax=0.601,
zsmooth='fast',
colorbar=dict(thickness=20)), 1, 2)
fig['layout']['xaxis1'].update(title='chapter', showticklabels=False, showgrid=False)
fig['layout']['yaxis1'].update(title='# students', showgrid=False)
fig['layout']['xaxis2'].update(title='% videos (total:{0})'.format(int(total_videos[indices[i]])))
fig['layout']['yaxis2'].update(title='% problems (total:{0})'.format(int(total_problems[indices[i]])))
fig['layout']['legend'].update(x=0.2, y=1)
fig['layout'].update(height=250, width=750, margin=go.Margin(l=50, t=40, b=35), title=indices[i])
py.iplot(fig)