%matplotlib inline
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
# Enable plotly's offline (in-notebook) rendering mode.
py.init_notebook_mode()
# Default course analyzed below; the query_* functions use it as their default argument.
course_id = 'UBCx__Climate101x__3T2015'
# ** Entry survey **
def query_entry(course_id = course_id):
    """
    Query the entry survey for a course from BigQuery.

    course_id: BigQuery dataset name, e.g. 'UBCx__Marketing1x__3T2015'.
    Returns a DataFrame with one row per respondent (latest submission kept)
    and the survey-answer columns only, with the "s_" prefix stripped.
    """
    query = "Select * From [%s.entry_survey_mapped]" % course_id
    survey = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)
    # Keep each user's last submission, then drop the first 11 metadata
    # columns.  .iloc replaces the .ix indexer removed in pandas 1.0.
    survey = survey.drop_duplicates('user_id', keep='last').iloc[:, 11:]
    # NOTE(review): this removes every "s_" occurrence, not only a leading
    # prefix — assumed intentional for these column names; confirm.
    survey.columns = survey.columns.str.replace("s_", "")
    return survey
def query_exit(course_id = course_id):
    """
    Query the exit survey for a course from BigQuery.

    course_id: BigQuery dataset name, e.g. 'UBCx__Marketing1x__3T2015'.
    Returns a DataFrame with one row per respondent (latest submission kept)
    and the survey-answer columns only, with the "s_" prefix stripped.
    """
    query = "Select * From [%s.exit_survey_mapped]" % course_id
    survey = pd.io.gbq.read_gbq(query, project_id='ubcxdata', verbose=False)
    # Keep each user's last submission, then drop the first 11 metadata
    # columns.  .iloc replaces the .ix indexer removed in pandas 1.0.
    survey = survey.drop_duplicates('user_id', keep='last').iloc[:, 11:]
    # NOTE(review): removes every "s_" occurrence, not only a prefix.
    survey.columns = survey.columns.str.replace("s_", "")
    return survey
def query_pc(course_id = course_id):
    """
    Query person-course engagement metrics for a course.

    Given course_id (e.g. 'UBCx__Marketing1x__3T2015'), pull ndays_act,
    sum_dt, nforum_posts, nvideos_watched and nproblems_attempted per
    student (sum_dt > 0 only), with missing counts filled as 0.
    """
    sql = """
    Select pc.user_id As user_id, is_active, certified, pc.course_id As course_id, pc.mode As mode, pc.grade As grade,
    pc.ndays_act As ndays_act, pc.sum_dt As sum_dt, pc.nforum_posts As nforum_posts,
    v.videos_watched As nvideos_watched, p.problems_attempted As nproblems_attempted
    From [{0}.person_course] pc
    Left Join
    (SELECT username, exact_count_distinct(video_id) As videos_watched
    FROM [{0}.video_stats_day]
    Group By username) v
    on pc.username = v.username
    Left Join
    (Select user_id, exact_count_distinct(item.answer_id) As problems_attempted
    From [{0}.problem_analysis]
    Group By user_id) p
    On pc.user_id = p.user_id
    Where pc.sum_dt > 0""".format(course_id)
    result = pd.io.gbq.read_gbq(sql, project_id='ubcxdata', verbose=False)
    # Students absent from the video/problem subqueries come back as NaN.
    return result.fillna(0)
# Load surveys and person-course metrics for the default course.
# NOTE(review): `exit` shadows the Python builtin of the same name.
exit = query_exit()
entry = query_entry()
pc = query_pc()
# Students who answered BOTH surveys (inner join on user_id).
surveys = pd.merge(entry, exit, on='user_id', how='inner')
# pc = query_pc()
# Entry-survey respondents joined with their engagement metrics.
merged = entry.merge(pc, on='user_id')
def compute_pcstats(merged, selections, start):
    """
    Compute engagement stats grouped by students' stated goals.

    merged: entry survey merged with person_course
    selections: list of goal labels from the entry survey
    start: positional index of the first goal column in `merged`

    Returns (median, counts, pct_passing):
      median      -- DataFrame of median grade / nforum_posts / ndays_act /
                     nproblems_attempted / nvideos_watched per kept goal
      counts      -- number of students per kept goal
      pct_passing -- fraction certified per kept goal
    Goals chosen by fewer than 5% of students are dropped.
    """
    frames = []
    removed = []
    counts = []
    pct_passing = []
    kept = selections[:]
    min_count = len(merged) * 0.05  # 5% threshold for keeping a goal
    for i, selection in enumerate(selections):
        # A non-null goal column marks the students who picked that goal.
        # (.iloc/.loc replace the .ix indexer removed in pandas 1.0.)
        mask = merged.iloc[:, i + start].notnull()
        group_count = mask.sum()
        if group_count > min_count:
            counts.append(group_count)
            pct_passing.append(merged.loc[mask, 'certified'].sum() / group_count)
            frames.append(pd.DataFrame(data={
                'Selection': selection,
                'grade': merged.loc[mask, 'grade'],
                'nforum_posts': merged.loc[mask, 'nforum_posts'],
                'ndays_act': merged.loc[mask, 'ndays_act'],
                'nproblems_attempted': merged.loc[mask, 'nproblems_attempted'],
                'nvideos_watched': merged.loc[mask, 'nvideos_watched']}))
        else:
            removed.append(selection)
    for s in removed:
        kept.remove(s)
    result = pd.concat(frames)
    # Reindex by `kept` so rows follow the original selection order.
    median = result.groupby('Selection').median().loc[kept, :]
    return median, counts, pct_passing
# Goal options offered in the Climate entry survey (one column per goal).
selections = ['Relevant understanding',
'Something interesting',
'Verified certificate',
'Pursue further',
'Learn with friends',
'Impress people',
'More about MOOCs',
'Substantial understanding',
'Practical knowledge',
'Improve English',
'Others']
# Restrict to students flagged active in person_course.
merged1 = merged[merged.is_active==1]
# 10 is the positional index of the first goal column in the merged frame.
median, counts, pct_passing = compute_pcstats(merged1, selections, 10)
# median
def plot_pcstats(median, counts, pct_passing, title=None):
    """
    Plot engagement-by-goal stats as five stacked subplots:
    median events (videos / problems / forum posts), median days active,
    median grade, pct passing, and number of students.

    median: DataFrame indexed by goal (from compute_pcstats)
    counts: number of students per goal
    pct_passing: fraction certified per goal
    """
    # Row 1 stacks the three event counts; rows 2-5 get one bar trace each.
    event_traces = [
        go.Bar(x=median.index, y=median.nvideos_watched, name='# videos watched', showlegend=True),
        go.Bar(x=median.index, y=median.nproblems_attempted, name='# problems attempted', showlegend=True),
        go.Bar(x=median.index, y=median.nforum_posts, name='# forum posts', showlegend=True),
    ]
    row_traces = [
        go.Bar(x=median.index, y=median.ndays_act, name='days active', showlegend=False),
        go.Bar(x=median.index, y=median.grade, name='grade', showlegend=False),
        go.Bar(x=median.index, y=pct_passing, name='pct passing', showlegend=False),
        go.Bar(x=median.index, y=counts, name='# of students', showlegend=False),
    ]
    fig = tls.make_subplots(rows=5, cols=1, print_grid=False,
                            subplot_titles=('median events', 'median days active',
                                            'median grade', 'pct passing', '# of students'))
    for trace in event_traces:
        fig.append_trace(trace, 1, 1)
    for row, trace in enumerate(row_traces, start=2):
        fig.append_trace(trace, row, 1)
    # Same cosmetics for every subplot axis.
    for i in range(1, 6):
        fig['layout']['xaxis%d' % i].update(tickfont=dict(size=8))
        fig['layout']['yaxis%d' % i].update(showgrid=False)
    fig['layout'].update(barmode='stack', title=title, font=dict(size=12),
                         height=920, width=600, margin=go.Margin(t=80, b=120))
    py.iplot(fig)
plot_pcstats(median, counts, pct_passing, title="Students' engagement by goals")
# ** Exit survey **
def compute_heatmap(surveys, selections, matrix, start, col_names):
    """
    Compute response rates for 'Are you likely...' / 'How satisfied...' questions,
    grouped by the goals students picked in the entry survey.

    surveys: entry survey merged with exit survey
    selections: list of goal labels from the entry survey
    start: positional index of the first goal column in `surveys`
    matrix: dict mapping exit-survey column name -> empty list,
            e.g. {'Q2_2_1': [], ..., 'Q2_2_5': []}  (lists are filled in place)
    col_names: display names for those columns, positionally matched to the
               dict's key order — assumes insertion order (Python 3.7+ dicts)
    Returns a DataFrame indexed by kept goals plus an 'Overall' row.
    Goals chosen by fewer than 5% of respondents are dropped.
    """
    removed = []
    kept = selections[:]
    min_count = len(surveys) * 0.05  # 5% threshold for keeping a goal
    for i, selection in enumerate(selections):
        # Non-null goal column marks students who picked that goal.
        # (.items()/.iloc replace the py2-only iteritems() and removed .ix.)
        mask = surveys.iloc[:, i + start].notnull()
        group_count = mask.sum()
        if group_count > min_count:
            for col, values in matrix.items():
                values.append(surveys.loc[mask, col].notnull().sum() / group_count)
        else:
            removed.append(selection)
    # Final entry per column: response rate across all respondents.
    for col, values in matrix.items():
        values.append(surveys.loc[:, col].notnull().sum() / len(surveys))
    for s in removed:
        kept.remove(s)
    kept = kept + ['Overall']
    result = pd.DataFrame(matrix)
    result.index = kept
    result.columns = col_names
    return result
# 'Are you likely to ...' exit-survey columns and their display names.
matrix = {'Q2_2_1': [], 'Q2_2_2': [], 'Q2_2_3': [], 'Q2_2_4': [], 'Q2_2_5': []}
col_names = ['Recommend', 'Advanced', 'Revisit', 'Involvement', 'Habbits']
likely = compute_heatmap(surveys, selections, matrix, 10, col_names)
#likely
def heatmap(df, title=None, width=500):
    """Render *df* as a plotly heatmap (rows on y, columns on x)."""
    # Light-to-dark blue colour ramp.
    blue_scale = [[0.0, 'rgb(224,243,248)'], [1.0, 'rgb(43,130,189)']]
    trace = go.Heatmap(z=df.values, x=df.columns, y=df.index,
                       colorscale=blue_scale)
    # Reverse the y axis so the first row appears at the top.
    layout = go.Layout(yaxis=dict(autorange='reversed'),
                       width=width, height=500,
                       margin=go.Margin(l=100, b=120),
                       title=title)
    py.iplot(go.Figure(data=[trace], layout=layout))
heatmap(likely.T, title='Exit survey feedback grouped by goals')
def cal_goalsmet(surveys, selections, start, col_name):
    """
    Compute the distribution of goals-met answers per entry-survey goal.

    surveys: entry survey merged with exit survey
    selections: list of goal labels from the entry survey
    start: positional index of the first goal column in `surveys`
    col_name: name of the goals-met column, e.g. 'Q2_1'

    Returns a DataFrame indexed by kept goals plus an 'Overall' row, with
    one column per answer ('exceeded' / 'met' / 'somewhat met'), values
    normalized within each goal.  Goals chosen by fewer than 5% of
    respondents are dropped.
    """
    frames = []
    removed = []
    kept = selections[:]
    min_count = len(surveys) * 0.05  # 5% threshold for keeping a goal
    for i, selection in enumerate(selections):
        # Non-null goal column marks students who picked that goal.
        # (.iloc/.loc replace the .ix indexer removed in pandas 1.0.)
        mask = surveys.iloc[:, i + start].notnull()
        group_count = mask.sum()
        if group_count > min_count:
            frames.append(pd.DataFrame(data={'Selection': selection,
                                             'goalsmet': surveys.loc[mask, col_name]}))
        else:
            removed.append(selection)
    for s in removed:
        kept.remove(s)
    goalsmet = pd.concat(frames)
    # Per-goal normalized answer frequencies: goals as rows, answers as columns.
    goalsmet = goalsmet.groupby('Selection').goalsmet.value_counts(normalize=True).unstack('Selection').fillna(0).T
    col_order = ['Yes and more, the course exceeded my expectations.',
                 'Yes, my goals were met.', 'My goals were somewhat met.']
    goalsmet = goalsmet.loc[kept, col_order]
    goalsmet.loc['Overall'] = surveys[col_name].value_counts(normalize=True)[col_order]
    return goalsmet
def stacked_bar(df, names, barmode, title=None, width=600):
    """
    Plot a stacked or overlaid bar chart, one trace per DataFrame column.

    df: values to plot, indexed by category (x axis)
    names: display names for the traces, positionally matched to columns
    barmode: plotly barmode, e.g. 'stack' or 'overlay'
    """
    # ColorBrewer-style palette; supports up to four columns.
    colors = ['rgb(77, 175, 74)', 'rgb(255, 127, 0)', 'rgb(55, 126, 184)', 'rgb(228, 26, 28)']
    # .iloc replaces the .ix indexer removed in pandas 1.0.
    data = [go.Bar(x=df.index, y=df.iloc[:, i], marker=dict(color=colors[i]), name=names[i])
            for i in range(df.shape[1])]
    layout = go.Layout(
        width=width,
        margin=go.Margin(b=150),
        barmode=barmode,
        title=title)
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)
# Trace labels matching cal_goalsmet's column order.
names = ['Exceeded expection', 'Goals met', 'Goals somewhat met']
goals_met = cal_goalsmet(surveys, selections, 10, 'Q2_1')
stacked_bar(goals_met, names, 'stack', title='Percent goals met grouped by goals', width=800)
# 'How satisfied ...' exit-survey columns and their display names.
satisfy = {'Q4_1_1': [], 'Q4_1_2': [], 'Q4_1_3': [], 'Q4_1_4': [], 'Q4_1_5': [], 'Q4_1_6': [], 'Q4_1_7': []}
col_names = ['challenge', 'workload', 'pace', 'depth', 'communication', 'responsiveness', 'knowledge']
satisfy = compute_heatmap(surveys, selections, satisfy, 10, col_names)
#satisfy
heatmap(satisfy.T, title='Percent satisfaction of course components grouped by goals')
# Load surveys for three course runs to compare across courses.
entry_marketing = query_entry('UBCx__Marketing1x__3T2015')
exit_marketing = query_exit('UBCx__Marketing1x__3T2015')
surveys_marketing = pd.merge(entry_marketing, exit_marketing, on='user_id', how='inner')
exit_climate = query_exit('UBCx__Climate101x__3T2015')
entry_climate = query_entry('UBCx__Climate101x__3T2015')
# NOTE(review): this reassigns `surveys` (defined earlier for the default course).
surveys = pd.merge(entry_climate, exit_climate, on='user_id', how='inner')
exit_climate2 = query_exit('UBCx__Climate1x__1T2016')
entry_climate2 = query_entry('UBCx__Climate1x__1T2016')
surveys2 = pd.merge(entry_climate2, exit_climate2, on='user_id', how='inner')
# different survey choices in Marketing
selections_m = ['Practical knowledge',
'Verified certificate',
'Something interesting',
'More about MOOCs',
'Substantial understanding',
'Improve English',
'Others']
# Marketing's 'likely to...' columns; _y suffixes come from the survey merge.
matrix_m = {'Q1_1': [], 'Q1_2_y': [], 'Q1_3_y': [], 'Q1_4': [], 'Q1_5': [], 'Q1_6': [], 'Q1_7': [], 'Q1_8_y': []}
colnames_m = ['Recommend', 'Advanced', 'Revist', 'Instructor', 'Involvement', 'UBC', 'Habbits', 'edX']
# Marketing's goal columns start at position 0 in its merged survey frame.
likely_marketing = compute_heatmap(surveys_marketing, selections_m, matrix_m, 0, colnames_m )
# Climate goal options (same for both runs).
selections = ['Relevant understanding',
'Something interesting',
'Verified certificate',
'Pursue further',
'Learn with friends',
'Impress people',
'More about MOOCs',
'Substantial understanding',
'Practical knowledge',
'Improve English',
'Others']
matrix = {'Q2_2_1': [], 'Q2_2_2': [], 'Q2_2_3': [], 'Q2_2_4': [], 'Q2_2_5': []}
col_names = ['Recommend', 'Advanced', 'Revisit', 'Involvement', 'Habbits']
likely = compute_heatmap(surveys, selections, matrix, 10, col_names)
# Fresh empty lists: compute_heatmap fills the matrix dict in place.
matrix = {'Q2_2_1': [], 'Q2_2_2': [], 'Q2_2_3': [], 'Q2_2_4': [], 'Q2_2_5': []}
likely2 = compute_heatmap(surveys2, selections, matrix, 10, col_names)
# Align the Climate runs on Marketing's goal set, then compare 'Recommend'.
recommend = pd.concat([likely.loc[likely_marketing.index].Recommend, likely2.loc[likely_marketing.index].Recommend,
                       likely_marketing.Recommend], axis=1)
recommend.columns = ['Climate2015', 'Climate2016', 'Marketing2015']
# recommend
heatmap(recommend.T, title='Percent recommendation grouped by goals', width=600)