Purpose: Analyze the feedback that is embedded at the end of the tutorials in the Galaxy Training Material
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint
import datetime
import emoji
# Google Sheets export (TSV format) that is read into the feedback dataframe
url = 'https://docs.google.com/spreadsheets/d/1NfZhi5Jav7kl9zFCkeb7rIC2F8xW1isruv1TeO4WpNI/export?format=tsv'
# current date and time, as a string
str(datetime.datetime.now())
'2020-07-08 09:01:07.895695'
# Load the feedback sheet, then tidy its columns one step per statement
df = pd.read_csv(url, sep='\t')
# remove last column
df = df.drop(columns=['Make feedback confidential?'])
# rename column
df = df.rename(columns={
    'Timestamp': 'timestamp',
    'How much did you like this tutorial?': 'note',
    'What did you like?': 'pro',
    'What could be improved?': 'con',
    'Tutorial': 'tutorial_topic',
})
# extract topic from tutorial name: the value is cut at its last '(',
# located as the first '(' of the reversed string
new = df.tutorial_topic.str[::-1].str.split('(', n=1, expand=True)
df = df.assign(
    # text before the last '(', without its final character
    tutorial=new[1].str[::-1].str[:-1],
    # text after the last '(', without its final character
    topic=new[0].str[::-1].str[:-1],
)
df = df.drop(columns=["tutorial_topic"])
# remove rows with NaN on note, pro and con
df = df.dropna(subset=['note', 'pro', 'con'], how='all')
# replace NaN in note by 0
df = df.fillna(value={'note': 0})
# format note to integer, pro and con to string
df = df.astype({'note': int, 'pro': str, 'con': str})
# format timestamp to remove hour (text after the first space) and use datetime
df['timestamp'] = pd.to_datetime(
    df['timestamp'].str.split(' ', n=1).str[0],
    dayfirst=True)
# change topic for some tutorials: one mask per target topic
df.loc[df.tutorial.isin([
    'Formation of the Super-Structures on the Inactive X',
    'Identification of the binding sites of the Estrogen receptor',
    'Identification of the binding sites of the T-cell acute lymphocytic leukemia protein 1 (TAL1)',
]), 'topic'] = 'Epigenetics'
df.loc[df.tutorial.isin([
    'RAD-Seq Reference-based data analysis',
    'RAD-Seq de-novo data analysis',
    'RAD-Seq to construct genetic maps',
]), 'topic'] = 'Ecology'
def get_notes(df, name):
    '''Aggregate the notes

    :df: dataframe with all feedbacks (only its `note` column is read)
    :name: name given to the column of counts in the returned dataframe
    :return: dataframe object with aggregated notes: one row per distinct
        note and a single column, called `name`, with the number of
        feedbacks having that note
    '''
    return (df.note
        .value_counts(sort=False)
        # name the column while building the frame instead of renaming a
        # 'note' column afterwards: the counts are not called 'note' in
        # every pandas version, which left the column unrenamed
        .to_frame(name=name))
# note counts over all feedback, in a column named 'All topics'
notes = get_notes(df, 'All topics')
def get_topic_df(grouped_by_topic, topic, notes):
    '''Extract the dataframe for a topic and add its notes to the summary

    :grouped_by_topic: groupby object grouping by topic
    :topic: topic to extract
    :notes: dataframe of aggregated notes to which the topic is added
    :return: tuple with the dataframe object for the topic (without its
        `topic` column) and a new notes dataframe holding one more column,
        named after the topic
    '''
    topic_df = (grouped_by_topic
        .get_group(topic)
        # keyword form: passing the axis positionally, as in
        # `.drop('topic', 1)`, is rejected by recent pandas versions
        .drop(columns='topic'))
    notes = pd.concat([notes, get_notes(topic_df, topic)], axis=1)
    return topic_df, notes
def _is_filled(value):
    '''Tell whether a pro/con cell holds an actual answer'''
    # an empty answer is either a real missing value or the string 'nan'
    # produced when a column with missing values is cast to str
    return not (pd.isna(value) or value == 'nan')


def extract_tutorial_feedbacks(topic_df, topic_name, out_dir='../results'):
    '''Extract pro/con per tutorial for a topic and
    write them in a file

    :topic_df: dataframe object for the topic; its `tutorial`, `pro`,
        `con` and `timestamp` columns are read
    :topic_name: name for the topic, name for the file (`.md` is appended)
    :out_dir: directory in which the file is written (`../results` by
        default); it is not created here and an existing file is overwritten
    '''
    # explicit UTF-8 so that writing free-text answers does not depend on
    # the default encoding of the platform
    with open('%s/%s.md' % (out_dir, topic_name), 'w', encoding='utf-8') as f:
        # iterating over the groupby already yields the rows of each tutorial
        for tuto, tuto_df in topic_df.groupby(by="tutorial"):
            pros = []
            cons = []
            # get pros/cons
            for _, row in tuto_df.iterrows():
                if _is_filled(row['pro']):
                    pros.append("%s (*%s*)" % (row['pro'], row['timestamp']))
                if _is_filled(row['con']):
                    cons.append("%s (*%s*)" % (row['con'], row['timestamp']))
            # write in report file
            f.write("- **%s**\n" % tuto)
            if pros:
                f.write(" - Pro:\n - ")
                f.write("\n - ".join(pros))
            if cons:
                f.write("\n - Con:\n - ")
                f.write("\n - ".join(cons))
            f.write("\n\n")
# one group of feedback rows per topic
grouped_by_topic = df.groupby("topic")
for topic in grouped_by_topic.groups:
    print(topic)
    # file name for the topic: lower case, every space turned into a dash
    topic_name = '-'.join(topic.lower().split(' '))
    # rows of the topic, plus the notes table extended with the topic column
    topic_df, notes = get_topic_df(grouped_by_topic, topic, notes)
    extract_tutorial_feedbacks(topic_df, topic_name)
Assembly Computational chemistry Contributing to the Galaxy Training Material Data Manipulation Development in Galaxy Ecology Epigenetics Galaxy Server administration Genome Annotation Imaging Introduction to Galaxy Analyses Metabolomics Metagenomics Proteomics Sequence analysis Statistics and machine learning Teaching and Hosting Galaxy training Transcriptomics User Interface and Features Variant Analysis Visualisation
Details (pros/cons) for each tutorial are available at: https://github.com/bebatut/galaxy-training-material-stats/tree/master/results
Feedback number:
# number of rows, i.e. first entry of the dataframe shape
df.shape[0]
1248
Feedback number over time
# month (as a period) of every feedback
months = df.timestamp.dt.to_period("M")
# number of feedback with a timestamp, for each month
nb_per_months = df.groupby(months)['timestamp'].count()
nb_per_months
timestamp 2018-09 52 2018-10 63 2018-11 39 2018-12 28 2019-01 37 2019-02 41 2019-03 37 2019-04 61 2019-05 42 2019-06 43 2019-07 62 2019-08 62 2019-09 74 2019-10 111 2019-11 51 2019-12 46 2020-01 52 2020-02 62 2020-03 70 2020-04 69 2020-05 58 2020-06 76 2020-07 12 Freq: M, Name: timestamp, dtype: int64
# running total of the monthly counts, drawn on a new figure
plt.figure()
nb_per_months.cumsum().plot()
plt.gca().set(xlabel='Months', ylabel='Cumulative number of feedback')
plt.show()
Feedback number per topic
# number of feedback with a timestamp per topic, largest first
(grouped_by_topic
    .count()
    .sort_values(by='timestamp', ascending=False)
    ['timestamp'])
topic Introduction to Galaxy Analyses 549 Transcriptomics 198 Sequence analysis 124 Metagenomics 67 Galaxy Server administration 52 Epigenetics 43 Variant Analysis 37 Statistics and machine learning 31 Assembly 30 Genome Annotation 24 Proteomics 23 User Interface and Features 15 Contributing to the Galaxy Training Material 14 Data Manipulation 11 Imaging 6 Computational chemistry 6 Metabolomics 4 Teaching and Hosting Galaxy training 3 Ecology 1 Development in Galaxy 1 Visualisation 1 Name: timestamp, dtype: int64
Top 10 tutorials by number of feedbacks
# number of feedback with a timestamp per tutorial, 10 largest counts
(df
    .groupby("tutorial")
    .count()
    .sort_values(by='timestamp', ascending=False)
    ['timestamp']
    .iloc[:10])
tutorial A short introduction to Galaxy 306 Galaxy 101 103 Quality Control 93 Reference-based RNA-Seq data analysis 65 From peaks to genes 64 Visualization of RNA-Seq results with Volcano Plot 31 Mapping 31 RNA-Seq reads to counts 25 NGS data logistics 24 16S Microbial Analysis with mothur (extended) 24 Name: timestamp, dtype: int64
def plot_note_histogram(s, title):
    '''Draw a series as black horizontal bars on a new figure and show it

    :s: series to plot, one bar per entry
    :title: title of the plot
    '''
    plt.figure()
    axes = s.plot(title=title, kind='barh', color='k', xlim=(0, 1), ylim=(0, 5))
    axes.set_xlabel('Proportion of feedback')
    plt.show()
# a topic without any feedback for a given note has NaN there: count it as 0
notes = notes.fillna(0.0)
notes = notes.astype(int)
# note 0 is the value given to feedback submitted without a note
notes = notes.rename(index={0: 'No value'})
# 1: emoji.emojize(':-1:', use_aliases=True)
# 5: emoji.emojize(':heart:', use_aliases=True)
notes
All topics | Assembly | Computational chemistry | Contributing to the Galaxy Training Material | Data Manipulation | Development in Galaxy | Ecology | Epigenetics | Galaxy Server administration | Genome Annotation | ... | Metabolomics | Metagenomics | Proteomics | Sequence analysis | Statistics and machine learning | Teaching and Hosting Galaxy training | Transcriptomics | User Interface and Features | Variant Analysis | Visualisation | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
No value | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 |
1 | 51 | 1 | 0 | 2 | 1 | 0 | 0 | 2 | 1 | 5 | ... | 0 | 2 | 1 | 6 | 0 | 0 | 6 | 0 | 8 | 0 |
2 | 23 | 2 | 0 | 1 | 1 | 0 | 0 | 3 | 0 | 2 | ... | 0 | 1 | 0 | 3 | 0 | 0 | 0 | 1 | 2 | 0 |
3 | 67 | 2 | 1 | 1 | 0 | 0 | 0 | 2 | 3 | 1 | ... | 0 | 7 | 0 | 10 | 2 | 0 | 11 | 2 | 2 | 0 |
4 | 246 | 9 | 1 | 0 | 8 | 0 | 0 | 12 | 7 | 5 | ... | 0 | 16 | 5 | 35 | 8 | 1 | 40 | 3 | 4 | 0 |
5 | 852 | 15 | 4 | 10 | 1 | 1 | 1 | 23 | 40 | 9 | ... | 4 | 41 | 17 | 70 | 21 | 2 | 138 | 9 | 21 | 1 |
6 rows × 22 columns
# proportion of each note within a column: counts divided by the column total.
# The former `.round(2)` was bound to `notes.sum()`, i.e. to integer totals,
# where it changes nothing; it is dropped so that the code no longer suggests
# rounded proportions (the values are unchanged).
notes_prop = notes / notes.sum()
# one histogram per column ('All topics' and every topic)
for col in notes_prop.columns:
    plot_note_histogram(notes_prop[col], col)
notes_prop
All topics | Assembly | Computational chemistry | Contributing to the Galaxy Training Material | Data Manipulation | Development in Galaxy | Ecology | Epigenetics | Galaxy Server administration | Genome Annotation | ... | Metabolomics | Metagenomics | Proteomics | Sequence analysis | Statistics and machine learning | Teaching and Hosting Galaxy training | Transcriptomics | User Interface and Features | Variant Analysis | Visualisation | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
No value | 0.007212 | 0.033333 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.023256 | 0.019231 | 0.083333 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.015152 | 0.000000 | 0.000000 | 0.0 |
1 | 0.040865 | 0.033333 | 0.000000 | 0.142857 | 0.090909 | 0.0 | 0.0 | 0.046512 | 0.019231 | 0.208333 | ... | 0.0 | 0.029851 | 0.043478 | 0.048387 | 0.000000 | 0.000000 | 0.030303 | 0.000000 | 0.216216 | 0.0 |
2 | 0.018429 | 0.066667 | 0.000000 | 0.071429 | 0.090909 | 0.0 | 0.0 | 0.069767 | 0.000000 | 0.083333 | ... | 0.0 | 0.014925 | 0.000000 | 0.024194 | 0.000000 | 0.000000 | 0.000000 | 0.066667 | 0.054054 | 0.0 |
3 | 0.053686 | 0.066667 | 0.166667 | 0.071429 | 0.000000 | 0.0 | 0.0 | 0.046512 | 0.057692 | 0.041667 | ... | 0.0 | 0.104478 | 0.000000 | 0.080645 | 0.064516 | 0.000000 | 0.055556 | 0.133333 | 0.054054 | 0.0 |
4 | 0.197115 | 0.300000 | 0.166667 | 0.000000 | 0.727273 | 0.0 | 0.0 | 0.279070 | 0.134615 | 0.208333 | ... | 0.0 | 0.238806 | 0.217391 | 0.282258 | 0.258065 | 0.333333 | 0.202020 | 0.200000 | 0.108108 | 0.0 |
5 | 0.682692 | 0.500000 | 0.666667 | 0.714286 | 0.090909 | 1.0 | 1.0 | 0.534884 | 0.769231 | 0.375000 | ... | 1.0 | 0.611940 | 0.739130 | 0.564516 | 0.677419 | 0.666667 | 0.696970 | 0.600000 | 0.567568 | 1.0 |
6 rows × 22 columns