Purpose: Analyze the feedback that are embeded at the end of the tutorials in Galaxy Training Material

In [1]:

import matplotlib.pyplot as plt
import pandas as pd

from pprint import pprint
import datetime
import emoji

In [2]:

url = 'https://docs.google.com/spreadsheets/d/1NfZhi5Jav7kl9zFCkeb7rIC2F8xW1isruv1TeO4WpNI/export?format=tsv'

In [3]:

str(datetime.datetime.now())

Out[3]:

'2020-07-08 09:01:07.895695'

Load the feedback¶

In [4]:

df = (pd.read_csv(url, sep='\t')
    # remove last column
      .drop(['Make feedback confidential?'], axis=1)
    # rename column
      .rename(columns = {'Timestamp': 'timestamp',
                         'How much did you like this tutorial?': 'note',
                         'What did you like?': 'pro',
                         'What could be improved?': 'con',
                         'Tutorial': 'tutorial_topic'}))
# extract topic from tutorial name
new = df.tutorial_topic.str[::-1].str.split('(', n = 1, expand = True)
df["tutorial"]= new[1].str[::-1].str[:-1]
df["topic"]= new[0].str[::-1].str[:-1]
df = (df.drop(columns =["tutorial_topic"])
    # remove rows with NaN on note, pro and con
      .dropna(subset=['note', 'pro', 'con'], how='all')
    # replace NaN in note by 0
      .fillna(value={'note': 0}))
# format note to integer
df.note = df.note.astype(int)
# format pro and con to string
df.pro = df.pro.astype(str)
df.con = df.con.astype(str)
# format timestamp to remove hour and use datetime
df.timestamp = pd.to_datetime(
    df.timestamp.str.split(' ', n = 1, expand = True)[0],
    dayfirst=True)

In [5]:

# change topic for some tutorials
df.loc[df.tutorial == 'Formation of the Super-Structures on the Inactive X', 'topic'] = 'Epigenetics'
df.loc[df.tutorial == 'Identification of the binding sites of the Estrogen receptor', 'topic'] = 'Epigenetics'
df.loc[df.tutorial == 'Identification of the binding sites of the T-cell acute lymphocytic leukemia protein 1 (TAL1)', 'topic'] = 'Epigenetics'
df.loc[df.tutorial == 'RAD-Seq Reference-based data analysis', 'topic'] = 'Ecology'
df.loc[df.tutorial == 'RAD-Seq de-novo data analysis', 'topic'] = 'Ecology'
df.loc[df.tutorial == 'RAD-Seq to construct genetic maps', 'topic'] = 'Ecology'

Aggregate the feedbacks and notes¶

In [6]:

def get_notes(df, name):
    '''Aggregage the notes
    
    :df: dataframe with all feedbacks
    
    :return: dataframe object with aggregated notes
    '''
    return (df.note
        .value_counts(sort=False)
        .to_frame()
        .rename(columns= {'note': name}))

In [7]:

notes = get_notes(df, 'All topics')

In [8]:

def get_topic_df(grouped_by_topic, topic, notes):
    '''Extract the dataframe for a topic and plot note histogram
    
    :grouped_by_topic: groupby object grouping by topic
    :topic: topic to extract
    
    :return: dataframe object for the topic
    '''
    topic_df = (grouped_by_topic
        .get_group(topic)
        .drop('topic', 1))
    
    notes = pd.concat([notes, get_notes(topic_df, topic)], axis=1)
    return topic_df, notes


def extract_tutorial_feedbacks(topic_df, topic_name):
    '''Extract pro/con per tutorial for a topic and 
    write them in a file
    
    :topic_df: dataframe object for the topic
    :topic_name: name for the topic, name for the file
    '''
    grouped_by_tuto = topic_df.groupby(by="tutorial")
    with open('../results/%s.md' % topic_name, 'w') as f:
        for tuto, group in grouped_by_tuto:
            # get groups
            tuto_df = grouped_by_tuto.get_group(tuto)
            pros = []
            cons = []
            # get pros/cons
            for index, row in tuto_df.iterrows():
                if row['pro'] != 'nan':
                    pros.append("%s (*%s*)" % (row['pro'], row['timestamp']))
                if row['con'] != 'nan':
                    cons.append("%s (*%s*)" % (row['con'], row['timestamp']))
            # write in report file
            f.write("- **%s**\n" % tuto)
            if len(pros) > 0:
                f.write("  - Pro:\n    - ")
                f.write("\n    - ".join(pros))
            if len(cons) > 0:
                f.write("\n  - Con:\n    - ")
                f.write("\n    - ".join(cons))
            f.write("\n\n")

In [9]:

grouped_by_topic = df.groupby(by="topic")
for topic in grouped_by_topic.groups:
    print(topic)
    topic_df, notes = get_topic_df(grouped_by_topic, topic, notes)
    topic_name = topic.lower().replace(' ', '-')
    extract_tutorial_feedbacks(topic_df, topic_name)

Assembly
Computational chemistry
Contributing to the Galaxy Training Material
Data Manipulation
Development in Galaxy
Ecology
Epigenetics
Galaxy Server administration
Genome Annotation
Imaging
Introduction to Galaxy Analyses
Metabolomics
Metagenomics
Proteomics
Sequence analysis
Statistics and machine learning
Teaching and Hosting Galaxy training
Transcriptomics
User Interface and Features
Variant Analysis
Visualisation

Details (pros/cons) for each tutorials are available: https://github.com/bebatut/galaxy-training-material-stats/tree/master/results

General stats about feedback¶

Feedback number:

In [10]:

# number of rows
len(df)

Out[10]:

Feedback number over time

In [11]:

months = df.timestamp.dt.to_period("M")
nb_per_months = (df
                 .groupby(months)
                 .count()
                 .timestamp)
nb_per_months

Out[11]:

timestamp
2018-09     52
2018-10     63
2018-11     39
2018-12     28
2019-01     37
2019-02     41
2019-03     37
2019-04     61
2019-05     42
2019-06     43
2019-07     62
2019-08     62
2019-09     74
2019-10    111
2019-11     51
2019-12     46
2020-01     52
2020-02     62
2020-03     70
2020-04     69
2020-05     58
2020-06     76
2020-07     12
Freq: M, Name: timestamp, dtype: int64

In [12]:

plt.figure()
(nb_per_months
    .cumsum()
    .plot())
plt.xlabel('Months')
plt.ylabel('Cumulative number of feedback')
plt.show()

Feedback number per topics

In [13]:

(grouped_by_topic
     .count()
     .sort_values('timestamp', ascending=False)
     .timestamp)

Out[13]:

topic
Introduction to Galaxy Analyses                 549
Transcriptomics                                 198
Sequence analysis                               124
Metagenomics                                     67
Galaxy Server administration                     52
Epigenetics                                      43
Variant Analysis                                 37
Statistics and machine learning                  31
Assembly                                         30
Genome Annotation                                24
Proteomics                                       23
User Interface and Features                      15
Contributing to the Galaxy Training Material     14
Data Manipulation                                11
Imaging                                           6
Computational chemistry                           6
Metabolomics                                      4
Teaching and Hosting Galaxy training              3
Ecology                                           1
Development in Galaxy                             1
Visualisation                                     1
Name: timestamp, dtype: int64

Notes¶

In [15]:

def plot_note_histogram(s, title):
    plt.figure()
    s.plot(kind='barh', color='k', ylim=(0,5), xlim=(0,1), title=title)
    plt.xlabel('Proportion of feedback')
    plt.show()

In [16]:

notes = (notes
         .fillna(0.0)
         .astype(int)
         .rename(index = {0: 'No value'}))
# 1: emoji.emojize(':-1:', use_aliases=True)
# 5: emoji.emojize(':heart:', use_aliases=True)
notes

Out[16]:

	All topics	Assembly	Computational chemistry	Contributing to the Galaxy Training Material	Data Manipulation	Development in Galaxy	Ecology	Epigenetics	Galaxy Server administration	Genome Annotation	...	Metabolomics	Metagenomics	Proteomics	Sequence analysis	Statistics and machine learning	Teaching and Hosting Galaxy training	Transcriptomics	User Interface and Features	Variant Analysis	Visualisation
No value	9	1	0	0	0	0	0	1	1	2	...	0	0	0	0	0	0	3	0	0	0
1	51	1	0	2	1	0	0	2	1	5	...	0	2	1	6	0	0	6	0	8	0
2	23	2	0	1	1	0	0	3	0	2	...	0	1	0	3	0	0	0	1	2	0
3	67	2	1	1	0	0	0	2	3	1	...	0	7	0	10	2	0	11	2	2	0
4	246	9	1	0	8	0	0	12	7	5	...	0	16	5	35	8	1	40	3	4	0
5	852	15	4	10	1	1	1	23	40	9	...	4	41	17	70	21	2	138	9	21	1

6 rows × 22 columns

In [17]:

notes_prop = (notes/notes.sum().round(2))
for col in notes_prop.columns:
    plot_note_histogram(notes_prop[col], col)

In [18]:

notes_prop

Out[18]:

	All topics	Assembly	Computational chemistry	Contributing to the Galaxy Training Material	Data Manipulation	Development in Galaxy	Ecology	Epigenetics	Galaxy Server administration	Genome Annotation	...	Metabolomics	Metagenomics	Proteomics	Sequence analysis	Statistics and machine learning	Teaching and Hosting Galaxy training	Transcriptomics	User Interface and Features	Variant Analysis	Visualisation
No value	0.007212	0.033333	0.000000	0.000000	0.000000	0.0	0.0	0.023256	0.019231	0.083333	...	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.015152	0.000000	0.000000	0.0
1	0.040865	0.033333	0.000000	0.142857	0.090909	0.0	0.0	0.046512	0.019231	0.208333	...	0.0	0.029851	0.043478	0.048387	0.000000	0.000000	0.030303	0.000000	0.216216	0.0
2	0.018429	0.066667	0.000000	0.071429	0.090909	0.0	0.0	0.069767	0.000000	0.083333	...	0.0	0.014925	0.000000	0.024194	0.000000	0.000000	0.000000	0.066667	0.054054	0.0
3	0.053686	0.066667	0.166667	0.071429	0.000000	0.0	0.0	0.046512	0.057692	0.041667	...	0.0	0.104478	0.000000	0.080645	0.064516	0.000000	0.055556	0.133333	0.054054	0.0
4	0.197115	0.300000	0.166667	0.000000	0.727273	0.0	0.0	0.279070	0.134615	0.208333	...	0.0	0.238806	0.217391	0.282258	0.258065	0.333333	0.202020	0.200000	0.108108	0.0
5	0.682692	0.500000	0.666667	0.714286	0.090909	1.0	1.0	0.534884	0.769231	0.375000	...	1.0	0.611940	0.739130	0.564516	0.677419	0.666667	0.696970	0.600000	0.567568	1.0

6 rows × 22 columns