In [1]:
## This code toggles all code cells displaying on or off, so we can see output and markdown cells only.
## All code cells display can be toggled on/off.

from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')
Out[1]:
The raw code for this IPython notebook is hidden for easier reading. To toggle on/off the raw code, click here.
In [2]:
## Import all the things

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pycountry

%matplotlib inline 

The Carpentries: Programmatic Assessment Report

2017 Year End and 2018 Q1

Authors: Maneesha Sane, Erin Becker

Part 1: Workshops over time

Workshops by Carpentry by Year

We began systematically recording data for our workshops in 2012. We can use this data to investigate how have The Carpentries have grown over the years in terms of number of workshops per year and how Data Carpentry has grown in comparison with Software Carpentry. For these analyses, we will consider only Software Carpentry and Data Carpentry workshops held between January 1st, 2012 and March 31st, 2018. Instructor Training events are analyzed in a later portion of this report.

Data is collected by a team of workshop administrators. In Africa, Australia, Canada, New Zealand, and the United Kingdom, workshop adminstrators are affiliated with our member institutions and provide in-kind staff time. A full-time Carpentries staff member is the Workshop Administrator for the rest the world.

For each of our workshops, we collected the following data:

variable definition
slug Unique identifier for each workshop. Takes the form YYYY-MM-DD-sitename.
start Start date of the workshop. Takes the form YYYY-MM-DD.
attendance Number of learners at the workshop.
host_name Institution that hosted the workshop.
country The two-letter country code for the country in which the workshop was held.
workshop_type Whether this is a Software Carpentry (SWC) or Data Carpentry (DC) workshop.
In [3]:
# Load csv into df
workshops = pd.read_csv("data_files/workshops.csv")


# Translate country codes to country names
# This does not work for Namibia (country code NA)
def get_country_name(alpha_code):
    try:
        return pycountry.countries.get(alpha_2=alpha_code).name
    except KeyError:
        return "online"

workshops['country'] = workshops['country'].apply(get_country_name)


# Clean up the tag names
workshops.loc[workshops['tag_name'].str.contains("SWC"), "workshop_type"] = "SWC"
workshops.loc[workshops['tag_name'].str.contains("DC"), "workshop_type"] = "DC"
workshops.loc[workshops['tag_name'].str.contains("LC"), "workshop_type"] = "LC"
workshops.loc[workshops['tag_name'].str.contains("TTT"), "workshop_type"] = "TTT"

workshops = workshops.drop('tag_name', axis=1)

# Clean up attendance value - this is inconsistently stored as NaN or 0
# All zero values should be NaN
workshops['attendance'] =  workshops['attendance'].replace(0.0, np.nan)

# Remove online events like maintainer onboarding 
workshops = workshops.drop(workshops[workshops.country == "online"].index)

# Date data type
workshops['start_date'] = pd.to_datetime(workshops['start_date'])

# Remove instructor training events; these will be analyzed separately.

# Limit to non TTT workshops
workshops_nonTTT = workshops[workshops['workshop_type'] != "TTT"]

workshops_nonTTT
# Note: to preview the data table, remove the semi-colon (;) from the end of the line above
Out[3]:
slug start_date attendance host_name country workshop_type
0 2012-01-18-stsci 2012-01-18 14.0 Space Telescope Science Institute United States SWC
1 2012-02-20-itcp 2012-02-20 50.0 International Centre for Theoretical Physics Italy SWC
2 2012-02-23-toronto 2012-02-23 28.0 University of Toronto Canada SWC
3 2012-03-07-indiana 2012-03-07 39.0 Indiana University United States SWC
4 2012-03-26-mbari 2012-03-26 38.0 Monterey Bay Aquarium Research Institute United States SWC
5 2012-03-28-nersc 2012-03-28 35.0 NERSC United States SWC
6 2012-04-02-chicago 2012-04-02 35.0 University of Chicago United States SWC
7 2012-04-14-utahstate 2012-04-14 32.0 Utah State University United States SWC
8 2012-04-30-ucl 2012-04-30 44.0 University College London United Kingdom SWC
9 2012-05-07-michiganstate 2012-05-07 55.0 Michigan State University United States SWC
10 2012-05-14-newcastle 2012-05-14 41.0 University of Newcastle United Kingdom SWC
11 2012-05-16-alberta 2012-05-16 38.0 University of Alberta Canada SWC
12 2012-05-22-ubc 2012-05-22 39.0 University of British Columbia Canada SWC
13 2012-06-18-jhu 2012-06-18 20.0 Johns Hopkins University United States SWC
14 2012-06-28-inria 2012-06-28 28.0 INRIA (Paris) France SWC
15 2012-07-09-mit 2012-07-09 36.0 Massachusetts Institute of Technology United States SWC
16 2012-07-10-ral 2012-07-10 24.0 Science & Technology Facilities Council United Kingdom SWC
17 2012-07-12-waterloo 2012-07-12 40.0 University of Waterloo Canada SWC
18 2012-07-16-halifax 2012-07-16 40.0 Saint Mary's University Canada SWC
19 2012-07-19-utsc 2012-07-19 39.0 University of Toronto (Scarborough) Canada SWC
21 2012-09-13-dafx 2012-09-13 33.0 DAFX Conference United Kingdom SWC
22 2012-09-17-oslo 2012-09-17 34.0 University of Oslo Norway SWC
23 2012-09-28-columbia 2012-09-28 39.0 Columbia University United States SWC
24 2012-10-08-purdue 2012-10-08 42.0 Purdue University United States SWC
26 2012-10-17-lbl 2012-10-17 40.0 Lawrence Berkeley National Laboratory United States SWC
27 2012-10-18-ubc 2012-10-18 62.0 University of British Columbia Canada SWC
28 2012-10-20-ucb 2012-10-20 29.0 University of California Berkeley United States SWC
29 2012-10-22-newcastle 2012-10-22 31.0 University of Newcastle United Kingdom SWC
30 2012-10-23-caltech 2012-10-23 34.0 California Institute of Technology United States SWC
31 2012-10-25-gmu 2012-10-25 7.0 George Mason University United States SWC
... ... ... ... ... ... ...
1429 2018-03-08-geneva 2018-03-08 17.0 University of Geneva Switzerland SWC
1430 2018-03-08-gwu 2018-03-08 7.0 George Washington University United States SWC
1431 2018-03-08-manchester-swc 2018-03-08 20.0 University of Manchester United Kingdom SWC
1432 2018-03-10-uscf-R 2018-03-10 25.0 University of California San Francisco United States SWC
1433 2018-03-10-uscf-python 2018-03-10 25.0 University of California San Francisco United States SWC
1434 2018-03-12-mun 2018-03-12 15.0 Memorial University Canada SWC
1435 2018-03-12-stockholm 2018-03-12 13.0 Lund University Sweden SWC
1436 2018-03-12-tamu 2018-03-12 22.0 Texas A&M University United States SWC
1438 2018-03-13-census 2018-03-13 19.0 National Institute of Standards and Technology United States SWC
1440 2018-03-14-SFU-python 2018-03-14 NaN Simon Fraser University Canada SWC
1441 2018-03-14-unt-swc 2018-03-14 14.0 University of North Texas United States SWC
1442 2018-03-15-Boise 2018-03-15 48.0 Boise State University United States SWC
1443 2018-03-15-lund 2018-03-15 16.0 Lund University Sweden SWC
1444 2018-03-19-Imperial 2018-03-19 NaN Imperial College London United Kingdom SWC
1445 2018-03-19-curtin 2018-03-19 NaN Curtin University Australia SWC
1446 2018-03-19-ou 2018-03-19 NaN University of Oklahoma United States SWC
1448 2018-03-20-UFDataSymposium 2018-03-20 21.0 University of Florida United States DC
1449 2018-03-20-genentech 2018-03-20 37.0 Genentech United States DC
1450 2018-03-20-gpn 2018-03-20 NaN Great Plains Network United States SWC
1452 2018-03-22-csumb 2018-03-22 NaN California State University Monterey Bay United States SWC
1453 2018-03-22-nasa 2018-03-22 19.0 NASA (National Aeronautics and Space Administr... United States SWC
1455 2018-03-26-DC-HU 2018-03-26 NaN Hawassa University Ethiopia DC
1457 2018-03-26-ucriverside 2018-03-26 10.0 University of California, Riverside United States SWC
1458 2018-03-28-paris 2018-03-28 12.0 ELIXIR-UK France SWC
1459 2018-03-28-stanford 2018-03-28 21.0 Stanford University United States SWC
1460 2018-03-28-ucirvine 2018-03-28 22.0 University of California Irvine United States SWC
1461 2018-03-29-SWC-HU 2018-03-29 NaN Hawassa University Ethiopia SWC
1462 2018-03-29-standrews 2018-03-29 NaN University of St. Andrews United Kingdom SWC
1463 2018-03-29-stanford 2018-03-29 40.0 Stanford University United States SWC
1464 datacarpentry_2015-11-23_VLSCI 2015-11-23 18.0 VLSCI Australia DC

1322 rows × 6 columns

The Carpentries workshops by year.

The table and chart below show the total number of Data Carpentry and Software Carpentry workshops held each year, including a projection for 2018 based on actual 2017 Q1 data relative to 2017 full year data.

Table 1. The Carpentries workshops by year.

In [4]:
# All workshops 2012 through 2017

workshops_by_carpentry_year_through_2017 = workshops_nonTTT[workshops_nonTTT['start_date'].dt.year<=2017]
workshops_by_carpentry_year_through_2017 = workshops_by_carpentry_year_through_2017.groupby([workshops_by_carpentry_year_through_2017['start_date'].dt.year, 'workshop_type'])['slug'].count().unstack()
workshops_by_carpentry_year_through_2017['total'] = workshops_by_carpentry_year_through_2017.sum(axis=1)


# All workshops in 2017 full year and 2017 Q1
workshops_by_carpentry_year_2017 = workshops_nonTTT[workshops_nonTTT['start_date'].dt.year == 2017]
workshops_by_carpentry_year_2017Q1 = workshops_by_carpentry_year_2017[workshops_by_carpentry_year_2017['start_date'].dt.month <= 3]                                              
workshops_by_carpentry_year_2017Q1 = workshops_by_carpentry_year_2017Q1.groupby([workshops_by_carpentry_year_2017Q1['start_date'].dt.year, 'workshop_type'])['slug'].count().unstack()
workshops_by_carpentry_year_2017 = workshops_by_carpentry_year_2017.groupby([workshops_by_carpentry_year_2017['start_date'].dt.year, 'workshop_type'])['slug'].count().unstack()

# All workshops in 2018 Q1
workshops_by_carpentry_year_2018Q1 = workshops_nonTTT[workshops_nonTTT['start_date'].dt.year == 2018]
workshops_by_carpentry_year_2018Q1 = workshops_by_carpentry_year_2018Q1.groupby([workshops_by_carpentry_year_2018Q1['start_date'].dt.year, 'workshop_type'])['slug'].count().unstack()

# Proportion of 2017 workshops that took place in 2017 Q1
proportion = workshops_by_carpentry_year_2017Q1/workshops_by_carpentry_year_2017
proportion = workshops_by_carpentry_year_2017Q1/workshops_by_carpentry_year_2017
proportion = proportion.rename(index={2017:2018})

# Use proportion to calculate 2018 projections
workshops_by_carpentry_year_2018Q1_projected = workshops_by_carpentry_year_2018Q1 / proportion
workshops_by_carpentry_year_2018Q1_projected = workshops_by_carpentry_year_2018Q1_projected.astype(int)
workshops_by_carpentry_year_2018Q1_projected['total'] = workshops_by_carpentry_year_2018Q1_projected.sum(axis=1)


# Combine real data through 2017 and projected 2018 data in to one dataframe
workshops_by_carpentry_year_with_projections = pd.concat([workshops_by_carpentry_year_through_2017, workshops_by_carpentry_year_2018Q1_projected], axis=0)

# Fill na with zeros and make floats in to ints
workshops_by_carpentry_year_with_projections = workshops_by_carpentry_year_with_projections.fillna(0)
workshops_by_carpentry_year_with_projections = workshops_by_carpentry_year_with_projections.astype(int)

# Rename 2018 to projected
workshops_by_carpentry_year_with_projections.rename(index={2018: '2018 projected'}, inplace=True)



workshops_by_carpentry_year_with_projections.columns.names = ["Workshop Type"]
workshops_by_carpentry_year_with_projections.index.names = ["Year"]

workshops_by_carpentry_year_with_projections
Out[4]:
Workshop Type DC SWC total
Year
2012 0 38 38
2013 0 93 93
2014 2 137 139
2015 31 243 274
2016 72 272 344
2017 81 256 337
2018 projected 130 294 424

Figure 1. The Carpentries workshops by year.

In [5]:
fig = plt.figure(figsize=(12, 6)) # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes
width = .5
title = "Carpentries workshops count by year"
workshops_by_carpentry_year_with_projections.plot(y = ["DC", "SWC"], kind='bar', ax=ax, width=width, position=1, title=title)
ax.grid(True)

ax.set_xlabel("Year")


plt.show()

The number of Software Carpentry and Data Carpentry workshops appear to have remained roughly steady for the past several years, after a sharp jump from 2014 to 2015. The year 2015 was the first year in which The Carpentries had staff and a dedicated database to track workshop data, so some of this growth may instead reflect a growth in internal systems.

The data shown here may not account for unreported self-organized workshops. Although The Carpentries attempts to collect data on all workshops run under "The Carpentries" brand, sometimes institutions may run a workshop without reporting it back to The Carpentries staff. Thus, if there has been a shift from more centrally-organized to more self-organized workshops, this may cause an underestimate of our workshop growth. We are working to improve data collection to have more accurate reflections of our scope of work.

This may also reflect a shift to sites running a variation of Carpentries lessons, rather than official full Carpentries workshops. While we've known anecdotally that this happens often, we have not systematically collected any data on when or how Carpentries lessons are used in other contexts.

Workshops by Carpentry by Year and Country

The table and chart below show a count of the number of Carpentry workshops (SWC or DC) that took place in each country each year from 2012 through 2017. This data can help us to see geographic trends in our community and in particular, in which geographic regions The Carpentry community is growing.

Projections for 2018 are not included here as many countries do not have enough of a history for us to base projections on. Projections for countries with a longer history or more activity are shown below.

In many countries, we have seen a steady increase in the number of workshops run. In several countries though (Australia, New Zealand, Canada), we have seen a decline in the number of workshops run. This may be due to the reasons cited above, including unreported self-organized workshops or an increase in variations on Carpentries workshops. In either case this is a motivation for The Carpentries to improve data collection and methods to understand our scope of work beyond our centrally coordinated workshops.

Decreases in the number of workshops run in some countries may also be accounted for by shifts in our instructor community. Carpentries activity is sustained by our instructors. Some instructors may move to new geographies or to new career phases. Without a larger community in place, these geographies may not have had the capacity to sustain their activity. This is a motivation for The Carpentries to build strong and sustainable communities, with systems that account for individual turnover.

Table 2. The Carpentries workshops by year and country.

The table below is a summary of all Carpentries workshops with the year and country they were held in.

In [6]:
workshops_by_country_year = workshops_nonTTT.groupby(['country', workshops_nonTTT['start_date'].dt.year])['slug'].count().unstack()
workshops_by_country_year = workshops_by_country_year.fillna(0)
workshops_by_country_year.rename(columns={2018: '2018Q1'}, inplace=True)
workshops_by_country_year['total'] = workshops_by_country_year.sum(axis=1)
workshops_by_country_year = workshops_by_country_year.astype(int)
workshops_by_country_year
Out[6]:
start_date 2012 2013 2014 2015 2016 2017 2018Q1 total
country
Australia 0 6 10 32 41 33 6 128
Belgium 0 0 0 1 0 1 0 2
Botswana 0 0 0 0 0 1 0 1
Brazil 0 0 6 5 5 0 0 16
Canada 8 11 22 25 41 29 4 140
China 0 1 0 0 0 0 0 1
Colombia 0 0 0 0 1 0 0 1
Cyprus 0 0 1 0 0 0 0 1
Denmark 0 0 1 0 2 3 1 7
Ethiopia 0 0 0 0 0 4 4 8
Finland 0 0 0 2 0 0 0 2
France 1 2 0 1 3 0 1 8
Gabon 0 0 0 0 0 1 0 1
Germany 0 3 3 4 4 9 2 25
Ghana 0 0 1 0 0 1 0 2
Greece 0 0 0 0 1 0 0 1
India 0 0 0 1 0 0 0 1
Indonesia 0 0 0 1 0 0 0 1
Ireland 0 0 0 0 0 0 1 1
Italy 1 0 1 0 2 0 1 5
Jordan 0 0 1 0 0 0 0 1
Kenya 0 0 0 1 1 0 0 2
Korea, Republic of 0 0 0 4 1 0 0 5
Lebanon 0 1 0 0 0 0 0 1
Mauritius 0 0 0 0 0 1 0 1
Mexico 0 0 0 0 1 1 1 3
Netherlands 0 1 0 3 2 2 0 8
New Zealand 0 1 0 7 14 9 2 33
Norway 1 1 1 4 5 1 1 14
Philippines 0 0 0 0 1 0 0 1
Poland 0 1 1 2 4 1 0 9
Puerto Rico 0 0 0 0 1 0 0 1
Saudi Arabia 0 1 0 0 0 0 0 1
Slovenia 0 0 0 1 0 0 0 1
South Africa 0 1 1 5 6 11 2 26
Spain 0 0 1 1 1 2 0 5
Sudan 0 0 0 0 0 0 1 1
Sweden 0 0 1 1 1 1 2 6
Switzerland 0 0 2 4 5 2 1 14
Thailand 0 0 0 1 0 1 0 2
United Kingdom 7 14 21 27 37 44 11 161
United States 20 49 65 140 164 179 56 673
Venezuela, Bolivarian Republic of 0 0 0 1 0 0 0 1
In [7]:
# Get all workshops for 2017
workshops_2017 = workshops_nonTTT[(workshops_nonTTT['start_date'].dt.year == 2017)]

# # Get all workshops for 2017 Q1
workshops_2017Q1 = workshops_nonTTT[(workshops_nonTTT['start_date'].dt.year == 2017) & (workshops_nonTTT['start_date'].dt.month <= 3)]

# # Get all workshops for 2018 Q1
workshops_2018Q1 = workshops_nonTTT[(workshops_nonTTT['start_date'].dt.year == 2018) & (workshops_nonTTT['start_date'].dt.month <= 3)]

# 2017 Q1 workshops by country
workshops_by_country_year_2017Q1 = workshops_2017Q1.groupby(['country', workshops_nonTTT['start_date'].dt.year])['slug'].count().unstack()
workshops_by_country_year_2017Q1.rename(columns={2017:'2017Q1'}, inplace=True)

# 2018 Q1 workshops by country
workshops_by_country_year_2018Q1 = workshops_2018Q1.groupby(['country', workshops_nonTTT['start_date'].dt.year])['slug'].count().unstack()
workshops_by_country_year_2018Q1.rename(columns={2018:'2018Q1'}, inplace=True)

# 2017 workshops by country
workshops_by_country_year_2017 = workshops_2017.groupby(['country', workshops_nonTTT['start_date'].dt.year])['slug'].count().unstack()

# Concatenated table, 2017 Q1 and 2018 Q1 workshops by country
workshops_by_country_year_1718Q1 = pd.concat([workshops_by_country_year_2017Q1, workshops_by_country_year_2018Q1, workshops_by_country_year_2017], axis=1)

# Add column to show what percent of 2017 workshops took place in 2017 Q1
workshops_by_country_year_1718Q1['prop2017Q1'] = workshops_by_country_year_1718Q1['2017Q1']/workshops_by_country_year_1718Q1[2017]

# Add column to show expected 2018 workshops, if rate is same as 2017
workshops_by_country_year_1718Q1['proj2018'] =  workshops_by_country_year_1718Q1['2018Q1']/workshops_by_country_year_1718Q1['prop2017Q1']

# Give df a better name
workshops_by_country_year_with_projections = workshops_by_country_year_1718Q1

# # Relying only on 2017 projections excludes countries that held a workshop in 2018 Q1 but not 2017 Q1
# # Adjust the 2018 projections to be the max of the 2018 projection or the 2018 Q1 actual
workshops_by_country_year_with_projections["adj_proj2018"] = workshops_by_country_year_with_projections[["proj2018", "2018Q1"]].max(axis=1)


# # Fill NA with 0 and cast floats as ints
workshops_by_country_year_with_projections = workshops_by_country_year_with_projections.fillna(0)
workshops_by_country_year_with_projections = workshops_by_country_year_with_projections.astype(int)


workshops_by_country_year_with_projections = workshops_by_country_year_with_projections.reset_index()
workshops_by_country_year_with_projections = workshops_by_country_year_with_projections.rename(columns={'index':'country'})

# Table used to calculate 2018 projections - uncomment next line to hide/show
# workshops_by_country_year_with_projections
In [8]:
# All countries that have held at least 10 workshops (lifetime)

workshops_by_country_year_many = workshops_by_country_year[workshops_by_country_year.total >= 10]
workshops_by_country_year_many = workshops_by_country_year_many.reset_index()
workshops_by_country_year_many = workshops_by_country_year_many.reset_index()

# Uncomment line below to show/hide table
# workshops_by_country_year_many

Table 3. All countries having hosted at least 10 workshops from 2012 through 2018 (projected)

In [9]:
# Join table of countries with at least 10 workshops with 2018 projections

workshops_by_country_year_top = pd.merge(workshops_by_country_year_many, workshops_by_country_year_with_projections, on='country', how='left')
workshops_by_country_year_top = workshops_by_country_year_top[['country', 2012, 2013, 2014, 2015, 2016, '2017_x', 'adj_proj2018']]
workshops_by_country_year_top = workshops_by_country_year_top.rename(columns={'2017_x':2017, 'adj_proj2018':'proj2018'})

workshops_by_country_year_top = workshops_by_country_year_top.fillna(0)
workshops_by_country_year_top['proj2018'] = workshops_by_country_year_top['proj2018'].astype(int)
workshops_by_country_year_top
Out[9]:
start_date country 2012 2013 2014 2015 2016 2017 proj2018
0 Australia 0 6 10 32 41 33 22
1 Brazil 0 0 6 5 5 0 0
2 Canada 8 11 22 25 41 29 11
3 Germany 0 3 3 4 4 9 9
4 New Zealand 0 1 0 7 14 9 6
5 Norway 1 1 1 4 5 1 1
6 South Africa 0 1 1 5 6 11 22
7 Switzerland 0 0 2 4 5 2 1
8 United Kingdom 7 14 21 27 37 44 53
9 United States 20 49 65 140 164 179 244

Figure 3. All countries having hosted at least 10 workshops from 2012 through 2018 (projected)

In [10]:
# Draw bar chart showing most active countries and workshops by year# Draw b 

workshops_by_country_year_top = workshops_by_country_year_top[::-1]
workshops_by_country_year_top = workshops_by_country_year_top.set_index('country')

fig = plt.figure(figsize=(12, 10)) # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes

title = "Carpentries workshops by country by year"
workshops_by_country_year_top.plot(y = list(workshops_by_country_year_top)[::-1], kind='barh', ax=ax, position=1, title=title)

ax.grid(True)

handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels[::-1], title='Year', loc='upper right')

plt.show()

workshops_by_country_year_top
Out[10]:
start_date 2012 2013 2014 2015 2016 2017 proj2018
country
United States 20 49 65 140 164 179 244
United Kingdom 7 14 21 27 37 44 53
Switzerland 0 0 2 4 5 2 1
South Africa 0 1 1 5 6 11 22
Norway 1 1 1 4 5 1 1
New Zealand 0 1 0 7 14 9 6
Germany 0 3 3 4 4 9 9
Canada 8 11 22 25 41 29 11
Brazil 0 0 6 5 5 0 0
Australia 0 6 10 32 41 33 22
In [11]:
### When did each country have its first workshop?

# Data goes back to 2012; 
# some countries may have been holding Carpentries workshops before this time.


w2 = workshops_by_country_year.copy()

# See https://stackoverflow.com/questions/31828240/first-non-null-value-per-row-from-a-list-of-pandas-columns
# and
# https://stackoverflow.com/questions/38467749/find-first-non-zero-value-in-each-row-of-pandas-dataframe

# Fill zero values with Nan to make them non valid values
res = w2[w2 != 0.0]

first_workshop = res.apply(pd.Series.first_valid_index, axis=1)
first_workshop = pd.DataFrame({'year': first_workshop.values.astype(str), 'country': first_workshop.index.values}, index=None)
first_workshop = first_workshop.sort_values('year').set_index('year')
# first_workshop
In [12]:
### How many countries had their first workshop each year?

first_workshop.groupby(level=0).count()
Out[12]:
country
year
2012 6
2013 9
2014 8
2015 9
2016 5
2017 4
2018Q1 2

Which countries had their first workshop in 2017?

Four countries, all in Africa, had their first workshop in 2017. This is representative of the significant growth we've seen in this part of the world. (Include information about new workshop administrators and task force in Africa)

In [13]:
first_workshop.loc['2017']
Out[13]:
country
year
2017 Botswana
2017 Mauritius
2017 Ethiopia
2017 Gabon
In [14]:
first_workshop
Out[14]:
country
year
2012 United Kingdom
2012 United States
2012 Canada
2012 Italy
2012 Norway
2012 France
2013 Australia
2013 New Zealand
2013 Poland
2013 Saudi Arabia
2013 Germany
2013 Lebanon
2013 South Africa
2013 China
2013 Netherlands
2014 Sweden
2014 Ghana
2014 Denmark
2014 Cyprus
2014 Switzerland
2014 Jordan
2014 Brazil
2014 Spain
2015 Slovenia
2015 Thailand
2015 Kenya
2015 Venezuela, Bolivarian Republic of
2015 Korea, Republic of
2015 Belgium
2015 Indonesia
2015 India
2015 Finland
2016 Philippines
2016 Puerto Rico
2016 Greece
2016 Mexico
2016 Colombia
2017 Botswana
2017 Mauritius
2017 Ethiopia
2017 Gabon
2018Q1 Sudan
2018Q1 Ireland

Attendance at workshops

Workshops missing attendance

After running a workshop, The Carpentries staff asks hosts or instructors to submit attendance data to us. In some cases, this is detailed account of the number registered and the number attended each day. In other cases, this is a best estimate by the host or instructors.

In some cases, the hosts or instructors do not report back on attendance data. From 2012 through 2018 Q1, 122 of 1323 (about 9%) workshops were missing attendance. While this is not a large percentage of total workshops, it is indication for The Carpentries to understand why attendance data may not have been shared. It could be because the hosts or instructors were uncomfortable sharing low numbers, because the hosts or instructors didn't have clear systems to collect and share attendance numbers, or other reaons. The Carpentries should better understand these reasons in order to address them.

For the analyses below, workshops missing attendance are excluded from the analyses. Because low attendance is a possible factor in not having reported attendance, replacing missing attendance data with means would not be an accurate reflection of our numbers.

In [15]:
# This is a full list of all workshops that did not report attendance 
# back to the Carpentries.  Reasons for this may vary and demonstrate a 
# need for Carpentries staff, workshop hosts, and instructors to better 
# collect and record this data.


# Not sure why this way gives a warning
# missing_attendance = workshops_nonTTT[pd.isnull(workshops['attendance'])]

# This way does not give warning
missing_attendance = workshops[workshops.attendance.isnull()]

# missing_attendance
# Uncomment the above line to see a list of workshops missing attendance
In [16]:
# Calculate projected attendance for 2018

# Total attendance for 2017
total2017 = workshops_2017['attendance'].sum(skipna=True)

# Total attendance for 2017 Q1
total2017Q1 = workshops_2017Q1['attendance'].sum(skipna=True)


# Total attendance for 2018 Q1
total2018Q1 = workshops_2018Q1['attendance'].sum(skipna=True)


proportion_attendance_2017Q1 = total2017Q1/total2017

projected_attendance_2018 = total2018Q1/proportion_attendance_2017Q1

Total attendance by year: Table view

This table and chart show the total number of learners at Carpentries workshops each year.
Based on the number of learners in the first quarter of 2017 relative to the full year, we anticipate a drop in the number of learners we will see in 2018.

In [17]:
attendance = workshops_nonTTT[['start_date', 'attendance']]   #df['date'].dt.year
attendance = attendance.assign(year=attendance['start_date'].dt.year)

# Doing the above like this gives a SettingWithCopyWarning error
# attendance['year'] = attendance['start_date'].dt.year
# See https://stackoverflow.com/questions/12555323/adding-new-column-to-existing-dataframe-in-python-pandas
total_attendance_by_year = attendance.groupby('year')['attendance'].sum()
total_attendance_by_year = total_attendance_by_year.to_frame()

total_attendance_by_year.loc['2018proj'] = [projected_attendance_2018]
total_attendance_by_year = total_attendance_by_year.astype(int)
total_attendance_by_year = total_attendance_by_year.drop([2018])
total_attendance_by_year
Out[17]:
attendance
year
2012 1378
2013 3212
2014 4742
2015 7755
2016 7110
2017 7217
2018proj 6289

Total attendance by year: Chart view

In [18]:
fig = plt.figure(figsize=(10, 8)) # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes

title = "Carpentries attendance by year"
total_attendance_by_year.plot(kind='bar', ax=ax, title=title, legend=False)

ax.grid(True)

plt.show()

Box plot showing frequency of class sizes

This box plot shows the range of our class sizes, and demonstrates a trend towards smaller classes in the past few years, with 2018 workshops showing a slight increase. The center line represents the median and the red dot represents the mean. In this case, 2018 data is not a projection but rather actual data from 2018 Q1.

In [19]:
# See https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.boxplot.html

title = "Frequency of class sizes by year"

has_attendance = attendance[pd.notnull(attendance['attendance'])]

meanprops = dict(marker='o', markeredgecolor='black', markerfacecolor='firebrick')
boxprops = dict(linestyle='-', linewidth=2, color='k')

ax = has_attendance.boxplot(by="year", figsize=(10,8), showmeans = True, meanprops = meanprops, boxprops = boxprops)
ax.set_title(title)
ax.get_figure().suptitle("")
plt.show()

Binned attendance by year

This table shows by year, how many workshops had 1-10 learners, 11-20 learners, and so on.

In [20]:
bins = pd.cut(attendance['attendance'], [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, np.inf])
attendance.groupby(bins)['attendance'].agg(['count'])

binned_attendance_by_year = attendance.groupby([ bins, 'year'])['attendance'].agg(['count'])

binned_attendance_by_year =  binned_attendance_by_year.unstack()
binned_attendance_by_year['total'] = binned_attendance_by_year.sum(axis=1)
binned_attendance_by_year = binned_attendance_by_year.rename(index=str, columns={2018:"2018Q1"})

binned_attendance_by_year = binned_attendance_by_year.fillna(0).astype(int)
binned_attendance_by_year
Out[20]:
count total
year 2012 2013 2014 2015 2016 2017 2018Q1
attendance
(1, 10] 1 2 2 16 23 23 6 73
(10, 20] 3 12 23 50 85 112 21 306
(20, 30] 5 25 46 83 111 109 23 402
(30, 40] 20 37 41 69 52 51 15 285
(40, 50] 6 10 11 29 9 9 4 78
(50, 60] 1 3 4 7 7 3 0 25
(60, 70] 1 1 2 3 1 1 1 10
(70, 80] 1 0 4 4 1 1 0 11
(80, 90] 0 0 3 0 0 0 0 3
(90, 100] 0 1 0 0 0 0 0 1
(100, inf] 0 2 2 0 0 0 0 4

Instructor Training

This tables lists all instructor training events the Carpentries has held since 2012. The "count_badged" column is a total of all individuals from that event with at least one badge. We are not distinguishing between Software Carpentry and Data Carpentry badges.

For reference, all training events through 2018 Q1 are listed here. However, the analyses below exclude data from 2018 Q1. Trainees have 90 days to complete their certification requirements, so no one who attended instructor training in 2018 Q1 would be expected to have completed certification.

For each of our instructor training events, we collected the following data:

variable definition
slug Unique identifier for each workshop. Takes the form YYYY-MM-DD-sitename.
start Start date of the workshop. Takes the form YYYY-MM-DD.
country The country in which the workshop was held. Online events are noted as "online" even if all participants were in one country.
attendance Number of trainees at the workshop.
count_badged Number of trainees awarded a Software Carpentry (SWC) or Data Carpentry (DC) badge. *
pct_completion Percent of trainees awarded a Software Carpentry (SWC) or Data Carpentry (DC) badge. *

* While we grant both Software Carpentry and Data Carpentry badges we do not distinguish between them for teaching eligibility or any other status within the Carpentries.

In [21]:
badges_by_training_event = pd.read_csv("data_files/badges_by_training_event.csv")



badges_by_training_event['pct_completion'] = badges_by_training_event['count_badged'] / badges_by_training_event['attendance'] * 100
badges_by_training_event = badges_by_training_event.round({'pct_completion':1})
badges_by_training_event['start'] = pd.to_datetime(badges_by_training_event['start'])

badges_by_training_event_thru2017 = badges_by_training_event[badges_by_training_event['start'].dt.year < 2018]


badges_by_training_event['country'] = badges_by_training_event['country'].apply(get_country_name)

badges_by_training_event_thru2017 = badges_by_training_event[badges_by_training_event['start'].dt.year < 2018]


badges_by_training_event
Out[21]:
slug start country attendance count_badged pct_completion
0 2012-08-26-ttt-online 2012-08-26 online 20 17 85.0
1 2012-10-11-ttt-online 2012-10-11 online 25 19 76.0
2 2013-01-06-ttt-online 2013-01-06 online 12 5 41.7
3 2013-03-12-ttt-online 2013-03-12 online 27 18 66.7
4 2013-05-12-ttt-online 2013-05-12 online 45 18 40.0
5 2013-08-12-ttt-online 2013-08-12 online 41 24 58.5
6 2013-09-30-ttt-online 2013-09-30 online 57 22 38.6
7 2014-01-16-ttt-online 2014-01-16 online 67 25 37.3
8 2014-04-14-ttt-pycon 2014-04-14 online 34 3 8.8
9 2014-04-24-ttt-online 2014-04-24 online 58 24 41.4
10 2014-04-28-ttt-mozilla 2014-04-28 online 43 31 72.1
11 2014-06-05-ttt-online 2014-06-05 online 29 4 13.8
12 2014-06-11-ttt-online 2014-06-11 online 59 27 45.8
13 2014-09-10-ttt-online 2014-09-10 online 81 46 56.8
14 2014-09-22-ttt-uva 2014-09-22 online 31 12 38.7
15 2014-10-22-ttt-tgac 2014-10-22 online 41 21 51.2
16 2014-11-12-ttt-washington 2014-11-12 online 20 9 45.0
17 2015-01-06-ttt-ucdavis 2015-01-06 online 44 24 54.5
18 2015-02-01-ttt-online 2015-02-01 online 127 60 47.2
19 2015-02-11-ttt-melbourne 2015-02-11 online 46 42 91.3
20 2015-03-10-ttt-lbl 2015-03-10 online 12 8 66.7
21 2015-04-08-ttt-nih 2015-04-08 online 17 6 35.3
22 2015-05-11-ttt-compute-canada 2015-05-11 Canada 22 16 72.7
23 2015-05-27-ttt-msu 2015-05-27 United States 26 10 38.5
24 2015-06-08-ttt-online 2015-06-08 online 23 12 52.2
25 2015-06-12-ttt-iplant 2015-06-12 United States 26 12 46.2
26 2015-06-16-ttt-aciref 2015-06-16 United States 24 12 50.0
27 2015-09-08-ttt-online 2015-09-08 online 18 9 50.0
28 2015-09-14-ttt-leeds 2015-09-14 United Kingdom 18 11 61.1
29 2015-09-21-ttt-icl 2015-09-21 United Kingdom 14 4 28.6
... ... ... ... ... ... ...
105 2017-10-10-ttt-doe 2017-10-10 online 24 5 20.8
106 2017-10-16-Monsanto-ttt 2017-10-16 United States 7 6 85.7
107 2017-10-19-ucsf-ttt 2017-10-19 online 5 3 60.0
108 2017-10-24-ttt-online 2017-10-24 online 19 11 57.9
109 2017-10-24-ttt-online-USA 2017-10-24 online 23 10 43.5
110 2017-10-27-ttt-UND 2017-10-27 United States 5 1 20.0
111 2017-10-30-TTT-online 2017-10-30 online 12 8 66.7
112 2017-10-30-imperial-ttt 2017-10-30 United Kingdom 19 12 63.2
113 2017-11-02-TTT-online 2017-11-02 online 25 11 44.0
114 2017-11-06-ttt-Utrecht 2017-11-07 Netherlands 16 6 37.5
115 2017-11-14-ttt-online 2017-11-14 online 12 9 75.0
116 2017-11-15-ttt-stanford 2017-11-15 United States 14 7 50.0
117 2017-12-04-ttt 2017-12-04 online 8 3 37.5
118 2017-12-05-ttt-southampton 2017-12-05 United Kingdom 18 9 50.0
119 2017-12-12-ttt-online 2017-12-12 online 17 12 70.6
120 2017-12-14-ttt-uiuc 2017-12-14 United States 10 4 40.0
121 2018-01-30-ttt-USA 2018-01-30 online 19 15 78.9
122 2018-01-30-ttt-online-UK 2018-01-30 online 14 8 57.1
123 2018-02-05-ttt-online 2018-02-05 online 24 9 37.5
124 2018-02-13-ttt-online 2018-02-13 online 15 7 46.7
125 2018-02-21-South-Africa-ttt 2018-02-21 South Africa 17 9 52.9
126 2018-02-26-ttt-csiro 2018-02-26 Australia 4 1 25.0
127 2018-02-26-ttt-stanford 2018-02-26 United States 12 1 8.3
128 2018-02-28-ttt-online 2018-02-28 online 16 5 31.2
129 2018-03-01-ttt-online 2018-03-01 online 18 4 22.2
130 2018-03-05-ttt-florida 2018-03-05 United States 20 6 30.0
131 2018-03-12-ttt-online 2018-03-12 online 18 3 16.7
132 2018-03-19-ttt-sfu 2018-03-19 Canada 10 1 10.0
133 2018-03-20-ttt-online 2018-03-20 online 11 3 27.3
134 2018-03-26-ttt-davis 2018-03-26 United States 24 1 4.2

135 rows × 6 columns

Number of instructor training events by country

This table shows how many training events took place in each country. Inperson events are noted as taking place in a specific country. All online events are noted as online, even if the trainees came entirely from one country.

In [22]:
trainings_by_country = badges_by_training_event_thru2017.groupby([badges_by_training_event['start'].dt.year, 'country'])['slug'].count().unstack()
trainings_by_country['total'] = trainings_by_country.sum(axis=1)
trainings_by_country.index.values
trainings_by_country = trainings_by_country.rename(index = {2018:'2018Q1'})
trainings_by_country = trainings_by_country.fillna(0)
trainings_by_country = trainings_by_country.astype(int)

trainings_by_country
Out[22]:
country Australia Canada Netherlands New Zealand Norway Poland Puerto Rico South Africa Switzerland United Kingdom United States online total
start
2012 0 0 0 0 0 0 0 0 0 0 0 2 2
2013 0 0 0 0 0 0 0 0 0 0 0 5 5
2014 0 0 0 0 0 0 0 0 0 0 0 10 10
2015 0 1 0 0 0 1 0 0 0 4 3 12 21
2016 2 1 0 3 1 0 0 1 1 3 12 11 35
2017 3 1 1 1 0 0 1 2 0 4 14 21 48

Binned completion rates for instructor training events by year

This shows the count of events by completion rates. For example, six events in 2016 had a completion rate under 25%. The counts are binned by quartiles, separating out events with 100% completion rates.

Training events from 2018 are excluded from this table. Because trainees have 90 days to complete their certification requirements, no trainee from a 2018 Q1 event would be expected to have completed their certification requirements.

Two events have had 100% completion rate: an online event in 2015, bringing together individual trainees across Europe, and a 2017 in-person event at Stanford University in California, USA. The Carpentries can review the coordination and support systems associated with events like these to understand how we can best support trainees throughout the certification process. Likewise, 13 events total has a completion rate under 25%. The Carpentries can review how trainees at these events could have been better supported in this process and welcomed into the Carpentries community.

In [23]:
bins = pd.cut(badges_by_training_event_thru2017['pct_completion'], [0, 25, 50, 75, 99, 100])
binned_badges_by_year = badges_by_training_event_thru2017.groupby([ bins, badges_by_training_event_thru2017['start'].dt.year])['pct_completion'].agg(['count'])

binned_badges_by_year =  binned_badges_by_year.unstack()
binned_badges_by_year['total'] = binned_badges_by_year.sum(axis=1)
binned_badges_by_year

binned_badges_by_year.fillna(0, inplace=True)
binned_badges_by_year = binned_badges_by_year.astype(int)
binned_badges_by_year
Out[23]:
count total
start 2012 2013 2014 2015 2016 2017
pct_completion
(0, 25] 0 0 2 1 6 4 13
(25, 50] 0 3 5 8 18 18 52
(50, 75] 0 2 3 8 8 22 43
(75, 99] 2 0 0 3 3 3 11
(99, 100] 0 0 0 1 0 1 2

Attendance and number badged by year

Total attendance by year

This table shows the total number of trainees at instructor training events each year by country.

In [24]:
trainees_by_year = badges_by_training_event_thru2017.groupby([badges_by_training_event['start'].dt.year, 'country'])['attendance'].sum().unstack()
trainees_by_year.fillna(0, inplace=True)
trainees_by_year = trainees_by_year.astype(int)
trainees_by_year
# trainigs_by_country = badges_by_training_event.groupby([badges_by_training_event['start_date'].dt.year, 'country'])['slug'].count().unstack()
Out[24]:
country Australia Canada Netherlands New Zealand Norway Poland Puerto Rico South Africa Switzerland United Kingdom United States online
start
2012 0 0 0 0 0 0 0 0 0 0 0 45
2013 0 0 0 0 0 0 0 0 0 0 0 182
2014 0 0 0 0 0 0 0 0 0 0 0 463
2015 0 22 0 0 0 13 0 0 0 75 76 393
2016 52 27 0 41 22 0 0 23 20 62 211 206
2017 57 23 16 28 0 0 12 62 0 83 194 359

Total badged by year

This table shows the total number of trainees who were awarded badges at instructor training events each year by country.

In [25]:
badges_by_year = badges_by_training_event_thru2017.groupby([badges_by_training_event['start'].dt.year, 'country'])['count_badged'].sum().unstack()
badges_by_year.fillna(0, inplace=True)
badges_by_year = badges_by_year.astype(int)
badges_by_year
Out[25]:
country Australia Canada Netherlands New Zealand Norway Poland Puerto Rico South Africa Switzerland United Kingdom United States online
start
2012 0 0 0 0 0 0 0 0 0 0 0 36
2013 0 0 0 0 0 0 0 0 0 0 0 87
2014 0 0 0 0 0 0 0 0 0 0 0 202
2015 0 16 0 0 0 3 0 0 0 35 34 249
2016 28 21 0 22 10 0 0 11 17 22 92 92
2017 29 10 6 14 0 0 2 24 0 47 111 193

Percent of trainees badged per year

This is calculated from the above two tables. The year is the year the event was held, not the year the badge was issued.

In [26]:
percent_badged_by_year = badges_by_year/trainees_by_year*100
percent_badged_by_year = percent_badged_by_year.round(1)
percent_badged_by_year.fillna(0, inplace = True)
percent_badged_by_year = percent_badged_by_year.astype(int)
percent_badged_by_year
Out[26]:
country Australia Canada Netherlands New Zealand Norway Poland Puerto Rico South Africa Switzerland United Kingdom United States online
start
2012 0 0 0 0 0 0 0 0 0 0 0 80
2013 0 0 0 0 0 0 0 0 0 0 0 47
2014 0 0 0 0 0 0 0 0 0 0 0 43
2015 0 72 0 0 0 23 0 0 0 46 44 63
2016 53 77 0 53 45 0 0 47 85 35 43 44
2017 50 43 37 50 0 0 16 38 0 56 57 53

Comparison: Inperson vs Online Training Events

The table and char below show the number of training events we have held each year, separated by online and inperson events. Until 2015, our training events were conducted exclusively on line by the founder of Software Carpentry. In recent years, we have built a community of instructor trainers who are avaible to teach online events as well as inperson events at their own sites. We have also formalized and strengthened our institutional membership program, where higher level memberships entitle the member site to inperson training events.

Number of events per year

In [27]:
countries = list(trainings_by_country)
countries.remove('online')
countries.remove('total')
inperson_trainings = trainings_by_country[countries].sum(axis = 1).to_frame()

online_trainings = trainings_by_country[['online']]

trainings_online_inperson = pd.concat([inperson_trainings, online_trainings], axis=1)

trainings_online_inperson.rename(columns = {0:'in-person'}, inplace=True)

trainings_online_inperson.fillna(0, inplace=True)
trainings_online_inperson = trainings_online_inperson.astype(int)
trainings_online_inperson
Out[27]:
in-person online
start
2012 0 2
2013 0 5
2014 0 10
2015 9 12
2016 24 11
2017 27 21
In [28]:
fig = plt.figure(figsize=(10, 8)) # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes

title = "Carpentries instructor training events by year"
ax = trainings_online_inperson[['online', 'in-person']].plot(kind='bar', ax=ax, stacked=False, title=title)
ax.grid(True)
plt.show()

Completion rates (percentages)

The table below shows the completion rates of online instructor training events compared to inperson events. In 2012, we ran only two training events, both online. While this was early in The Carpentries' lifecycle, this may have allowed us to target trainees vested in our mission. Requirements for certification have also changed over the years; the high certification rate in 2012 may also reflect more lenient certification requirements.

In our first year of running inperson training events (2015) we saw completion rates were significantly lower for inperson events than for online events. This may have been due to us not yet having recognized different ways in engaging with communities after running in person training events.

In the following years, completion rates for online and inperson instructor training events are comparable. Many of our inperson training events are run by Trainers already affiliated with the host site and have a commitment to building a strong instructor community at their site. Many of these events are also run through our formalized institutional membership program rather than as stand alone events. Through the institutional membership, host sites become connected with the Carpentries community and receive support from Carpentries staff.

In [29]:
countries = list(trainees_by_year)

countries.remove('online')

inperson_trainees = trainees_by_year[countries].sum(axis=1).to_frame()
inperson_badges = badges_by_year[countries].sum(axis=1).to_frame()


online_trainees = trainees_by_year[['online']]
online_badges = badges_by_year[['online']]

pct_completion_online = online_badges/online_trainees
pct_completion_inperson = inperson_badges/inperson_trainees
pct_completion_online_inperson = pd.concat([pct_completion_online, pct_completion_inperson], axis = 1)
pct_completion_online_inperson.rename(columns={0:'in-person'}, inplace=True)
pct_completion_online_inperson = pct_completion_online_inperson.fillna(0)
pct_completion_online_inperson = pct_completion_online_inperson * 100
pct_completion_online_inperson = pct_completion_online_inperson.astype(int)


pct_completion_online_inperson = pct_completion_online_inperson.replace(0, "-")

pct_completion_online_inperson
Out[29]:
online in-person
start
2012 80 -
2013 47 -
2014 43 -
2015 63 47
2016 44 48
2017 53 51
In [30]:
fig = plt.figure(figsize=(10, 8)) # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes

# Make dashes into numbers again so numeric data can be ploted
pct_completion_online_inperson_chart = pct_completion_online_inperson.replace("-", 0)

title = "Carpentries instructor training events completion rates (percentages)"
ax = pct_completion_online_inperson_chart[['in-person', 'online']].plot(kind='bar', ax=ax, stacked=False, title=title)
ax.grid(True)
plt.show()

We continued to look at progress from going through instructor training to completing certification and getting badged on to when instructors taught their first workshop. All dates are expressed as the first of the month. Exact dates are masked to preserve anonymity.

For each certified instructor, we collected the following data:

variable definition
date_awarded The first day of the month the badge was awarded.
first_wkshp The first day of the month this instructor taught their first workshop.
days A calculated field representing the difference between these two dates.

The Carpentries requires that all centrally organized workshops are taught by certified Carpentries instructors. However, self organized workshops may be taught by one certified instructor who works with a peer or colleague as a co-instructors. Many of these people go on to complete our instructor training program and become certified instructors themselves, which is why we see some people teaching even years before their certification date.

In [31]:
days_badge_to_teach = pd.read_csv("data_files/days_badge_to_teach.csv")


days_badge_to_teach['date_awarded'] = pd.to_datetime(days_badge_to_teach['date_awarded'])
days_badge_to_teach['first_wkshp'] = pd.to_datetime(days_badge_to_teach['first_wkshp'])
days_badge_to_teach['badge_year'] = days_badge_to_teach['date_awarded'].dt.year

days_badge_to_teach = days_badge_to_teach[days_badge_to_teach['badge_year'] < 2018]

days_badge_to_teach
Out[31]:
date_awarded first_wkshp days badge_year
0 2016-03-01 2013-03-01 -1096 2016
1 2017-09-01 2014-08-01 -1127 2017
2 2016-02-01 2013-03-01 -1067 2016
3 2016-03-01 2013-04-01 -1065 2016
4 2016-12-01 2014-01-01 -1065 2016
5 2016-01-01 2013-04-01 -1005 2016
6 2017-08-01 2014-11-01 -1004 2017
7 2017-05-01 2014-12-01 -882 2017
9 2017-07-01 2015-06-01 -761 2017
10 2016-04-01 2014-04-01 -731 2016
11 2014-10-01 2012-10-01 -730 2014
13 2017-12-01 2016-02-01 -669 2017
14 2016-02-01 2014-05-01 -641 2016
16 2017-12-01 2016-03-01 -640 2017
17 2014-12-01 2013-05-01 -579 2014
18 2014-11-01 2013-04-01 -579 2014
19 2017-05-01 2015-10-01 -578 2017
20 2017-12-01 2016-05-01 -579 2017
21 2014-12-01 2013-07-01 -518 2014
22 2014-07-01 2013-03-01 -487 2014
23 2016-01-01 2014-09-01 -487 2016
24 2016-10-01 2015-06-01 -488 2016
25 2015-06-01 2014-02-01 -485 2015
26 2014-07-01 2013-04-01 -456 2014
27 2014-11-01 2013-08-01 -457 2014
28 2016-09-01 2015-06-01 -458 2016
29 2013-07-01 2012-05-01 -426 2013
30 2013-07-01 2012-05-01 -426 2013
31 2016-02-01 2015-01-01 -396 2016
32 2015-02-01 2013-12-01 -427 2015
... ... ... ... ...
913 2016-05-01 2017-07-01 426 2016
914 2016-03-01 2017-05-01 426 2016
915 2016-08-01 2017-11-01 457 2016
916 2015-10-01 2017-01-01 458 2015
917 2014-05-01 2015-08-01 457 2014
918 2016-02-01 2017-06-01 486 2016
919 2015-05-01 2016-09-01 489 2015
920 2016-11-01 2018-03-01 485 2016
921 2016-06-01 2017-11-01 518 2016
922 2014-12-01 2016-05-01 517 2014
923 2016-02-01 2017-06-01 486 2016
924 2015-07-01 2016-12-01 519 2015
925 2017-01-01 2018-06-01 516 2017
926 2016-10-01 2018-03-01 516 2016
927 2016-02-01 2017-07-01 516 2016
928 2015-05-01 2016-10-01 519 2015
929 2015-03-01 2016-09-01 550 2015
930 2014-09-01 2016-06-01 639 2014
931 2014-10-01 2016-07-01 639 2014
932 2016-01-01 2017-11-01 670 2016
933 2013-11-01 2015-09-01 669 2013
934 2016-07-01 2018-05-01 669 2016
935 2014-12-01 2016-12-01 731 2014
936 2014-11-01 2016-12-01 761 2014
937 2014-07-01 2016-08-01 762 2014
938 2014-10-01 2017-05-01 943 2014
939 2015-01-01 2017-09-01 974 2015
940 2015-06-01 2018-03-01 1004 2015
941 2015-04-01 2018-03-01 1065 2015
942 2014-12-01 2018-03-01 1186 2014

893 rows × 4 columns

This table shows for badges issued each year, how many days after receiving the badge the instructor taught their first workshop in 90-day bins. The first row shows the number of people badged that year that had already taught their first workshop.

No badges were issued before December 2012. This was when we officially started issuing badges to people with a history with Software Carpentry.

Detailed narrative coming. Bullets

  • Many more trainees have previous Carpentries teaching experience than expected
  • Good to know we are engaging people in his way, would also like to think about the people who have not taught a workshop after getting badged. Does the lack of experience keep them from pursuing teaching opportunities?

This table does not yet account for badged instructors who have never taught. (coming soon)

In [32]:
bins = pd.cut(days_badge_to_teach['days'], [-2000, 0, 90, 180, 360, np.inf])
days_badge_to_teach.groupby(bins)['days'].agg(['count'])

binned_days = days_badge_to_teach.groupby([bins, 'badge_year'])['days'].agg(['count'])

binned_days =  binned_days.unstack()

binned_days.fillna(0, inplace=True)


binned_days = binned_days.astype(int)
binned_days
Out[32]:
count
badge_year 2012 2013 2014 2015 2016 2017
days
(-2000, 0] 31 43 60 35 124 98
(0, 90] 1 6 24 41 57 42
(90, 180] 1 7 25 33 43 43
(180, 360] 0 7 21 29 34 38
(360, inf] 0 2 14 12 20 2
In [33]:
binned_days.index.values
Out[33]:
[(-2000, 0], (0, 90], (90, 180], (180, 360], (360, inf]]
Categories (5, object): [(-2000, 0] < (0, 90] < (90, 180] < (180, 360] < (360, inf]]
In [34]:
# Remove bar chart - box plot seems to be more informative.

# fig = plt.figure(figsize=(15, 8)) # Create matplotlib figure

# ax = fig.add_subplot(111) # Create matplotlib axes

# xticks = ['before', '3 mos', '6 mos', '12 mos', 'more than 1 year']

# title = "Days from badging to teaching by year"
# binned_days.plot(y = list(binned_days[::-1]), kind='bar', ax=ax, position=1, title=title)


# plt.show()
In [35]:
title = "Days from badging to teaching by year"

# has_attendance = attendance[pd.notnull(attendance['attendance'])]

meanprops = dict(marker='o', markeredgecolor='black', markerfacecolor='firebrick')
boxprops = dict(linestyle='-', linewidth=2, color='k')

ax = days_badge_to_teach.boxplot(by="badge_year", figsize=(10,8), showmeans = True, meanprops = meanprops, boxprops = boxprops)
ax.set_title(title)
ax.get_figure().suptitle("")
plt.show()

Instructor trainers

Until 2016, all Instructor Training events were run by Software Carpentry founder and former Executive Director Greg Wilson. This limited our growth...

In 2016, The Carpentries launched a training program for instructor trainers. (narrative to be developed - bullets below)

  • Allowed us to expand reach by running several events a month, across timezones for online events
  • Allowed us to build capacity at member organizations who have onsite Instructor Trainers, run events for their site building a community of instructors there, these instructors also have onsite support to run workshops
  • Trainers in African and European countries overlap timezones allowing them to connect with a wider audience
  • Allowed us to reach new geographies - new group of Trainers in Africa
  • Connect this back to increase in number of training events and new instructors
  • Another Trainers training event is being planned for late 2018; anticipate same growth rate as 2017.
In [36]:
trainers = pd.read_csv("data_files/trainers.csv")
trainers['awarded'] = pd.to_datetime(trainers['awarded'])
trainers['country'] = trainers['country'].apply(get_country_name)

# Backwards way of fixing the NA/Namibia issue
trainers.replace('online', 'Namibia', inplace = True)

trainers
Out[36]:
id awarded country
0 4251 2016-02-03 United States
1 6269 2016-02-03 New Zealand
2 7995 2016-02-03 United States
3 1701 2016-02-04 United Kingdom
4 8221 2016-03-16 Australia
5 1828 2016-05-13 United States
6 6886 2016-04-17 United States
7 8777 2012-12-20 Canada
8 5864 2016-07-12 Norway
9 4446 2016-09-26 Norway
10 8291 2016-09-26 South Africa
11 591 2016-10-17 United States
12 3278 2016-10-17 United States
13 8623 2016-12-16 Australia
14 8497 2016-12-16 Australia
15 12350 2017-02-09 United States
16 8755 2017-02-21 United States
17 1687 2017-02-21 Canada
18 11249 2017-02-21 Italy
19 4422 2017-02-21 Netherlands
20 5332 2017-05-08 United States
21 10151 2017-05-08 United States
22 13098 2017-05-08 United States
23 13759 2017-06-05 United States
24 9243 2017-06-05 United States
25 10948 2017-06-05 United States
26 5880 2017-06-05 United Kingdom
27 1126 2017-06-05 United Kingdom
28 2106 2017-06-05 New Zealand
29 7083 2017-06-05 United States
30 4521 2017-06-05 United States
31 3485 2017-08-28 United States
32 4423 2017-08-28 United Kingdom
33 12232 2017-08-28 Netherlands
34 4291 2017-08-28 United Kingdom
35 12356 2017-08-28 South Africa
36 10749 2017-08-28 South Africa
37 9333 2017-08-28 Canada
38 8721 2017-08-28 United States
39 5507 2017-08-28 United States
40 5248 2017-08-28 Belgium
41 10199 2017-08-28 Canada
42 10462 2017-08-28 Greece
43 1806 2017-08-28 United Kingdom
44 11260 2018-03-12 Australia
45 13955 2018-03-12 Ethiopia
46 10513 2018-03-12 Germany
47 11653 2018-03-12 Namibia
48 10344 2018-03-12 Norway
49 751 2018-03-12 New Zealand
50 11280 2018-03-12 New Zealand
51 11657 2018-03-12 South Africa
52 6326 2018-03-12 United Kingdom
53 13563 2018-03-12 United Kingdom
54 13564 2018-03-12 United Kingdom
55 14610 2018-03-12 United States
56 14659 2018-03-12 United States
57 7184 2018-03-12 United States

New Instructor Trainers by Year

In [37]:
trainers_by_year = trainers.groupby(trainers['awarded'].dt.year).id.count().to_frame()

trainers_by_year
Out[37]:
id
awarded
2012 1
2016 14
2017 29
2018 14
In [38]:
fig = plt.figure(figsize=(15, 8)) # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes

title = "New badged Trainers by year"
trainers_by_year.plot(kind='bar', ax=ax, position=1, title=title)

plt.show()

New Instructor Trainers by Country

In [39]:
trainers_by_country = trainers.groupby(trainers['country']).id.count().to_frame()

trainers_by_country
Out[39]:
id
country
Australia 4
Belgium 1
Canada 4
Ethiopia 1
Germany 1
Greece 1
Italy 1
Namibia 1
Netherlands 2
New Zealand 4
Norway 3
South Africa 4
United Kingdom 9
United States 22
In [40]:
fig = plt.figure(figsize=(15, 8)) # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes


title = "New badged Trainers by Country"
trainers_by_country.plot(kind='bar', ax=ax, position=1, title=title, legend=False, grid=False)


plt.show()
In [41]:
trainers_by_country_year = trainers.groupby(['country', trainers['awarded'].dt.year])['id'].count().to_frame().unstack()

# This transformation prevents plot below from being drawn
# Needs to come after plot
# trainers_by_country_year = trainers_by_country_year.fillna(0)
# trainers_by_country_year = trainers_by_country_year.astype(int)
# trainers_by_country_year = trainers_by_country_year.replace(0, "-")
# trainers_by_country_year.reset_index()

trainers_by_country_year
Out[41]:
id
awarded 2012 2016 2017 2018
country
Australia NaN 3.0 NaN 1.0
Belgium NaN NaN 1.0 NaN
Canada 1.0 NaN 3.0 NaN
Ethiopia NaN NaN NaN 1.0
Germany NaN NaN NaN 1.0
Greece NaN NaN 1.0 NaN
Italy NaN NaN 1.0 NaN
Namibia NaN NaN NaN 1.0
Netherlands NaN NaN 2.0 NaN
New Zealand NaN 1.0 1.0 2.0
Norway NaN 2.0 NaN 1.0
South Africa NaN 1.0 2.0 1.0
United Kingdom NaN 1.0 5.0 3.0
United States NaN 6.0 13.0 3.0
In [42]:
fig = plt.figure(figsize=(10, 6)) # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes

title = "New Badged Trainers by Country by Year"
trainers_by_country_year.plot(y = list(trainers_by_country_year)[::-1], kind='barh', ax=ax, position=1, title=title)

ax.grid(True)

plt.show()