This project follows the CRISP-DM process to answer the following questions:
%load_ext autoreload
%autoreload 2
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import utils
# --- Load the raw inputs ---------------------------------------------------
# Survey answers, the accompanying schema file and the EU country list.
df = pd.read_csv("../data/so_survey_2019/survey_results_public.csv")
schema = pd.read_csv("../data/so_survey_2019/survey_results_schema.csv")
eu_countries = pd.read_csv("../data/listofeucountries.csv")

# --- First look at the data ------------------------------------------------
df.shape
schema.head()
df.head()
df.columns
df["DevType"].dropna()

# Missing values per column in percent (the fraction is rounded to 4 decimals
# before it is scaled), followed by the 15 columns with the most gaps.
missing_values_mean = df.isna().mean().round(4) * 100
missing_values_mean.sort_values(ascending=False).head(15)

fully_answered_columns = df.columns[df.isna().mean() == 0.0]
print("columns with no missing values: ", set(fully_answered_columns))

# Columns with fewer than 15% missing values.
set(missing_values_mean[missing_values_mean < 15].index)
# Respondents per country; value_counts leaves out rows without a country.
countries_vals = df.Country.value_counts()
country_answers = df.Country.dropna().shape[0]
top15_country_counts = countries_vals.head(15)
countries_vals_percentage = top15_country_counts * 100 / country_answers

# Bar chart of the 15 most frequent countries, as a share of all respondents
# who named a country.
ax = countries_vals_percentage.plot(
    kind="bar",
    figsize=(8, 4),
    width=0.8,
    edgecolor=None,
    color="steelblue",
)
plt.ylabel("Respondents %")
plt.title("Respondents % by Country: Total {}".format(country_answers))
plt.xticks(fontsize=10)

# Hand the absolute counts to the project helper together with the axes
# (presumably it writes them above the bars -- see utils).
nb_respondents = list(top15_country_counts)
utils.display_values_above_bars(ax, nb_respondents)
plt.show()
# Keep only respondents whose country is listed in column "x" of the EU
# countries file.
is_eu_country = df.Country.isin(eu_countries.x)
df_eu_respondents = df[is_eu_country]
df_eu_respondents.Country.unique()

total_eu_respondents = df_eu_respondents.Country.shape[0]
print("Total respondents in the USA", countries_vals["United States"])
print("Total respondents in EU", total_eu_respondents)

# Sanity check: number of respondents that named a country.
df.Country.value_counts().sum()

# Share of each EU country among the EU respondents, in percent.
eu_country_counts = df_eu_respondents.Country.value_counts()
eu_country_answers = df_eu_respondents.Country.dropna().shape[0]
eu_country_share = eu_country_counts * 100 / eu_country_answers
ax_eu = eu_country_share.plot(
    kind="bar",
    figsize=(12, 4),
    width=0.8,
    edgecolor=None,
)
plt.ylabel("Respondents %")
plt.title("Respondents % from EU countries")
utils.display_values_above_bars(ax_eu, list(eu_country_counts))
plt.show()
df.LanguageWorkedWith.head()

# Every answer is a ';'-separated list of languages. Splitting it and
# exploding to one row per (respondent, language) lets each language be
# counted by its exact name. Substring matching on the raw answer would count
# every "JavaScript" answer as "Java" as well, every "C++" answer as "C", etc.
languages_worked_with = df['LanguageWorkedWith'].dropna()
language_responses = languages_worked_with.count()
language_mentions = languages_worked_with.apply(
    lambda answer: str(answer).split(';')).explode()
languages = language_mentions.unique()

# NOTE(review): assumes a respondent never lists the same language twice.
mentions_per_language = language_mentions.value_counts()
languages_count = {
    language: int(mentions_per_language[language]) for language in languages
}
# Fraction (0-1) of the respondents who answered the question.
languages_perc = {
    language: count / language_responses
    for language, count in languages_count.items()
}

languages_perc = utils.sort_dict_by_vals(languages_perc)
languages_count = utils.sort_dict_by_vals(languages_count)

plt.title(f"Programming, Scripting, and Markup Languages: {language_responses} Respondents")
plt.xticks(rotation='vertical')
plt.ylabel("Respondents %")
# The dictionary holds fractions; scale them so the bars match the % axis.
plt.bar(
    list(languages_perc.keys()),
    [share * 100 for share in languages_perc.values()],
    width=0.8,
    edgecolor=None,
)
plt.show()
# Developer-type answers of everyone who answered the question.
devtypes = df["DevType"].dropna()
devtypes
# One answer can hold several ';'-separated developer types.
split_devtypes = devtypes.apply(lambda answer: str(answer).split(";"))
devtypes_list = split_devtypes.explode().unique()
print("DevTypes list", devtypes_list)

not_na_devtypes = df["DevType"].dropna()
not_na_devtypes
split_not_na_devtypes = not_na_devtypes.apply(lambda answer: str(answer).split(";"))
devtypes_list = split_not_na_devtypes.explode().unique()
print("DevTypes list", devtypes_list)

# Keep only the text before the first comma, i.e. the developer type without
# its environment (e.g. "Developer, desktop or enterprise applications").
devtypes = {full_name.split(",")[0].strip() for full_name in devtypes_list}
devtype_count, devtype_perc = utils.count_substr_in_column(
    df, "DevType", devtypes, False
)

# Horizontal bar chart with one bar per developer type.
plt.xticks(rotation="vertical")
plt.xlabel("Developers %")
plt.title("% of Respondents by Developer Type")
y_pos = np.arange(len(devtype_perc))
ax_devs = plt.barh(y_pos, devtype_perc.values())
plt.yticks(y_pos, devtype_perc.keys())
plt.show()

print("\nNumber of respondents: ", df["DevType"].dropna().shape[0])
print("\nDev Types:", devtype_count)

# First eight developer types in the order returned by the helper.
top8_devtypes = list(devtype_perc.keys())[:8]
# Share of respondents per answer in the OpenSource column, smallest first.
# NOTE(review): the denominator is the total number of rows, so respondents
# who skipped the question are part of the base and the bars need not sum
# to 100.
os_perception = (df.OpenSource.value_counts() * 100 / df.OpenSource.shape[0]).sort_values()
os_perception.plot(kind="barh")
plt.title("Developers perception of the OSS quality")
plt.xlabel("Developers %")
plt.yticks(rotation=0, size=8)
# Render the chart (replaces a stray `plt.get("Developers %")` call, which
# draws nothing).
plt.show()
The column OpenSourcer doesn't contain any missing values. All respondents answered the corresponding question: How often do you contribute to open source?
How often do developers contribute to OpenSource Software?
# Number of respondents who answered the contribution-frequency question and
# how many of them picked each answer.
nb_opnesourcers = df.OpenSourcer.count()
opensourcers_count = df.OpenSourcer.value_counts()
colors = ['#ff9999', 'cornflowerblue', '#ffcc99', '#99ff99']
utils.plot_and_show_pie(
    opensourcers_count.index, opensourcers_count.values,
    "Contribution Frequency to OSS: {} respondents".format(nb_opnesourcers),
    3, colors=colors)

# Break the longest answer over two lines so it fits under its bar. The result
# is assigned back to the frame: calling replace(inplace=True) on the column
# selection is a chained in-place update, which is not guaranteed to reach df.
df["OpenSourcer"] = df["OpenSourcer"].replace({
    "Less than once a month but more than once per year":
        "Less than once a month\nbut more than once per year"})

# Counts and percentages share one index, so bars, tick labels and the values
# written above the bars always refer to the same answer.
oss_frequency_counts = df.OpenSourcer.value_counts()
oss_frequency_perc = oss_frequency_counts * 100 / df.shape[0]
oss_frequency_perc

colors_list = ['lightgray', 'lightsteelblue', 'mediumseagreen', 'steelblue',
               'cornflowerblue', 'cornflowerblue']
# pandas labels each bar with its index value; overriding the tick labels with
# df.OpenSourcer.unique() would attach them in order of first appearance and
# mislabel the bars.
ax_oss_contributors = oss_frequency_perc.plot(
    kind="bar", edgecolor=None, figsize=(6, 4), color=colors_list)
plt.ylabel("Respondents %")
plt.xticks(rotation=30, ha="right")
plt.title("Percentage of respondents contributing to OSS:\n total {}".format(
    oss_frequency_counts.sum()))
# Annotate this chart's own axes (not the EU chart's).
utils.display_values_above_bars(ax_oss_contributors, list(oss_frequency_counts))
# Colour the third and the first tick label to match their bars.
ax_oss_contributors.get_xticklabels()[2].set_color("green")
ax_oss_contributors.get_xticklabels()[0].set_color("gray")
plt.show()
Do Hobbyist developers contribute more often to Open Source projects?
df.Hobbyist.unique()
# Respondents who never contribute, split by their Hobbyist answer.
df[df.OpenSourcer == "Never"].groupby("Hobbyist")["Respondent"].count()

# Respondent counts per (contribution frequency, hobbyist answer) pair.
hobbyist_opensourcer = (
    df.filter(["Respondent", "OpenSourcer", "Hobbyist"], axis=1)
    .groupby(['OpenSourcer', 'Hobbyist'])
    .count()
)
hobbyist_opensourcer
hobbyist_opensourcer = hobbyist_opensourcer.reset_index()
hobbyist_opensourcer

# Readable legend entries. The result is assigned back to the column because
# replace(inplace=True) on a column selection is a chained in-place update
# that is not guaranteed to modify the frame.
hobbyist_opensourcer['Hobbyist'] = hobbyist_opensourcer['Hobbyist'].replace(
    {"No": "Don't code as a Hobby", "Yes": "Code as a Hobby"})

splot = utils.plot_grouped_bars(
    hobbyist_opensourcer, "OpenSourcer", "Hobbyist", "Respondent",
    title="Do Hobbyist contribute more to OSS?",
    xlabel="", ylabel="# Respondents")
# Place the legend to the right of the plot area.
splot.legend(loc='center left', bbox_to_anchor=(1.00, 0.5), ncol=1)
Does the perception of OSS quality bias how often developers contribute to OSS?
# Respondent counts per (OpenSourcer, OpenSource) answer pair, computed
# separately for respondents who code as a hobby and for those who don't.
oss_quality_columns = ["Respondent", "OpenSourcer", "OpenSource"]
oss_quality_keys = ['OpenSourcer', 'OpenSource']

hobbyist_rows = df[df.Hobbyist == 'Yes']
group_oss_quality_hobbyist = (
    hobbyist_rows.filter(oss_quality_columns, axis=1).groupby(oss_quality_keys).count()
)
group_oss_quality_hobbyist

not_hobbyist_rows = df[df.Hobbyist == 'No']
group_oss_quality_nothobbyist = (
    not_hobbyist_rows.filter(oss_quality_columns, axis=1).groupby(oss_quality_keys).count()
)
# The list keeps the grouped frames as they are before the index is reset.
oss_quality_perception_groups = [group_oss_quality_hobbyist, group_oss_quality_nothobbyist]
group_oss_quality_nothobbyist

group_oss_quality_hobbyist = group_oss_quality_hobbyist.reset_index()
group_oss_quality_nothobbyist = group_oss_quality_nothobbyist.reset_index()

# One row with two panels: hobbyists on the left, non-hobbyists on the right.
nr_rows, nr_cols = 1, 2
fig, axs = plt.subplots(
    nr_rows,
    nr_cols,
    figsize=(nr_cols * 7, nr_rows * 5),
    squeeze=False,
    sharex=False,
)
hobbyist_ax, not_hobbyist_ax = axs[0][0], axs[0][1]
splot1 = utils.plot_grouped_bars(
    group_oss_quality_hobbyist, "OpenSourcer", "OpenSource", "Respondent",
    ylabel="# Respondents", ax=hobbyist_ax, title="Hobbyists", legend=False,
)
splot2 = utils.plot_grouped_bars(
    group_oss_quality_nothobbyist, "OpenSourcer", "OpenSource", "Respondent",
    ylabel="# Respondents", ax=not_hobbyist_ax, title="not Hobbyists", legend=False,
)
plt.suptitle('Is the OSS quality perception biasing the contribution frequency?', size=14)
plt.legend(loc='center left', bbox_to_anchor=(-0.7, -0.5), ncol=1)
plt.subplots_adjust(top=.86)
plt.show()
Does the number of years of experience influence the opensource contribution frequency?
# Inspect the distinct raw values of the YearsCodePro column
df["YearsCodePro"].unique()
# Non-numeric answers and the numeric stand-ins used in their place.
value_map = {'Less than 1 year': '0', 'More than 50 years': '51'}


def mapper(val):
    """
    Convert a raw answer to an integer.

    An answer listed in ``value_map`` is replaced by its stand-in first;
    every other value is handed to ``int`` as it is.

    :param val: (string) value to map
    :return: the value as an int
    """
    if val in value_map:
        return int(value_map[val])
    return int(val)
# Integer version of YearsCodePro. Only non-missing answers are converted;
# the assignment aligns on the index, so the other rows stay NaN.
answered_years = df['YearsCodePro'].dropna()
df['YearsCodeProCleaned'] = answered_years.apply(mapper)
df.YearsCodeProCleaned.unique()

# Bucket edges and the consecutive (lower, upper) pairs built from them.
tmp = [0, 5, 10, 20, 40, 55]
ranges = list(zip(tmp[:-1], tmp[1:]))
ranges
def in_the_range(ranges_):
    """
    Build a function that maps a number to the label of the range holding it.

    :param ranges_: sequence of (lower, upper) pairs; a value belongs to a
        pair when lower <= value < upper. Pairs are checked in order.
    :return: function f(x) that returns
        '>40' when the matching pair starts at 40,
        '<lower> - <upper>' for any other matching pair,
        '>' followed by the upper bound of the last pair when no pair matches
    :raises ValueError: if ranges_ is empty
    """
    # Take the pairs from the argument, so the returned function does not
    # depend on a module-level variable that happens to be called `ranges`.
    bounds = list(ranges_)
    if not bounds:
        raise ValueError("ranges_ must contain at least one (lower, upper) pair")
    no_match_label = f'>{bounds[-1][1]}'

    def f(x):
        for lower, upper in bounds:
            if lower <= x < upper:
                # The pair starting at 40 is labelled as an open-ended bucket.
                if lower == 40:
                    return '>40'
                return f'{lower} - {upper}'
        # No pair contains x (this also covers values below the first lower
        # bound): fall back to the label built from the last upper bound.
        return no_match_label

    return f
# Label each respondent with the experience bucket of the cleaned YearsCodePro
# value; rows without a value stay NaN because the assignment aligns on the
# index.
df['Years of experience'] = df['YearsCodeProCleaned'].dropna().apply(in_the_range(ranges))
df['Years of experience']

# Share (0-1) of respondents per bucket, relative to those who have a bucket.
experience_share = (
    df['Years of experience'].value_counts() / df["Years of experience"].count()
)
# The buckets are derived from YearsCodePro, so the title refers to
# professional experience.
# NOTE(review): confirm against the survey schema that YearsCodePro excludes
# education.
experience_share.plot(kind="bar", title="Years of professional coding experience")
plt.ylabel("Share of respondents")
# Finish this figure before the grouped chart below is drawn.
plt.show()

# Respondent counts per (experience bucket, contribution frequency) pair.
opensourcers_by_age = (
    df.filter(["Respondent", "OpenSourcer", "Years of experience"], axis=1)
    .groupby(["Years of experience", "OpenSourcer"])
    .count()
)
opensourcers_by_age
opensourcers_by_age = opensourcers_by_age.reset_index()
splot = utils.plot_grouped_bars(
    opensourcers_by_age, "OpenSourcer", "Years of experience", "Respondent",
    xlabel="", ylabel="# Respondents")
plt.title("Frequency of contribution to OSS and years of experience groups", size=13,
          fontweight='light')
# Place the legend to the right of the plot area.
splot.legend(loc='center left', bbox_to_anchor=(1.00, 0.5), ncol=1)
plt.show()
Do developers who contribute to OSS have a higher income?
# Summarize the central tendency, dispersion and shape of the salary
# distribution, excluding NaN values.
# NOTE(review): CompTotal presumably mixes currencies and pay periods --
# confirm whether a normalized compensation column should be used instead.
df.CompTotal.describe()
median_salaries = np.median(df.CompTotal.dropna().values)
median_salaries

# Kurtosis to measure whether the distribution is too peaked.
# numeric_only=True restricts the computation to numeric columns; without it
# pandas >= 2.0 raises on the text columns of the survey.
df.kurtosis(axis=0, skipna=True, numeric_only=True)

# Outliers are massively skewing the data. Everything at or above 20 times the
# median salary is dropped to get a better view of the remaining data. The
# cutoff and the mask are computed once and reused below.
salary_cutoff = median_salaries * 20
below_cutoff = df.CompTotal < salary_cutoff  # NaN salaries compare as False

# NOTE: sns.distplot is deprecated in newer seaborn releases; switch to
# sns.histplot(..., kde=True, stat="density") once the minimum seaborn version
# of the project is confirmed.
sns.distplot(df.loc[below_cutoff, "CompTotal"])
plt.xlabel("Salary")
plt.ylabel("Density")
plt.title("Developers salary distribution")
plt.show()

salary_data = df.filter(["Respondent", "OpenSourcer", "Hobbyist", "CompTotal"], axis=1)
salary_data = salary_data[salary_data.CompTotal < salary_cutoff]
salary_data.shape[0]
# Number of salary data points including the outliers
df.CompTotal.count()
# Number of salary data points without the outliers
below_cutoff.sum()
# Mean salary per contribution-frequency group, plus a human-readable version
# of the mean with thousands separators and two decimals.
salary_data = salary_data.groupby(['OpenSourcer']).agg(CompTotal=("CompTotal", 'mean'))
salary_data["mean_salary_formatted"] = salary_data["CompTotal"].map('{:,.2f}'.format)
salary_data = salary_data.reset_index()
salary_data

plt.figure(figsize=(5, 4))
splot = sns.barplot(x="OpenSourcer", y="CompTotal",
                    data=salary_data, palette="Blues_d", capsize=0.1)
plt.ylabel("Mean Salary", size=8, fontweight='light')
plt.xlabel("Freq. of contribution to OSS")
plt.xticks(rotation=50, size=8)
plt.title("What is the average salary of Open Sourcers?",
          size=11, fontweight='light')

# Write the mean salary in thousands just below the top of each bar. The label
# is anchored at the bar's horizontal centre, computed from the bar geometry
# rather than from a fixed offset, so it stays centred for any bar width.
for bar in splot.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    splot.annotate(f"{bar_height / 1000:.0f}K", (bar_center, bar_height),
                   ha='center', va='center',
                   size=10, xytext=(0, -12),
                   textcoords='offset points', color='black')
plt.show()