#!/usr/bin/env python
# coding: utf-8

# # Python Developer Salary Survey Results

# The survey was conducted over a one-month period in February 2014 using a
# simple Google form. The anonymized data was then stored in an SQL Server
# database. The raw data is publicly accessible via the SlashDB API. In this
# notebook we do some simple analysis of the results using (what else) Python.
# Enjoy.
#
# ## Access to raw survey data provided by SlashDB
# http://demo.slashdb.com/db/pystreet.html
#
# ### Survey data, HTML representation
# * Results table: http://demo.slashdb.com/db/pystreet/response.html
# * Results with salaries converted to USD: http://demo.slashdb.com/db/pystreet/response_usd.html
# * USA only: http://demo.slashdb.com/db/pystreet/response_usd/country/United%20States.html
#
# ### Survey data, JSON representation
# * Results table: http://demo.slashdb.com/db/pystreet/response.json
# * Results with salaries converted to USD: http://demo.slashdb.com/db/pystreet/response_usd.json
# * USA only: http://demo.slashdb.com/db/pystreet/response_usd/country/United%20States.json

# ### Data Load

# In[1]:


# Notebook-only magic: renders matplotlib figures inline. `get_ipython` is
# provided by IPython/Jupyter and is not defined under plain `python`.
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas
import numpy

# Render every float in displayed frames as a dollar amount, e.g. $85,000.00.
pandas.options.display.float_format = '${:,.2f}'.format
responses_all = pandas.read_json('http://demo.slashdb.com/db/pystreet/response_usd.json')
responses_usa = pandas.read_json('http://demo.slashdb.com/db/pystreet/response_usd/country/United%20States.json')


# In[12]:


responses_usa


# ## Survey participation by country

# In[13]:


# Number of responses per country, smallest first so that the largest bar
# ends up at the top of the horizontal bar chart.
by_country = responses_all.groupby('country').size().sort_values()
by_country.plot(kind='barh', figsize=(7,10), fontsize=15)


# ## Salary Analysis

# ### Remove outliers
# Note that there are a few outliers in the raw data, where the annual salary
# is clearly outside of a sensible range. Skip the following cell if you don't
# want to remove the outliers, or modify the cutoff values.

# In[5]:


min_salary_cutoff = 500
max_salary_cutoff = 800000


def _within_salary_cutoffs(responses):
    """Return the rows of *responses* whose 'salary_usd' is strictly between
    ``min_salary_cutoff`` and ``max_salary_cutoff`` (both bounds exclusive).
    """
    salary_usd = responses['salary_usd']
    in_range = (salary_usd > min_salary_cutoff) & (salary_usd < max_salary_cutoff)
    return responses[in_range]


responses_all = _within_salary_cutoffs(responses_all)
responses_usa = _within_salary_cutoffs(responses_usa)


# ### Salary paid

# In[6]:


def _salary_summary(usa, worldwide):
    """Build a two-row table (USA, Worldwide) holding the Min, Max, Avg and
    Median of the two given salary Series.
    """
    records = [
        {'Min': series.min(), 'Max': series.max(),
         'Avg': series.mean(), 'Median': series.median()}
        for series in (usa, worldwide)
    ]
    table = pandas.DataFrame.from_records(records)
    table = table[['Min','Max','Avg','Median']]  # fix the column order
    table.index = ['USA','Worldwide',]
    return table


salary = responses_all["salary_usd"]
salary_usa = responses_usa["salary_usd"]
df = _salary_summary(salary_usa, salary)
df


# ### Salary desired

# In[7]:


desired_salary = responses_all["desired_salary_usd"]
desired_salary_usa = responses_usa["desired_salary_usd"]
df = _salary_summary(desired_salary_usa, desired_salary)
df


# ### USA Salary Breakdown

# In[8]:


# Histogram bin edges: 0, 25000, 50000, ..., 500000 (USD), shared by the USA
# and worldwide histograms.
salary_bins = list(range(0, 500001, 25000))

plot = responses_usa.hist(column=['salary_usd','desired_salary_usd'], bins=salary_bins, figsize=(20,5), xrot=45, xlabelsize=15)


# ### Worldwide Salary Breakdown

# In[9]:


plot = responses_all.hist(column=['salary_usd','desired_salary_usd'], bins=salary_bins, figsize=(20,5), xrot=45, xlabelsize=15, ylabelsize=15)


# ### How much more would Python developers like to earn?

# In[10]:


x = responses_all[['salary_usd','desired_salary_usd']].mean()
# x.plot(kind='barh')
plot = x.plot(kind="bar", figsize=(7,7), fontsize=15, title="Average salary and average desired salary worldwide")

# Per-respondent desired raise as a fraction of the current salary; computed
# once and reused for the three figures printed below.
# NOTE(review): a salary_usd of 0 would produce inf here if the outlier cell
# above was skipped.
desired_raise = (responses_all['desired_salary_usd'] - responses_all['salary_usd'])/responses_all['salary_usd']
print("At a maximum {0:.0%}".format(desired_raise.max()))
print("On average {0:.0%}".format(desired_raise.mean()))
print("At a minimum {0:.0%}".format(desired_raise.min()))


# ### Salary as a function of years of experience

# In[11]:


# The aggregations are named by string rather than imported with
# `from numpy import max, min`: that import shadowed the builtins, and
# pandas' documented way to request its own min/max reductions is by name.
df = responses_usa[['years_experience','salary_usd']]
g = df.groupby('years_experience')
df = g.agg(['min', 'max'])
df.plot(kind="line", figsize=(12,7), title="U.S. Python developer salary range as a function of experience.")

df = responses_all[['years_experience','salary_usd']]
g = df.groupby('years_experience')
df = g.agg(['min', 'max'])
df.plot(kind="line", figsize=(12,7), title="Worldwide Python developer salary range as a function of experience.")


# ## About Us
# * Pystreet is an up-and-coming online community of professional Python
#   developers. We are currently in pre-launch mode at http://pystreet.com
#
# * SlashDB is a new kind of middleware, which instantly creates REST APIs to
#   SQL databases so their content becomes instantly accessible to authorized
#   web, mobile and enterprise applications and end-users, under standard data
#   formats for reading and writing. http://www.slashdb.com/
#
# * vt.enterprise is a technology consultancy, co-sourcing and software
#   development firm located in Jersey City, NJ. We are hiring:
#   http://vtenterprise.com/career.html

# # #