#!/usr/bin/env python
# coding: utf-8

# # Learning how to understand a project management dataset and build models in Python for further analysis

# Today we will be exploring a synthetically generated dataset based on the PERIL database. Learn more about the PERIL dataset and its creator Tom Kendrick [here](http://www.failureproofprojects.com/peril.php).

# To accomplish our task of understanding the data, we will need some Python libraries. A Python library is code that other people have written and shared, which we can reuse to save time; we pull it in with a short import statement. Some of the packages we will use today include [Pandas](https://towardsdatascience.com/a-quick-introduction-to-the-pandas-python-library-f1b678f34673), [Scipy](https://www.scipy.org/getting-started.html), [Numpy](https://docs.scipy.org/doc/numpy-1.15.0/user/whatisnumpy.html), and [MatPlotLib](https://matplotlib.org/). If you want to learn more about how these different packages work, with examples, click [here](http://cs231n.github.io/python-numpy-tutorial/). If you would like to see the code that imports these packages, click the show code button below.

# In[34]:

# Importing Python libraries which help us speed our coding up.
# The packages we're importing are below.
import pandas as pd
import random
from pandas.plotting import scatter_matrix
import scipy as sc
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# Can abbreviate specific parts of the packages so it's easier to type
import sklearn as sk
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model

# In[4]:

# We will now read the data in and store it in a dataframe called dataset,
# then add a Project Number column so each row has a unique identifier
dataset = pd.read_excel('Peril_Dataset.xlsx', header = 0)
dataset.insert(loc = 0, column = 'Project Number', value = np.arange(len(dataset)))

# #### Understanding our data

# Let's run some code and see how many rows and columns we have. Below, you will see the result as (Number of Rows, Number of Columns).

# In[5]:

dataset.shape

# #### What do the first 10 rows of our data look like?

# In[6]:

dataset.head(10)

# #### What are our column names?

# ##### As you can see above, we have 13 columns with different values
# 
# Project Number: A unique number to identify each project.
# 
# Parameter: The parameter of the project that was impacted. This dataset categorizes the risk into three main groups: Scope, Resource, and Schedule.
# 
# Category: The category of the risk. Within the main parameters we have the following categories - Change (Scope), Defect (Scope), People (Resource), Money (Resource), Outsourcing (Resource), Delay (Schedule), Dependency (Schedule), Estimates (Schedule).
# 
# Sub cat: The sub-categories of the risk are the following - Creep, Dependency, Gap, Hardware, Software, Integration, Limitation, Late Start, Loss, Motivation, Queuing, Temp Loss, Decision, Information, Parts, Infrastructure, Legal, Project, Deadline, Judgement, and Learning.
# 
# Below is the mapping for the Parameter -> Category -> Sub category:
# 
# * Scope -> Change -> Creep
# * Scope -> Change -> Dependency
# * Scope -> Change -> Gap
# 
# * Scope -> Defect -> Hardware
# * Scope -> Defect -> Software
# * Scope -> Defect -> Integration
# 
# * Resource -> Money -> Limitation
# 
# * Resource -> People -> Late Start
# * Resource -> People -> Loss
# * Resource -> People -> Motivation
# * Resource -> People -> Queuing
# * Resource -> People -> Temp Loss
# 
# * Schedule -> Delay -> Decision
# * Schedule -> Delay -> Hardware
# * Schedule -> Delay -> Information
# * Schedule -> Delay -> Parts
# 
# * Schedule -> Dependency -> Infrastructure
# * Schedule -> Dependency -> Legal
# * Schedule -> Dependency -> Project
# 
# * Schedule -> Estimates -> Deadline
# * Schedule -> Estimates -> Judgement
# * Schedule -> Estimates -> Learning
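# If you want to double-check this mapping against the data itself, one quick option (just a sketch, using the column names described above) is to list every unique Parameter / Category / Sub cat combination that actually appears:

# In[ ]:

# List each distinct Parameter -> Category -> Sub cat path that occurs in the data
mapping_check = dataset[['Parameter', 'Category', 'Sub cat']].drop_duplicates()
mapping_check.sort_values(['Parameter', 'Category', 'Sub cat'])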
# Impact: The impact to the project, in number of weeks.
# 
# TRL: The technology readiness level of the project. You can find more information [here](http://acqnotes.com/acqnote/tasks/technology-readiness-level).
# 
# Description: The description of the issue.
# 
# Region Numeric: A numeric value representing the region (0 = Americas, 1 = Asia, 2 = Eur/ME, 3 = Africa).
# 
# Region: The region of the world the issue occurred in.
# 
# Project: The type of project the issue occurred in (Prod. Dev. or IT/Solution).
# 
# Year: The year the issue occurred in.
# 
# Cost: The cost, in USD, of the impact on the project.
# 
# Month: The month the project failure occurred in.

# #### Now let's get a breakdown of the counts of the values in each column

# Here we can see that each project is grouped into one of three parameters: Scope, Resource, and Schedule. Below the chart are the raw counts of the projects in those parameter groups.

# In[129]:

plt.figure(figsize = (3,10))
plot_parameter = dataset['Parameter'].value_counts().plot(kind='bar')
plot_parameter.set_xlabel("Parameter")
plot_parameter.set_ylabel("Count (Number of Projects)")
plot_parameter

# In[8]:

dataset["Parameter"].value_counts()

# What is the breakdown of the different Categories that we have? We find the following categories: Change, People, Delay, Defect, Outsourcing, Estimates, Dependency, and Money. Below the chart are the raw counts of the projects in those categories.

# In[9]:

plot_category = dataset['Category'].value_counts().plot(kind='bar')
plot_category.set_xlabel("Category")
plot_category.set_ylabel("Count (Number of Projects)")
plot_category

# In[10]:

dataset["Category"].value_counts()

# What is the breakdown of the sub-categories? Here we find the following: Gap, Creep, Late or poor output, Hardware, Parts, Loss, Temp loss, Queuing, Software, Learning, Limitation, Information, Judgement, Decision, Dependency, Infrastructure, Project, Late start, Delayed start, Integration, Deadline, Motivation, and Legal. Below the chart are the raw counts of the projects in those sub-categories.

# In[11]:

plot_subcat = dataset['Sub cat'].value_counts().plot(kind='bar')
plot_subcat.set_xlabel("Sub-Category")
plot_subcat.set_ylabel("Count (Number of Projects)")
plot_subcat

# In[12]:

dataset["Sub cat"].value_counts()

# Here you can see that the Region column has four separate regions; the count of values in each region is provided in the table below.

# In[13]:

dataset["Region"].value_counts()

# What years do we have in our dataset? 2001-2019.

# In[14]:

dataset["Year"].value_counts()

# What is the distribution of the Impact levels in our data?

# In[53]:

plot_impact = dataset['Impact'].value_counts().plot(kind='bar')
plot_impact.set_xlabel("Impact Level")
plot_impact.set_ylabel("Count (Number of Projects)")
plot_impact

# ## Analysis

# #### Below we'll analyze projects from different perspectives to understand which parameters, categories, sub-categories, TRL levels, regions, project types, and dates produced the highest costs and impacts.
# 
# The column names and values that can be used in the code below:
# 
# ##### The Project Number
# * A number between 0-4999.
# 
# ##### Parameter
# * Scope
# * Resource
# * Schedule
# 
# ##### Category
# * Change
# * Defect
# * Delay
# * Dependency
# * Estimates
# * Money
# * Outsourcing
# * People
# 
# ##### Sub cat
# * Creep
# * Dependency
# * Gap
# * Hardware
# * Software
# * Integration
# * Limitation
# * Late Start
# * Loss
# * Motivation
# * Queuing
# * Temp Loss
# * Decision
# * Information
# * Parts
# * Infrastructure
# * Legal
# * Project
# * Deadline
# * Judgement
# * Learning
# 
# ##### Impact
# * Numbers between 1-26, in weeks
# 
# ##### TRL
# * Numbers between 1-9
# 
# ##### Description
# 
# ##### Region Numeric
# * 0: Americas
# * 1: Asia
# * 2: Eur/ME
# * 3: Africa
# 
# ##### Region
# * Americas
# * Asia
# * Eur/ME
# * Africa
# 
# ##### Project
# * IT/Solution
# * Prod. Dev.
# 
# ##### Year
# * Years from 2001-2019
# 
# ##### Cost
# * A number between 10787 and 2543242
# 
# ##### Month
# * Any month in the calendar year (January - December)

# __What is the cost distribution in our data? We find a median between $500,000 and $1,000,000, and the cost values are skewed to the right.__

# In[15]:

boxplot = dataset.boxplot(column=['Cost'], vert = False)
boxplot.set_xlabel("Cost (in USD)")
boxplot
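# If you'd rather read the exact quartiles behind the boxplot instead of eyeballing them, pandas can summarize the Cost column directly. This is just an optional sketch:

# In[ ]:

# Summary statistics for Cost: count, mean, std, min, quartiles, and max
dataset['Cost'].describe().round(2)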
# If we want to understand which projects contribute to the outliers, let's filter the data to projects where the cost was greater than $2,000,000. Let's also sort the data so we can see the projects with the highest cost failures first.

# In[77]:

dataset_high_cost = dataset[dataset['Cost']>2000000]
dataset_high_cost = dataset_high_cost.sort_values(['Cost'], ascending = False)
dataset_high_cost

# We find 72 rows where the cost value is greater than $2,000,000. Let's plot the parameters, categories, and sub-categories of these project failures to get a better sense of what types of projects make up these high cost failures.

# In[59]:

plot_high_cost = dataset_high_cost['Parameter'].value_counts().plot(kind='bar')
plt.xlabel('Parameter')
plt.ylabel('Count (of Project Failures)')
plt.title('The Count of High Cost Project Failures by Parameter')

# In[60]:

plot_high_cost_by_category = dataset_high_cost['Category'].value_counts().plot(kind='bar')
plt.xlabel('Category')
plt.ylabel('Count (of Project Failures)')
plt.title('The Count of High Cost Project Failures by Category')

# In[61]:

plot_high_cost_by_subcat = dataset_high_cost['Sub cat'].value_counts().plot(kind='bar')
plt.xlabel('Sub-Category')
plt.ylabel('Count (of Project Failures)')
plt.title('The Count of High Cost Project Failures by Sub-Category')

# Let's also plot these high cost project failures by region to see which region has the most.

# In[62]:

plot_high_cost_by_region = dataset_high_cost['Region'].value_counts().plot(kind='bar')
plt.xlabel('Region')
plt.ylabel('Count (of Project Failures)')
plt.title('The Count of High Cost Project Failures by Region')

# __Let's say we want to look at the data for the projects in the Americas region - we can run the following code. Let's also sort the data so we can see the projects with the highest cost failures first. If you want to see the data for a different region, change 'Americas' to a different region (copy and paste the region name from above). For example, you can copy and paste the lines below to see the data for just the Europe/Middle East region:__
# 
# Copy and paste the code below:
# 
# region_filter = dataset.loc[dataset['Region'] == 'Eur/ME']
# region_filter = region_filter.sort_values(['Cost'], ascending = False)
# region_filter.head(50)

# In[78]:

region_filter = dataset.loc[dataset['Region'] == 'Americas']
region_filter = region_filter.sort_values(['Cost'], ascending = False)
region_filter.head(50) # Let's look at the first 50 values
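# If you want to look at more than one region at a time, pandas' isin lets you keep several region values in one filter. This is only an optional sketch - the particular pair of regions here is just an example:

# In[ ]:

# Keep rows whose Region is any of the listed values, then sort by Cost
multi_region_filter = dataset.loc[dataset['Region'].isin(['Americas', 'Asia'])]
multi_region_filter = multi_region_filter.sort_values(['Cost'], ascending = False)
multi_region_filter.head(50)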
# __What if we want to see the data for projects in the Americas that had a category of Money? Let's also sort the data so we can see the projects with the highest cost failures first. See below for the code to do so; you can change the category by copying and pasting the category you want from above into the code below.__
# 
# For example, if I want to look at the projects in the Americas where the category is People, then I could do:
# 
# category_filter = dataset.loc[(dataset['Category'] == 'People') & (dataset['Region'] == 'Americas')]
# category_filter = category_filter.sort_values(['Cost'], ascending = False)

# In[73]:

category_filter = dataset.loc[(dataset['Category'] == 'Money') & (dataset['Region'] == 'Americas')]
category_filter = category_filter.sort_values(['Cost'], ascending = False)
category_filter.head(50) # Let's look at the first 50 rows where the region is the Americas and the category is Money
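# If you find yourself copying and pasting these filters a lot, you could wrap them in a small helper function. This is only an optional sketch - the name filter_failures is our own choice and not part of any library:

# In[ ]:

# A hypothetical convenience wrapper around the .loc filters used above.
# Any argument left as None is simply ignored.
def filter_failures(df, region = None, category = None, parameter = None):
    result = df
    if region is not None:
        result = result[result['Region'] == region]
    if category is not None:
        result = result[result['Category'] == category]
    if parameter is not None:
        result = result[result['Parameter'] == parameter]
    return result.sort_values(['Cost'], ascending = False)

# Same result as the cell above, written as a single call
filter_failures(dataset, region = 'Americas', category = 'Money').head(50)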
# __How can we visualize this filtered data to understand how TRL level affects the Impact value for projects in the Americas, in the Money category?__

# In[79]:

g = category_filter.groupby('TRL', as_index = False)['Impact'].mean()
g['Impact'] = g['Impact'].round(2)
g

# In[128]:

plt.figure(figsize = (3,10))
h = plt.bar(g['TRL'],g['Impact'])
plt.xlabel('TRL')
plt.ylabel('Impact (in Weeks)')
plt.title('Average Impact by TRL for Projects in the Americas with a Failure in the Money Category')
h

# __Now that we can graph this data easily, let's analyze it to build our understanding. Let's start with how the average cost per failure varies by region.__

# In[125]:

y = dataset.groupby('Region', as_index = False)['Cost'].mean()
y['Cost'] = y['Cost'].round(2)
y

# In[126]:

plt.figure(figsize = (3,10))
a = plt.bar(y['Region'],y['Cost'])
plt.xlabel('Region')
plt.ylabel('Average Cost (in USD)')
plt.title('Average Cost (in USD) per Region')
a

# __What is the Average Cost Per Year for all the Projects?__

# In[23]:

z = dataset.groupby('Year', as_index = False)['Cost'].mean()
z['Cost'] = z['Cost'].round(2)
z = z.sort_values(by = ['Year'])
z

# In[39]:

b = plt.plot(z['Year'],z['Cost'])
plt.xlabel('Year')
plt.ylabel('Average Cost (in USD)')
plt.title('Average Cost (in USD) over Time for Projects in all the Regions')
plt.xticks(np.arange(2001, 2020, 2))
b

# In[40]:

y = region_filter.groupby('Year', as_index = False)['Cost'].mean()
y['Cost'] = y['Cost'].round(2)
y = y.sort_values(by = ['Year'])
y

# __Now let's look at the Average Cost Per Year for Projects in the Americas.__

# In[26]:

a = plt.plot(y['Year'],y['Cost'])
plt.xlabel('Year')
plt.ylabel('Average Cost (in USD)')
plt.title('Average Cost (in USD) over Time for Projects in the Americas')
plt.xticks(np.arange(2001, 2020, 2))
a

# __Let's take this one step further - what if we look at the average cost for projects in all the regions but the Americas? First we'll filter the data to only include projects in the Asia, Eur/ME, and Africa regions. Let's also sort the data so we can see the projects with the highest cost failures first.__

# In[81]:

region_filter_no_americas = dataset.loc[dataset['Region'] != 'Americas']
region_filter_no_americas = region_filter_no_americas.sort_values(['Cost'], ascending = False)
region_filter_no_americas.head(50) # Let's look at the first 50 values

# In[42]:

c = region_filter_no_americas.groupby('Year', as_index = False)['Cost'].mean()
c['Cost'] = c['Cost'].round(2)
c = c.sort_values(by = ['Year'])
c

# In[43]:

l = plt.plot(c['Year'],c['Cost'])
plt.xlabel('Year')
plt.ylabel('Average Cost (in USD)')
plt.title('Average Cost (in USD) over Time for Projects not in the Americas')
plt.xticks(np.arange(2001, 2020, 2))
l

# __Now that we've looked at the cost breakdown over time per region, let's look at the difference in the average Impact (in weeks) per region.__

# In[30]:

d = dataset.groupby('Region', as_index = False)['Impact'].mean()
d['Impact'] = d['Impact'].round(2)
d

# __Let's also take a look at how TRL values affect Impact.__

# In[31]:

e = dataset.groupby('TRL', as_index = False)['Impact'].mean()
e['Impact'] = e['Impact'].round(2)
e

# In[127]:

plt.figure(figsize = (3,10))
f = plt.bar(e['TRL'],e['Impact'])
plt.xlabel('TRL')
plt.ylabel('Average Impact (in Weeks)')
plt.title('Average Impact (in Weeks) per TRL')
plt.yticks(np.arange(0, 13, 3))
f
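# __We imported scatter_matrix at the top but haven't used it yet. As one optional next step for further analysis, we can look at several numeric columns at once. This is just a sketch - the choice of columns below is ours, not something fixed by the dataset:__

# In[ ]:

# Pairwise scatter plots (with histograms on the diagonal) for the numeric columns
scatter_matrix(dataset[['Impact', 'TRL', 'Cost']], figsize = (8, 8), diagonal = 'hist')
plt.suptitle('Pairwise Relationships Between Impact, TRL, and Cost')
plt.show()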