In [1]:

from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di

# This line will hide code by default when the notebook is exported as HTML
#di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)
# This line will add a button to toggle visibility of code blocks, for use with the HTML export version
di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)

Clean Development Mechanism (CDM) data cleaning project part 1¶

Data source: http://www.cdmpipeline.org/cdm-projects-region.htm

Objective: clean data and reorganize so that team can more easily find relevant data and information.

The team was interested in types of CDM projects, number of CERs (certified emission reduction units) per project, trends by region and country, approving organizations, and failed projects.

In [2]:

import pandas as pd
import numpy as np
import openpyxl
import xlrd
import xlwt
import xlsxwriter
import math

In [3]:

tallies_file = "CDMStatesAndProvinces.xlsx"
tallies = pd.ExcelFile(tallies_file)
n = len(tallies.sheet_names) - 1

Example of one sheet inside raw Excel file¶

The raw data was organized such that each country had it's own sheet. There were then counts for the number of CDM projects of each type by county within each country. The team was not interested in counties, so I extracted totals for each country and made a new Excel file with one sheet. Each country got a row and the total types of each CDM project were listed.

In [6]:

tallies.parse('Albania')

Out[6]:

	This workbook was produced by Jørgen Fenhann, UNEP DTU Partnership from the CDMPipeline of 1st October 2018, jqfe@dtu.dk, Phone (+45)40202789	Unnamed: 1	Unnamed: 2	Unnamed: 3	Unnamed: 4	Unnamed: 5	Unnamed: 6	Unnamed: 7	Unnamed: 8	Unnamed: 9	...	Unnamed: 20	Unnamed: 21	Unnamed: 22	Unnamed: 23	Unnamed: 24	Unnamed: 25	Unnamed: 26	Unnamed: 27	Unnamed: 28	Unnamed: 29
NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Albania Counties	Afforestation	Agriculture	Biomass energy	Cement	CO2 capture	Coal bed/mine methane	Energy distribution	EE households	EE industry	EE own generation	...	N2O	PFCs and SF6	Reforestation	Solar	Tidal	Transport	Wind	Total	GDP/cap	Population (millions)
Berat	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	NaN	NaN
Diber	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	1	NaN	NaN	NaN	NaN	1	NaN	NaN
Durres	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	NaN	NaN
Elbasan	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	1	NaN	NaN	NaN	NaN	2	NaN	NaN
Fier	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	NaN	NaN
Gjirokaster	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	NaN	NaN
Korce	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	1	NaN	NaN	NaN	NaN	2	NaN	NaN
Kukes	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	1	NaN	NaN	NaN	NaN	1	NaN	NaN
Lezhe	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	NaN	NaN
Shkoder	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	1	NaN	NaN	NaN	NaN	2	NaN	NaN
Tirana	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1	NaN	NaN
Vlore	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	NaN	NaN
Many	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	NaN	NaN
n.a.	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	NaN	NaN
Total	0	0	0	0	0	0	0	0	0	0	...	0	0	5	0	0	0	0	9	NaN	NaN

17 rows × 30 columns

In [7]:

test = tallies.parse('Albania')
test = test.drop(test.index[0])
header = test.iloc[0]
test = test[1:]
test = test.rename(columns = header)
test = test.loc['Total',:]
test = test.to_frame().transpose()
test = test.rename({'Total': 'Albania'})
test = test.iloc[:,0:27]

In [8]:

columns = test.columns.tolist()
tallies_sheets = []
for sheet in tallies.sheet_names[1:]:
    df = tallies.parse(sheet)
    df = df.drop(df.index[0])
    header = df.iloc[0]
    df = df[1:]
    df = df.rename(columns = header)
    df = df.loc['Total',:]
    df = df.to_frame().transpose()
    df = df.rename({'Total': sheet})
    df = df.iloc[:,0:27]
    df.columns = columns
    tallies_sheets.append(df)

In [9]:

cdm_df = pd.concat(tallies_sheets)

In [10]:

cdm_df['Total'] = cdm_df.sum(axis=1)

In [11]:

cols = [cdm_df.columns[-1]] + [col for col in cdm_df if col != cdm_df.columns[-1]]
cdm_df = cdm_df[cols]

Final spreadsheet¶

Below are the first 5 rows (countries) of the new spreadsheet. Notice that I also moved the total column to the front, so it would be easier to get that information if that is all the team was looking for.

In [12]:

cdm_df.head()

Out[12]:

	Total	Afforestation	Biomass energy	CO2 capture	EE industry	...	Landfill gas	Methane avoidance	PFCs and SF6	Reforestation	Solar	Wind
Albania	9.0	0	0	0	0	...	0	0	0	5	0	0
Algeria	0.0	0	0	0	0	...	0	0	0	0	0	0
Angola	0.0	0	0	0	0	...	0	0	0	0	0	0
Argentina	53.0	1	6	1	2	...	12	8	1	1	2	11
Armenia	6.0	0	0	0	0	...	1	1	0	0	0	0

5 rows × 28 columns

In [18]:

# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter('cdm_consolidated.xlsx', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.
cdm_df.to_excel(writer, sheet_name='Sheet1')

# Close the Pandas Excel writer and output the Excel file.
writer.save()

In [ ]: