#!/usr/bin/env python # coding: utf-8 # # Importing the libraries # In[1]: from pandasdmx import Request # In[2]: import pandas as pd # In[3]: from bokeh.io import output_notebook, show from bokeh.plotting import figure from bokeh.models import ColumnDataSource, LabelSet # In[4]: output_notebook() # # Connecting to the webservice # In[5]: estat = Request('ESTAT') # ### Downloading dataflow definitions # In[6]: dflow = estat.dataflow() # ## Defining functions # ### Data structure # In[7]: def dsd_resp(id): """ The function takes a dataflow identifier and requests from the Eurostat SDMX api the dataflow's datastructure, datastructure id and datastructure response. """ # Defining the data set's dataflow based on the data set id. df = dflow.dataflow[id] # Printing the table's name print("The table's name:") print(dflow.write().dataflow.loc[id][0]) # Acquiring the dataflow's datastructure id dsd_id = df.structure.id # Requesting for the dataflow's datastructure dsd_resp = estat.get(resource_type = 'datastructure', resource_id = dsd_id) # Requesting the dataflow's datastructure dsd = dsd_resp.datastructure[dsd_id] return dsd_resp, dsd_id, dsd # ### Datastructure attributes # In[8]: def dsd_att(dsd): """The function takes a table datastructure definition and prints its measure, dimension and attribute lists.""" # Measures print("Measures:") print(dsd.measures.aslist()) # Dimensions print("\nDimensions:") print(dsd.dimensions.aslist()) # Attributes print("\nAttributes:") print(dsd.attributes.aslist()) # ## Selecting the tables # ### Listing the tables from the 'High-tech industry and knowledge-intensive services (**htec**)' database # In[9]: dflow.write().dataflow[dflow.write().dataflow.index.str.startswith('htec') == True] # ### The tables I am going to analyze # In[10]: id1 = 'htec_sti_exp2' # In[11]: dflow.write().dataflow.loc[id1][0] # In[12]: id2 = 'htec_vci_stage2' # In[13]: dflow.write().dataflow.loc[id2][0] # # Venture capital data # ### Getting the table's dataflow definition and datastructure # In[14]: dsd_resp2, dsd_id2, dsd2 = dsd_resp(id2) # ### Downloading the table's datastructure definition # In[15]: dsd_att(dsd2) # Codelist: # In[16]: dsd_resp2.write().codelist.loc[:,:] # [['EXPEND', 'UNIT'],:] # ### Requesting the data # In[17]: dresp2 = estat.get(resource_type = 'data', resource_id = id2, params = {'references': None, #'startPeriod': '2014', #'endPeriod': '2014' }) # In[18]: dat2 = dresp2.data # In[19]: dat2.dim_at_obs # In[20]: ser2 = list(dat2.series) len(ser2) # In[21]: ser2[5].key # In[22]: set(s.key.UNIT for s in dat2.series) # ### Writing the data into DataFrame # In[23]: iter2 = (s for s in dat2.series if (s.key.GEO in ['EU15']) == False) tab2 = dresp2.write(iter2) tab2 = tab2.stack('GEO') tab2.columns = tab2.columns.droplevel(['FREQ']) # Dropping 'FREQ' and 'UNIT' levels tab2.swaplevel('UNIT', 'EXPEND', axis = 1) tab2 = tab2.loc['2015', ['PC_GDP', 'MIO_EUR']] # tab2.dropna(axis = 0, how = 'any', inplace = True) tab2.index = tab2.index.droplevel(0) # Dropping the 'TIME_PERIOD' axis tab2 # In[32]: source = ColumnDataSource(data = dict(names = list(tab2.index), invest = tab2.loc[:,('PC_GDP','INV')], venture = tab2.loc[:,('PC_GDP','INV_VEN')])) p = figure(plot_width = 600, plot_height = 600) p.square(x = tab2.loc[:,('PC_GDP','INV')], y = tab2.loc[:,('PC_GDP','INV_VEN')], size = list(tab2.loc[:,('MIO_EUR','INV')] / 500), color = "blue", alpha = 0.6) p.xaxis.axis_label = 'Total R&D Investment' p.yaxis.axis_label = 'Venture Capital R&D Investment' labels = LabelSet(x = 'invest', y = 'venture', text = 'names', source = source, render_mode='canvas', level='glyph') p.add_layout(labels) show(p) # In[ ]: