Importing the libraries¶

In [1]:

from pandasdmx import Request

In [2]:

import pandas as pd

In [3]:

from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet

In [4]:

output_notebook()

Loading BokehJS ...

Connecting to the webservice¶

In [5]:

estat = Request('ESTAT')

Downloading dataflow definitions¶

In [6]:

dflow = estat.dataflow()

Defining functions¶

Data structure¶

In [7]:

def dsd_resp(id):
    """
    The function takes a dataflow identifier and requests from the Eurostat SDMX api the dataflow's datastructure, datastructure id and datastructure response.
    """    
    # Defining the data set's dataflow based on the data set id.
    df = dflow.dataflow[id]
    
    # Printing the table's name
    print("The table's name:")
    print(dflow.write().dataflow.loc[id][0])
    
    # Acquiring the dataflow's datastructure id
    dsd_id = df.structure.id
    
    # Requesting for the dataflow's datastructure
    dsd_resp = estat.get(resource_type = 'datastructure', resource_id = dsd_id)
    
    # Requesting the dataflow's datastructure
    dsd = dsd_resp.datastructure[dsd_id]
    
    return dsd_resp, dsd_id, dsd

Datastructure attributes¶

In [8]:

def dsd_att(dsd):
    """The function takes a table datastructure definition and prints its measure, dimension and attribute lists."""
    # Measures
    print("Measures:")
    print(dsd.measures.aslist())
    
    # Dimensions
    print("\nDimensions:")
    print(dsd.dimensions.aslist())
    
    # Attributes
    print("\nAttributes:")
    print(dsd.attributes.aslist())

Selecting the tables¶

Listing the tables from the 'High-tech industry and knowledge-intensive services (htec)' database¶

In [9]:

dflow.write().dataflow[dflow.write().dataflow.index.str.startswith('htec') == True]

Out[9]:

	name
dataflow
htec_cis3	Innovation in high-tech sectors in SMEs (CIS3)...
htec_cis4	Innovation in high-tech sectors (CIS 2004), EU...
htec_cis5	Innovation in high-tech sectors (CIS 2006), EU...
htec_cis6	Innovation in high-tech sectors (CIS 2008, CIS...
htec_eco_ent	Enterprises in high-tech sectors by NACE Rev.1...
htec_eco_ent2	Enterprises in high-tech sectors by NACE Rev.2...
htec_eco_sbs	Economic data in high-tech sectors by NACE Rev...
htec_eco_sbs2	Economic data in high-tech sectors by NACE Rev...
htec_emp_nat	Employment in technology and knowledge-intensi...
htec_emp_nat2	Employment in technology and knowledge-intensi...
htec_emp_nisced	Employment in technology and knowledge-intensi...
htec_emp_nisced2	Employment in technology and knowledge-intensi...
htec_emp_nisco	Employment in technology and knowledge-intensi...
htec_emp_nisco2	Employment in technology and knowledge-intensi...
htec_emp_reg	Employment in technology and knowledge-intensi...
htec_emp_reg2	Employment in technology and knowledge-intensi...
htec_emp_risced	Employment in technology and knowledge-intensi...
htec_emp_risced2	Employment in technology and knowledge-intensi...
htec_emp_risco	Employment in technology and knowledge-intensi...
htec_emp_risco2	Employment in technology and knowledge-intensi...
htec_emp_sbs	Employment statistics on high-tech industries ...
htec_emp_sbs2	Employment statistics on high-tech industries ...
htec_kia_emp	Annual data on employment in knowledge-intensi...
htec_kia_emp2	Annual data on employment in knowledge-intensi...
htec_si_exp4	High-tech exports - Exports of high technology...
htec_sti_exp	Business enterprise R&D expenditure in high-te...
htec_sti_exp2	Business enterprise R&D expenditure in high-te...
htec_sti_pers	Business enterprise R&D personnel in high-tech...
htec_sti_pers2	Business enterprise R&D personnel in high-tech...
htec_trd_group4	High-tech trade by high-tech group of products...
htec_trd_tot4	Total high-tech trade in million euro and as a...
htec_vci_stage1	Venture capital investment by aggregated stage...
htec_vci_stage2	Venture capital investment by detailed stage o...

The tables I am going to analyze¶

In [10]:

id1 = 'htec_sti_exp2'

In [11]:

dflow.write().dataflow.loc[id1][0]

Out[11]:

'Business enterprise R&D expenditure in high-tech sectors - NACE Rev. 2'

In [12]:

id2 = 'htec_vci_stage2'

In [13]:

dflow.write().dataflow.loc[id2][0]

Out[13]:

'Venture capital investment by detailed stage of development (from 2007, source: EVCA)'

Venture capital data¶

Getting the table's dataflow definition and datastructure¶

In [14]:

dsd_resp2, dsd_id2, dsd2 = dsd_resp(id2)

The table's name:
Venture capital investment by detailed stage of development (from 2007, source: EVCA)

Downloading the table's datastructure definition¶

In [15]:

dsd_att(dsd2)

Measures:
[PrimaryMeasure | OBS_VALUE]

Dimensions:
[Dimension | FREQ, Dimension | EXPEND, Dimension | UNIT, Dimension | GEO, TimeDimension | TIME_PERIOD]

Attributes:
[DataAttribute | OBS_FLAG, DataAttribute | OBS_STATUS]

Codelist:

In [16]:

dsd_resp2.write().codelist.loc[:,:] # [['EXPEND', 'UNIT'],:]

Out[16]:

		dim_or_attr	name
FREQ	FREQ	D	FREQ
	A	D	Annual
	D	D	Daily
	H	D	Half-year
	M	D	Monthly
	Q	D	Quarterly
	S	D	Semi-annual
	W	D	Weekly
EXPEND	EXPEND	D	EXPEND
	INV	D	Investment
	INV_BUY	D	Investment for buyout
	INV_GROW	D	Investment in growth stage
	INV_REPL	D	Investment for replacement
	INV_RESC	D	Investment for rescue/turnaround
	INV_VEN	D	Venture capital investment (seed, start-up and...
	INV_VEN_LATE	D	Investment in later stage
	INV_VEN_SEED	D	Investment in seed stage
	INV_VEN_STAR	D	Investment in start-up stage
UNIT	UNIT	D	UNIT
	MIO_EUR	D	Million euro
	NR_COMP	D	Number of companies
	PC_GDP	D	Percentage of gross domestic product (GDP)
GEO	GEO	D	GEO
	AT	D	Austria
	BE	D	Belgium
	BG	D	Bulgaria
	CH	D	Switzerland
	CZ	D	Czech Republic
	DE	D	Germany (until 1990 former territory of the FRG)
	DK	D	Denmark
	...	...	...
	FI	D	Finland
	FR	D	France
	HU	D	Hungary
	IE	D	Ireland
	IT	D	Italy
	LU	D	Luxembourg
	NL	D	Netherlands
	NO	D	Norway
	PL	D	Poland
	PT	D	Portugal
	RO	D	Romania
	SE	D	Sweden
	UK	D	United Kingdom
OBS_FLAG	OBS_FLAG	A	Observation flag.
	b	A	break in time series
	c	A	confidential
	d	A	definition differs, see metadata
	e	A	estimated
	f	A	forecast
	i	A	see metadata (phased out)
	n	A	not significant
	p	A	provisional
	r	A	revised
	s	A	Eurostat estimate (phased out)
	u	A	low reliability
	z	A	not applicable
OBS_STATUS	OBS_STATUS	A	Observation status.
	-	A	not applicable or real zero or zero by default
	0	A	less than half of the unit used
	na	A	not available

63 rows × 2 columns

Requesting the data¶

In [17]:

dresp2 = estat.get(resource_type = 'data',
                   resource_id = id2,
                   params = {'references': None,
                             #'startPeriod': '2014',
                             #'endPeriod': '2014'
                            })

In [18]:

dat2 = dresp2.data

In [19]:

dat2.dim_at_obs

Out[19]:

'TIME_PERIOD'

In [20]:

ser2 = list(dat2.series)
len(ser2)

Out[20]:

In [21]:

ser2[5].key

Out[21]:

SeriesKey(UNIT='MIO_EUR', EXPEND='INV', GEO='DE', FREQ='A')

In [22]:

set(s.key.UNIT for s in dat2.series)

Out[22]:

{'MIO_EUR', 'NR_COMP', 'PC_GDP'}

Writing the data into DataFrame¶

In [23]:

iter2 = (s for s in dat2.series if (s.key.GEO in ['EU15']) == False)
tab2 = dresp2.write(iter2)
tab2 = tab2.stack('GEO')
tab2.columns = tab2.columns.droplevel(['FREQ']) # Dropping 'FREQ' and 'UNIT' levels
tab2.swaplevel('UNIT', 'EXPEND', axis = 1)
tab2 = tab2.loc['2015', ['PC_GDP', 'MIO_EUR']]
# tab2.dropna(axis = 0, how = 'any', inplace = True)
tab2.index = tab2.index.droplevel(0) # Dropping the 'TIME_PERIOD' axis
tab2

Out[23]:

UNIT	MIO_EUR									PC_GDP
EXPEND	INV	INV_BUY	INV_GROW	INV_REPL	INV_RESC	INV_VEN	INV_VEN_LATE	INV_VEN_SEED	INV_VEN_STAR	INV	INV_BUY	INV_GROW	INV_REPL	INV_RESC	INV_VEN	INV_VEN_LATE	INV_VEN_SEED	INV_VEN_STAR
GEO
AT	109.0	27.0	54.0	0.0	0.0	27.0	5.0	10.0	13.0	0.032	NaN	NaN	NaN	NaN	0.008	NaN	NaN	NaN
BE	722.0	562.0	68.0	18.0	11.0	63.0	30.0	0.0	33.0	0.176	NaN	NaN	NaN	NaN	0.015	NaN	NaN	NaN
BG	16.0	11.0	5.0	0.0	0.0	1.0	0.0	0.0	1.0	0.036	NaN	NaN	NaN	NaN	0.002	NaN	NaN	NaN
CH	640.0	197.0	107.0	27.0	26.0	283.0	143.0	5.0	134.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
CZ	13.0	0.0	9.0	0.0	0.0	4.0	1.0	1.0	1.0	0.008	NaN	NaN	NaN	NaN	0.002	NaN	NaN	NaN
DE	5996.0	4515.0	554.0	165.0	6.0	757.0	330.0	44.0	383.0	0.198	NaN	NaN	NaN	NaN	0.025	NaN	NaN	NaN
DK	1162.0	832.0	35.0	5.0	0.0	290.0	53.0	8.0	230.0	0.437	NaN	NaN	NaN	NaN	0.109	NaN	NaN	NaN
EL	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000	NaN	NaN	NaN	NaN	0.000	NaN	NaN	NaN
ES	1051.0	667.0	152.0	81.0	45.0	105.0	40.0	3.0	62.0	0.097	NaN	NaN	NaN	NaN	0.010	NaN	NaN	NaN
FI	512.0	371.0	40.0	2.0	1.0	98.0	25.0	9.0	64.0	0.247	NaN	NaN	NaN	NaN	0.047	NaN	NaN	NaN
FR	9451.0	6848.0	1743.0	112.0	3.0	744.0	364.0	11.0	370.0	0.433	NaN	NaN	NaN	NaN	0.034	NaN	NaN	NaN
HU	118.0	94.0	0.0	0.0	0.0	24.0	3.0	3.0	18.0	0.109	NaN	NaN	NaN	NaN	0.022	NaN	NaN	NaN
IE	161.0	64.0	9.0	0.0	0.0	89.0	38.0	0.0	50.0	0.075	NaN	NaN	NaN	NaN	0.041	NaN	NaN	NaN
IT	1160.0	974.0	123.0	30.0	0.0	33.0	13.0	2.0	18.0	0.071	NaN	NaN	NaN	NaN	0.002	NaN	NaN	NaN
LU	74.0	32.0	0.0	0.0	0.0	41.0	7.0	0.0	34.0	0.141	NaN	NaN	NaN	NaN	0.079	NaN	NaN	NaN
NL	1708.0	1031.0	388.0	46.0	23.0	220.0	64.0	8.0	147.0	0.252	NaN	NaN	NaN	NaN	0.032	NaN	NaN	NaN
NO	1165.0	730.0	344.0	0.0	0.0	92.0	53.0	2.0	37.0	0.333	NaN	NaN	NaN	NaN	0.026	NaN	NaN	NaN
PL	802.0	653.0	115.0	5.0	0.0	29.0	15.0	3.0	12.0	0.188	NaN	NaN	NaN	NaN	0.007	NaN	NaN	NaN
PT	150.0	40.0	27.0	13.0	0.0	70.0	18.0	5.0	47.0	0.084	NaN	NaN	NaN	NaN	0.039	NaN	NaN	NaN
RO	50.0	46.0	2.0	0.0	0.0	1.0	1.0	0.0	0.0	0.031	NaN	NaN	NaN	NaN	0.001	NaN	NaN	NaN
SE	1722.0	1429.0	106.0	37.0	0.0	150.0	57.0	0.0	93.0	0.388	NaN	NaN	NaN	NaN	0.034	NaN	NaN	NaN
UK	20521.0	17382.0	2157.0	67.0	82.0	833.0	512.0	10.0	310.0	0.799	NaN	NaN	NaN	NaN	0.032	NaN	NaN	NaN

In [32]:

source = ColumnDataSource(data = dict(names = list(tab2.index), invest = tab2.loc[:,('PC_GDP','INV')], venture = tab2.loc[:,('PC_GDP','INV_VEN')]))

p = figure(plot_width = 600, plot_height = 600)
p.square(x = tab2.loc[:,('PC_GDP','INV')], y = tab2.loc[:,('PC_GDP','INV_VEN')], size = list(tab2.loc[:,('MIO_EUR','INV')] / 500), color = "blue", alpha = 0.6)

p.xaxis.axis_label = 'Total R&D Investment'
p.yaxis.axis_label = 'Venture Capital R&D Investment'

labels = LabelSet(x = 'invest',
                  y = 'venture',
                  text = 'names',
                  source = source,
                  render_mode='canvas',
                  level='glyph')

p.add_layout(labels)
show(p)

In [ ]: