#!/usr/bin/env python
# coding: utf-8

# # Importing the libraries

# In[1]:


from pandasdmx import Request


# In[2]:


import pandas as pd


# In[3]:


from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet


# In[4]:


output_notebook()


# # Connecting to the webservice

# In[5]:


estat = Request('ESTAT')


# ### Downloading dataflow definitions

# In[6]:


dflow = estat.dataflow()


# ## Defining functions

# ### Data structure

# In[7]:


def dsd_resp(id):
    """
    The function takes a dataflow identifier and requests from the Eurostat SDMX api the dataflow's datastructure, datastructure id and datastructure response.
    """    
    # Defining the data set's dataflow based on the data set id.
    df = dflow.dataflow[id]
    
    # Printing the table's name
    print("The table's name:")
    print(dflow.write().dataflow.loc[id][0])
    
    # Acquiring the dataflow's datastructure id
    dsd_id = df.structure.id
    
    # Requesting for the dataflow's datastructure
    dsd_resp = estat.get(resource_type = 'datastructure', resource_id = dsd_id)
    
    # Requesting the dataflow's datastructure
    dsd = dsd_resp.datastructure[dsd_id]
    
    return dsd_resp, dsd_id, dsd


# ### Datastructure attributes

# In[8]:


def dsd_att(dsd):
    """The function takes a table datastructure definition and prints its measure, dimension and attribute lists."""
    # Measures
    print("Measures:")
    print(dsd.measures.aslist())
    
    # Dimensions
    print("\nDimensions:")
    print(dsd.dimensions.aslist())
    
    # Attributes
    print("\nAttributes:")
    print(dsd.attributes.aslist())


# ## Selecting the tables

# ### Listing the tables from the 'High-tech industry and knowledge-intensive services (**htec**)' database

# In[9]:


dflow.write().dataflow[dflow.write().dataflow.index.str.startswith('htec') == True]


# ### The tables I am going to analyze

# In[10]:


id1 = 'htec_sti_exp2'


# In[11]:


dflow.write().dataflow.loc[id1][0]


# In[12]:


id2 = 'htec_vci_stage2'


# In[13]:


dflow.write().dataflow.loc[id2][0]


# # Venture capital data

# ### Getting the table's dataflow definition and datastructure

# In[14]:


dsd_resp2, dsd_id2, dsd2 = dsd_resp(id2)


# ### Downloading the table's datastructure definition

# In[15]:


dsd_att(dsd2)


# Codelist:

# In[16]:


dsd_resp2.write().codelist.loc[:,:] # [['EXPEND', 'UNIT'],:]


# ### Requesting the data

# In[17]:


dresp2 = estat.get(resource_type = 'data',
                   resource_id = id2,
                   params = {'references': None,
                             #'startPeriod': '2014',
                             #'endPeriod': '2014'
                            })


# In[18]:


dat2 = dresp2.data


# In[19]:


dat2.dim_at_obs


# In[20]:


ser2 = list(dat2.series)
len(ser2)


# In[21]:


ser2[5].key


# In[22]:


set(s.key.UNIT for s in dat2.series)


# ### Writing the data into DataFrame

# In[23]:


iter2 = (s for s in dat2.series if (s.key.GEO in ['EU15']) == False)
tab2 = dresp2.write(iter2)
tab2 = tab2.stack('GEO')
tab2.columns = tab2.columns.droplevel(['FREQ']) # Dropping 'FREQ' and 'UNIT' levels
tab2.swaplevel('UNIT', 'EXPEND', axis = 1)
tab2 = tab2.loc['2015', ['PC_GDP', 'MIO_EUR']]
# tab2.dropna(axis = 0, how = 'any', inplace = True)
tab2.index = tab2.index.droplevel(0) # Dropping the 'TIME_PERIOD' axis
tab2


# In[32]:


source = ColumnDataSource(data = dict(names = list(tab2.index), invest = tab2.loc[:,('PC_GDP','INV')], venture = tab2.loc[:,('PC_GDP','INV_VEN')]))

p = figure(plot_width = 600, plot_height = 600)
p.square(x = tab2.loc[:,('PC_GDP','INV')], y = tab2.loc[:,('PC_GDP','INV_VEN')], size = list(tab2.loc[:,('MIO_EUR','INV')] / 500), color = "blue", alpha = 0.6)

p.xaxis.axis_label = 'Total R&D Investment'
p.yaxis.axis_label = 'Venture Capital R&D Investment'

labels = LabelSet(x = 'invest',
                  y = 'venture',
                  text = 'names',
                  source = source,
                  render_mode='canvas',
                  level='glyph')

p.add_layout(labels)
show(p)


# In[ ]: