#!/usr/bin/env python
# coding: utf-8

"""Hadoop tutorial (IT-DB group): Spark DataFrame hands-on lab.

Hands-On 1 -- load a relational database table into a Spark DataFrame
using the JDBC data source against an Oracle database.
"""

import os
import urllib.request

from pyspark import SparkContext, SQLContext

# Download the Oracle JDBC driver.  The stdlib downloader is used instead of
# the original `get_ipython().system('curl ...')` call, which is undefined
# when this exported script runs outside IPython/Jupyter.
JDBC_JAR = '/tmp/ojdbc7.jar'
urllib.request.urlretrieve(
    'https://cernbox.cern.ch/index.php/s/e7NJ5jD3qZB7RNO/download',
    JDBC_JAR,
)

# This is only required in notebook mode; with spark-shell, spark-submit or
# pyspark you can directly pass --jars (e.g. pyspark --jars /tmp/ojdbc7.jar).
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars {} pyspark-shell'.format(JDBC_JAR)

# Create the SparkContext and SQLContext explicitly.
sc = SparkContext()
sqlContext = SQLContext(sc)

# Connection settings.  The original code embedded the literal tokens
# $PWD/$SERVER/$PORT in the URL (Python performs no shell-style expansion)
# and asked the reader to edit the source.  Reading them from environment
# variables is backward compatible -- the literal placeholders remain the
# defaults -- and keeps the password out of the source file.
db_password = os.environ.get('DB_PWD', '$PWD')
db_server = os.environ.get('DB_SERVER', '$SERVER')
db_port = os.environ.get('DB_PORT', '$PORT')

# Create the JDBC DataFrame; note how the driver class, connection URL,
# credentials (user/password in the thin URL) and table name are passed.
db_df = sqlContext.read.format('jdbc') \
    .options(
        driver='oracle.jdbc.driver.OracleDriver',
        url='jdbc:oracle:thin:meetup/{}@{}:{}/PIMT_RAC51.cern.ch'.format(
            db_password, db_server, db_port),
        dbtable='meetup_data',
    ) \
    .load()

# Read a few rows back from the DataFrame.
db_df.select("EVENT_ID", "EVENT_NAME").show(5)