#!/usr/bin/env python
# coding: utf-8

"""Hadoop tutorial (IT-DB group): Spark DataFrame hands-on lab.

Hands-On 1 -- load a relational database table into a Spark DataFrame
using the JDBC data source against an Oracle database.
"""

import os
import urllib.request

from pyspark import SparkContext, SQLContext

# Download the Oracle JDBC driver.  The stdlib downloader is used instead of
# the original `get_ipython().system('curl ...')` call, which is undefined
# when this exported script runs outside IPython/Jupyter.
JDBC_JAR = '/tmp/ojdbc7.jar'
urllib.request.urlretrieve(
    'https://cernbox.cern.ch/index.php/s/e7NJ5jD3qZB7RNO/download',
    JDBC_JAR,
)

# This is only required in notebook mode; with spark-shell, spark-submit or
# pyspark you can directly pass --jars (e.g. pyspark --jars /tmp/ojdbc7.jar).
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars {} pyspark-shell'.format(JDBC_JAR)

# Create the SparkContext and SQLContext explicitly.
sc = SparkContext()
sqlContext = SQLContext(sc)

# Connection settings.  The original code embedded the literal tokens
# $PWD/$SERVER/$PORT in the URL (Python performs no shell-style expansion)
# and asked the reader to edit the source.  Reading them from environment
# variables is backward compatible -- the literal placeholders remain the
# defaults -- and keeps the password out of the source file.
db_password = os.environ.get('DB_PWD', '$PWD')
db_server = os.environ.get('DB_SERVER', '$SERVER')
db_port = os.environ.get('DB_PORT', '$PORT')

# Create the JDBC DataFrame; note how the driver class, connection URL,
# credentials (user/password in the thin URL) and table name are passed.
db_df = sqlContext.read.format('jdbc') \
    .options(
        driver='oracle.jdbc.driver.OracleDriver',
        url='jdbc:oracle:thin:meetup/{}@{}:{}/PIMT_RAC51.cern.ch'.format(
            db_password, db_server, db_port),
        dbtable='meetup_data',
    ) \
    .load()

# Read a few rows back from the DataFrame.
db_df.select("EVENT_ID", "EVENT_NAME").show(5)