from pyspark import *

# NOTE(review): this file was originally a pasted PySpark shell transcript with
# the shell's output interleaved between statements, which made it invalid
# Python. The outputs are preserved below as `# ->` comments so the file is a
# runnable script. `sc` (the SparkContext) is assumed to be provided by the
# pyspark shell / driver environment — confirm before running standalone.

# Create a range of 0-99 (range(100) excludes the stop value).
num = range(100)

# Create an RDD in memory using the Spark context.
numbers = sc.parallelize(num)

# Evaluate basic actions on `numbers`.
numbers.count()   # -> 100
numbers.first()   # -> 0

# Filter into a new RDD of even numbers.
# This is a lazy transformation; nothing is computed until an action runs.
even = numbers.filter(lambda number: number % 2 == 0)

# Printing the RDD shows the lazy object, not its contents.
print(even)          # -> PythonRDD[2] at RDD at PythonRDD.scala:48

# Print the first 5 elements.
print(even.take(5))  # -> [0, 2, 4, 6, 8]

# Count the even numbers.
even.count()         # -> 50

# Create an RDD from a dataset stored in Azure Blob storage.
# This is the NYC Taxi trip data set (timestamps in the data show 2013-12,
# not 2003 as the original comment claimed).
trips = sc.textFile("wasb://data@cdspsparksamples.blob.core.windows.net/NYCTaxi/KDD2016/trip_data_12.csv")

# The first element is the CSV header row.
trips.first()
# -> 'medallion,hack_license,vendor_id,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,passenger_count,trip_time_in_secs,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude'

# Filter the trips of one particular medallion holder and persist the result
# so repeated actions below don't re-read from blob storage.
medallion = trips.filter(lambda line: "0BD7C8F5BA12B88E0B67BED28BEA73D8" in line)
# BUG FIX: the original wrote `medallion.persist` without parentheses, which
# only evaluated the bound method (the transcript showed
# `<bound method RDD.persist of PythonRDD[10] ...>`) and never cached the RDD.
medallion.persist()

# First matching line of the filtered RDD.
medallion.first()
# -> '0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,CMT,1,N,2013-12-26 17:37:16,2013-12-26 17:59:10,1,1314,8.50,-73.974045,40.743279,-73.870285,40.773304'

# Count the trips for this medallion.
medallion.count()    # -> 1048

# Build an RDD for a second medallion and count its trips.
medallion2 = trips.filter(lambda line: "D7D598CD99978BD012A87A76A7C891B7" in line)
medallion2.count()   # -> 1311

# Union combining the trips of medallion 1 and 2.
medallions = medallion.union(medallion2)
medallions.count()   # -> 2359 (= 1048 + 1311)