from pyspark import *

# NOTE(review): this file was originally a pasted PySpark shell transcript with
# the shell's output interleaved between statements, which made it invalid
# Python. The outputs are preserved below as `# ->` comments so the file is a
# runnable script. `sc` (the SparkContext) is assumed to be provided by the
# pyspark shell / driver environment — confirm before running standalone.

# Create a range of 0-99 (range(100) excludes the stop value).
num = range(100)

# Create an RDD in memory using the Spark context.
numbers = sc.parallelize(num)

# Evaluate basic actions on `numbers`.
numbers.count()   # -> 100
numbers.first()   # -> 0

# Filter into a new RDD of even numbers.
# This is a lazy transformation; nothing is computed until an action runs.
even = numbers.filter(lambda number: number % 2 == 0)

# Printing the RDD shows the lazy object, not its contents.
print(even)          # -> PythonRDD[2] at RDD at PythonRDD.scala:48

# Print the first 5 elements.
print(even.take(5))  # -> [0, 2, 4, 6, 8]

# Count the even numbers.
even.count()         # -> 50

# Create an RDD from a dataset stored in Azure Blob storage.
# This is the NYC Taxi trip data set (timestamps in the data show 2013-12,
# not 2003 as the original comment claimed).
trips = sc.textFile("wasb://data@cdspsparksamples.blob.core.windows.net/NYCTaxi/KDD2016/trip_data_12.csv")

# The first element is the CSV header row.
trips.first()
# -> 'medallion,hack_license,vendor_id,rate_code,store_and_fwd_flag,pickup_datetime,dropoff_datetime,passenger_count,trip_time_in_secs,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude'

# Filter the trips of one particular medallion holder and persist the result
# so repeated actions below don't re-read from blob storage.
medallion = trips.filter(lambda line: "0BD7C8F5BA12B88E0B67BED28BEA73D8" in line)
# BUG FIX: the original wrote `medallion.persist` without parentheses, which
# only evaluated the bound method (the transcript showed
# `<bound method RDD.persist of PythonRDD[10] ...>`) and never cached the RDD.
medallion.persist()

# First matching line of the filtered RDD.
medallion.first()
# -> '0BD7C8F5BA12B88E0B67BED28BEA73D8,9FD8F69F0804BDB5549F40E9DA1BE472,CMT,1,N,2013-12-26 17:37:16,2013-12-26 17:59:10,1,1314,8.50,-73.974045,40.743279,-73.870285,40.773304'

# Count the trips for this medallion.
medallion.count()    # -> 1048

# Build an RDD for a second medallion and count its trips.
medallion2 = trips.filter(lambda line: "D7D598CD99978BD012A87A76A7C891B7" in line)
medallion2.count()   # -> 1311

# Union combining the trips of medallion 1 and 2.
medallions = medallion.union(medallion2)
medallions.count()   # -> 2359 (= 1048 + 1311)