# Entry point: create (or reuse) a SparkSession for this tutorial application.
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("tutorial-1").getOrCreate()
# SparkContext handle, needed for the low-level RDD API used throughout below.
sc = spark.sparkContext
# Distribute a small local Python list across the cluster as an RDD.
data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)
# Bare expression (notebook-style): displays the RDD type when run interactively.
type(distData)
# Path to a local text file, expected to hold one email address per line.
textFilePath = './emails.txt'
# sc.textFile lazily builds an RDD of lines (strings) from the file.
emails = sc.textFile(textFilePath)
# Bare expression (notebook-style): displays the RDD type when run interactively.
type(emails)
# Materialize the integers 1..10 locally, then distribute them over 2 partitions.
data = list(range(1, 11))
print(data)
numbers = sc.parallelize(data, 2)
# Printing an RDD object shows its description/lineage, not its contents.
print(numbers)
# collect() pulls every element back to the driver as a Python list.
print(numbers.collect())
# Keep only the even values (notebook-style bare expression; result discarded).
numbers.filter(lambda n: n % 2 == 0).collect()
# Exercise: filter the email lines, mirroring the even-number filter above.
# Any predicate works; here we keep only Gmail addresses — adjust to taste.
emails.filter(lambda email: email.endswith('@gmail.com')).collect()
# Bare expression: double each value (result is discarded outside a notebook).
numbers.map(lambda n: n * 2).collect()
# map() emits exactly one output per input, so each number yields one 2-item list.
mapped = numbers.map(lambda n: [n ** 2, n ** 3]).collect()
# flatMap() unrolls each returned list into one flat sequence of numbers.
flat_mapped = numbers.flatMap(lambda n: [n ** 2, n ** 3]).collect()
print(mapped)
print(flat_mapped)
# Hint use the python split() function
# Split each address at '@' into a (username, domain) key/value pair,
# making this a pair RDD so keys()/values() below work.
username_domain = emails.map(lambda email: (email.split('@')[0], email.split('@')[1]))
username_domain.collect()
# Pair-RDD helpers: keys() -> usernames, values() -> domains.
username_domain.keys().collect()
username_domain.values().collect()
# Classic word-count pattern: pair each element with 1, then sum counts per key.
data = ["a", "b", "a", "a", "b", "b", "b", "b"]
rdd = sc.parallelize(data)
pairRDD = rdd.map(lambda letter: (letter, 1))
# reduceByKey merges values of equal keys pairwise: ("a", 3), ("b", 5).
pairRDD.reduceByKey(lambda left, right: left + right).collect()
# do another mapping operation to make all domains in a list
# Wrap each domain in a one-element list so reduceByKey can concatenate the
# lists, collecting every domain seen for a given username key.
username_domain = username_domain.map(lambda pair: (pair[0], [pair[1]]))
print("** Results from mapping values to list")
print(username_domain.top(3))
print("\n** Results from reduceByKey ** ")
username_domain.reduceByKey(lambda a, b: a + b).collect()
# Release the SparkSession's cluster resources.
spark.stop()