print "http://localhost:4040/jobs/"
http://localhost:4040/jobs/
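# On Spark 2.1+ the context also exposes the UI address directly; the exact
# URL depends on your driver host, so treat this output as a sketch
sc.uiWebUrl
'http://localhost:4040'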
# Transformations such as map() and filter() are lazy: they return a new RDD without computing anything
rdd = sc.parallelize([1, 2, 3, 4])
rdd.map(lambda x: x * 2)
PythonRDD[1] at RDD at PythonRDD.scala:48
rdd.filter(lambda x: x % 2 == 0)
PythonRDD[2] at RDD at PythonRDD.scala:48
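# Nothing has executed yet; an action such as collect() forces the pipeline to run
rdd.map(lambda x: x * 2).collect()
[2, 4, 6, 8]
rdd.filter(lambda x: x % 2 == 0).collect()
[2, 4]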
# distinct() removes duplicates (and triggers a shuffle)
rdd = sc.parallelize([1, 4, 2, 2, 3])
rdd.distinct()
PythonRDD[8] at RDD at PythonRDD.scala:48
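# Because of the shuffle, the result order is not guaranteed; sort before comparing
sorted(rdd.distinct().collect())
[1, 2, 3, 4]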
# map() emits one element per input; flatMap() flattens the returned lists
rdd = sc.parallelize([1, 2, 3])
rdd.map(lambda x: [x, x + 5])
PythonRDD[10] at RDD at PythonRDD.scala:48
rdd.flatMap(lambda x: [x, x + 5])
PythonRDD[11] at RDD at PythonRDD.scala:48
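# Collecting both makes the difference visible
rdd.map(lambda x: [x, x + 5]).collect()
[[1, 6], [2, 7], [3, 8]]
rdd.flatMap(lambda x: [x, x + 5]).collect()
[1, 6, 2, 7, 3, 8]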
# Actions trigger computation and return results to the driver
rdd = sc.parallelize([1, 2, 3])
rdd.reduce(lambda a, b: a * b)
6
rdd.take(2)
[1, 2]
rdd.collect()
[1, 2, 3]
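# count() and first() are two more common actions
rdd.count()
3
rdd.first()
1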
# takeOrdered(n, key) returns the first n elements under the given ordering
rdd = sc.parallelize([5, 3, 1, 2])
rdd.takeOrdered(3, lambda s: -1 * s)
[5, 3, 2]
rdd.takeOrdered(3)
[1, 2, 3]
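# top(n) is the shorthand for the descending case
rdd.top(3)
[5, 3, 2]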
# Without caching, each count() re-reads the file
lines = sc.textFile("sample_text.txt", 4)
print(lines.count())
5
print(lines.count())
5
# cache() keeps the RDD in memory once the first action has computed it
lines = sc.textFile("sample_text.txt", 4)
lines.cache()
print(lines.count())
5
print(lines.count())
5
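# cache() only marks the RDD for storage; the first count() materializes it,
# and unpersist() would release it again
lines.is_cached
True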
# Pair RDDs: RDDs of (key, value) tuples
rdd = sc.parallelize([(1, 2), (3, 4)])
rdd.collect()
[(1, 2), (3, 4)]
# reduceByKey() merges the values for each key with the given function
rdd = sc.parallelize([(1, 2), (3, 4), (3, 6)])
rdd.reduceByKey(lambda a, b: a + b).collect()
[(1, 2), (3, 10)]
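# Any associative, commutative function works, e.g. keeping the largest value per key
sorted(rdd.reduceByKey(max).collect())
[(1, 2), (3, 6)]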
rdd = sc.parallelize([(1, "a"), (2, "c"), (1, "b")])
rdd.sortByKey().collect()
[(1, 'a'), (1, 'b'), (2, 'c')]
rdd.groupByKey().collect()
[(1, <pyspark.resultiterable.ResultIterable at 0x1066d5e50>), (2, <pyspark.resultiterable.ResultIterable at 0x1064d4750>)]
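# The grouped values are lazy iterables; materialize (and sort) them to inspect
sorted(rdd.groupByKey().mapValues(sorted).collect())
[(1, ['a', 'b']), (2, ['c'])]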
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2), ("a", 3)])
sorted(x.join(y).collect())
[('a', (1, 2)), ('a', (1, 3))]
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2)])
sorted(x.leftOuterJoin(y).collect())
[('a', (1, 2)), ('b', (4, None))]
x = sc.parallelize([("a", 1)])
y = sc.parallelize([("a", 2), ("b", 4)])
sorted(x.rightOuterJoin(y).collect())
[('a', (1, 2)), ('b', (None, 4))]
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2), ("c", 8)])
sorted(x.fullOuterJoin(y).collect())
[('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]
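# Relatedly, subtractByKey() returns the pairs whose keys appear only in the left RDD
sorted(x.subtractByKey(y).collect())
[('b', 4)]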