import urllib2
import tensorflow as tf
import numpy as np
import findspark
from sklearn import linear_model
findspark.init()
from pyspark import SparkContext, SparkFiles, SQLContext
# Initialise Spark lazily and idempotently: re-running this script (or the
# notebook cell it came from) must not try to create a second SparkContext,
# which would raise. `locals()` at module level is the module namespace.
if 'sc' not in locals():
    sc = SparkContext()
if 'sqlContext' not in locals():
    sqlContext = SQLContext(sc)

# Word-count demo: one-column DataFrame of words, grouped and counted.
# Each element is a 1-tuple because createDataFrame expects row-like values.
wordsDF = sqlContext.createDataFrame(
    [('cat',), ('elephant',), ('rat',), ('rat',), ('cat',)], ['word'])
wordCountsDF = wordsDF.groupBy("word").count()
wordCountsDF.show()
# Fit y = W*x + b by gradient descent to synthetic data whose true
# parameters are W=0.1, b=0.3, using the TF1 graph/session API that this
# script is written against.
x_data = np.random.rand(100).astype(np.float32)
y_data = x_data * 0.1 + 0.3

W = tf.Variable(tf.random_uniform([1], -1.0, 1.0))  # random initial slope
b = tf.Variable(tf.zeros([1]))                      # zero initial intercept
y = W * x_data + b

loss = tf.reduce_mean(tf.square(y - y_data))        # mean squared error
optimizer = tf.train.GradientDescentOptimizer(0.5)
train = optimizer.minimize(loss)

# tf.initialize_all_variables() was deprecated in TF 1.x; the supported
# replacement is tf.global_variables_initializer().
init = tf.global_variables_initializer()

# Run the session as a context manager so its resources are released when
# the block exits (the original leaked the session).
with tf.Session() as sess:
    sess.run(init)
    for step in range(201):
        sess.run(train)
        if step % 20 == 0:
            # W and b should converge toward 0.1 and 0.3 respectively.
            print(step, sess.run(W), sess.run(b))
# Example output:
#   +--------+-----+
#   |    word|count|
#   +--------+-----+
#   |     rat|    2|
#   |     cat|    2|
#   |elephant|    1|
#   +--------+-----+
#   (0, array([-0.07621384], dtype=float32), array([ 0.56465578], dtype=float32))
#   (20, array([ 0.04050734], dtype=float32), array([ 0.33296335], dtype=float32))
#   (40, array([ 0.0856716], dtype=float32), array([ 0.30793899], dtype=float32))
#   (60, array([ 0.09654912], dtype=float32), array([ 0.30191207], dtype=float32))
#   (80, array([ 0.09916888], dtype=float32), array([ 0.30046052], dtype=float32))
#   (100, array([ 0.09979983], dtype=float32), array([ 0.30011091], dtype=float32))
#   (120, array([ 0.0999518], dtype=float32), array([ 0.30002671], dtype=float32))
#   (140, array([ 0.09998839], dtype=float32), array([ 0.30000645], dtype=float32))
#   (160, array([ 0.0999972], dtype=float32), array([ 0.30000156], dtype=float32))
#   (180, array([ 0.09999932], dtype=float32), array([ 0.30000037], dtype=float32))
#   (200, array([ 0.09999985], dtype=float32), array([ 0.3000001], dtype=float32))