#!/usr/bin/env python
# coding: utf-8

# # PySpark workaround for ML with `sparklyr`

# This notebook stems from [this one](./sparklyr_test2.ipynb), where we realized there is no method to `unnest` columns in `sparklyr`!
# Fortunately, [PySpark](https://spark.apache.org/docs/0.9.0/python-programming-guide.html) comes to the rescue.

# The following commands are adapted ('forked') from this great tutorial on sentiment analysis with Spark ML: [Material for Machine Learning Workshop Galicia 2016](http://nbviewer.jupyter.org/github/javicacheiro/machine_learning_galicia_2016/blob/master/notebooks/sentiment_analysis-amazon_books.ipynb).

# We import our data as a **Spark DataFrame**:

# In[2]:

# A SQLContext should already be available in this PySpark session.
type(sqlContext)


# In[1]:

# Load the binary-labelled Amazon reviews as a Spark DataFrame.
bin_reviews = sqlContext.read.json('amazon/bin_reviews.json')


# In[2]:

bin_reviews.printSchema()


# In[6]:

# Keep only the columns we need: review text, star rating and sentiment label.
select_reviews = bin_reviews.select('reviewText', 'overall', 'label')
select_reviews.show(2)


# ## Tokenizer

# In[4]:

from pyspark.ml.feature import Tokenizer

# Split each review into a list of lowercase word tokens.
tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")


# In[7]:

tokenized_reviews = tokenizer.transform(select_reviews)
tokenized_reviews.show(2)


# ## StopWordsRemover

# In[8]:

from pyspark.ml.feature import StopWordsRemover

# Drop common English stop words from the token lists.
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")


# In[9]:

removed_reviews = remover.transform(tokenized_reviews)
removed_reviews.show(2)

# Compare the tokens before and after stop-word removal for one review.
sample_review = removed_reviews.first()
print(sample_review['words'][:10])
print(sample_review['filtered'][:10])


# In[13]:

from pyspark.sql.functions import explode

# explode() unnests the array column into one row per word, which is exactly
# the operation missing in `sparklyr`.
unnested_reviews = removed_reviews.select('overall', 'label', explode("filtered").alias("word"))


# In[17]:

unnested_reviews.show(5)


# We save our dataframe for further use in our small `sparklyr` pipeline.
# Writing it out takes quite a while, so be patient!

# In[21]:

# unnested_reviews.write.json('unnested_reviews_json')
unnested_reviews.write.save('amazon/unnested_reviews_json', format='json', mode='overwrite')


# Return to the [sparklyr notebook](./sparklyr_test2.ipynb) to follow the rest of the pipeline!

# ## References

# - [PySpark Programming Guide](https://spark.apache.org/docs/0.9.0/python-programming-guide.html).
# - [PySpark cheatsheet](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PySpark_SQL_Cheat_Sheet_Python.pdf).
# - [Material for Machine Learning Workshop Galicia 2016](http://nbviewer.jupyter.org/github/javicacheiro/machine_learning_galicia_2016/blob/master/notebooks/sentiment_analysis-amazon_books.ipynb).
# - [PySpark Course](https://github.com/javicacheiro/pyspark_course).
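
# ## Appendix: sanity check of the saved output

# A minimal sketch, not part of the original pipeline: assuming the same Spark session and the output path used above, read the saved JSON back to confirm the exploded one-row-per-word schema before moving on to `sparklyr`. The `check_reviews` name is just for illustration.

# In[ ]:

# Read the exploded reviews back from the path written above.
check_reviews = sqlContext.read.json('amazon/unnested_reviews_json')

# Expect one row per (overall, label, word) triple.
check_reviews.printSchema()
check_reviews.show(5)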