import os
import pyspark
from pyspark.sql import SQLContext
# Add the elasticsearch-hadoop jar
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /home/jovyan/elasticsearch-hadoop-6.2.2.jar pyspark-shell'
conf = pyspark.SparkConf()
# Point to the master.
conf.setMaster("spark://tweetsets.library.gwu.edu:7101")
import os
import pyspark
# Add the elasticsearch-hadoop jar
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /home/jovyan/elasticsearch-hadoop-6.2.2.jar pyspark-shell'
conf = pyspark.SparkConf()
# Point to the master.
conf.setMaster("spark://tweetsets.library.gwu.edu:7101")
conf.setAppName("ireland-8th-analysis")
conf.set("spark.driver.bindAddress", "0.0.0.0")
# Don't hog all of the cores.
conf.set("spark.cores.max", "3")
# Specify a port for the block manager (which runs as part of the worker). The range 7003-7028 is set
# to be open in the Spark worker container.
conf.set("spark.blockManager.port", "7003")
# create the context
sc = pyspark.SparkContext(conf=conf)
# Configure for ElasticSearch cluster and index.
es_conf = {"es.nodes": "tweetsets.library.gwu.edu",
"es.port": "9200",
"es.resource": "tweets-ba2157/doc",
"es.read.field.as.array.include": "hashtags,text,urls"}
sqlContext = SQLContext(sc)
tweets_df = sqlContext.read.format("org.elasticsearch.spark.sql").options(**es_conf).load()
tweets_df.createOrReplaceTempView("tweets")
tweets_df.count()
478303
hashtags_df = sqlContext.sql("SELECT hashtag, count(hashtag) from (SELECT explode(hashtags) hashtag FROM tweets) group by hashtag order by count(hashtag) desc")
hashtags_df.show(50, truncate=False)
+-------------------+--------------+ |hashtag |count(hashtag)| +-------------------+--------------+ |repealthe8th |84179 | |together4yes |57253 | |savethe8th |43656 | |8thref |35935 | |togetherforyes |18763 | |lovebothvoteno |18290 | |voteyes |8245 | |loveboth |8109 | |latelate |5920 | |men4yes |5884 | |votenotoabortion |5732 | |latelateshow |5584 | |voteno |5324 | |prolife |4596 | |hometovote |4342 | |lifecanvass |3637 | |trustwomen |3483 | |repeal |3450 | |ireland |3192 | |mybodymychoice |2903 | |私は黙らない0428 |2879 | |repealth8th |2614 | |jointherebellion |2602 | |abortion |2463 | |repeal8th |2258 | |rtept |1722 | |repealtheeighth |1601 | |register4yes |1445 | |prochoice |1423 | |repealfacts |1360 | |standupforlife |1322 | |itstime |1293 | |8thamendment |1283 | |praytoendabortion |1196 | |womensayno2abortion|1050 | |wakeupireland |1049 | |ourfuture |1049 | |savelives |980 | |menforyes |957 | |scotref |921 | |chooselife |912 | |yes |899 | |corksaysyes |893 | |tinylivesatstake |863 | |time4choice |832 | |istandwithnicola |820 | |repealmobile |758 | |votenoroadshow |757 | |loveboats |736 | |studentsforchoice |646 | +-------------------+--------------+ only showing top 50 rows
screen_name_df = sqlContext.sql("SELECT user_screen_name, count(user_screen_name) from tweets group by user_screen_name order by count(user_screen_name) desc")
screen_name_df.show(50, truncate=False)
+----------------+-----------------------+ |user_screen_name|count(user_screen_name)| +----------------+-----------------------+ |kaydnan |3449 | |Irishprolifer |2783 | |ShannonBlue |2433 | |mobyrne100 |2375 | |EmmaMurphy12150 |2278 | |Declan1497 |2225 | |MaryOGrady8 |2209 | |BernadetteComm1 |2164 | |Donnchadh32 |1823 | |EamonReilly_com |1727 | |Paul71 |1700 | |christi85573643 |1627 | |rosecaroline9 |1626 | |MeathRight2Life |1594 | |DLTogether4Yes |1587 | |MandyGall7 |1559 | |laurathornton30 |1545 | |MaryThorn85 |1528 | |babydollirish2 |1474 | |ebt51 |1430 | |ExposeMediaBias |1419 | |WolfeTone15 |1397 | |IsabelCorcoran5 |1236 | |JanetOS_ |1164 | |renemccoll |1103 | |seamus6346 |1079 | |ElaineYoung94 |1039 | |daraghnoel |873 | |pnolan26 |872 | |PadraigSagart |809 | |NursepollyRgn |809 | |DrCollins10 |807 | |corkmankeane |789 | |Together4yes |773 | |marybuckley549 |764 | |8threfbot |763 | |Berlinnaeus |757 | |Thebfromtuam |754 | |loveboth8 |741 | |GiveBackMy_Mind |730 | |renebatt26 |723 | |Savethe8thInfo |721 | |theRallyforLife |718 | |TeilHarder |716 | |Colmogorman |700 | |paddylepage |685 | |repeal_shield |681 | |paddyearly |679 | |firstlady10000 |678 | |IrelandStandUp |664 | +----------------+-----------------------+ only showing top 50 rows
screen_name_orig_df = sqlContext.sql("SELECT user_screen_name, count(user_screen_name) from tweets where tweet_type='original' group by user_screen_name order by count(user_screen_name) desc")
screen_name_orig_df.show(50, truncate=False)
+----------------+-----------------------+ |user_screen_name|count(user_screen_name)| +----------------+-----------------------+ |ebt51 |626 | |loveboth8 |546 | |Savethe8thInfo |516 | |testisfidelis |417 | |Together4yes |287 | |Thebfromtuam |263 | |LoveLifeLove8th |246 | |AllPassingThing |218 | |lifeinstitute |209 | |JanetOS_ |183 | |MarieAFlaherty |148 | |InHerIrishShoes |145 | |PacifistIreland |143 | |repeal8thfunds |142 | |RosForChoice |142 | |tvcritics |135 | |StopRepealHate |125 | |MandyGall7 |122 | |TarynDeVere |122 | |paddyearly |110 | |AnnieKatelynch |109 | |rocknrollok |109 | |LawlessRoisin |106 | |TFYEastCork |106 | |EmmaMurphy12150 |102 | |laurathornton30 |99 | |TheUSI |99 | |Donnchadh32 |98 | |GendercideNews |98 | |prolifecampaign |97 | |Irishprolifer |94 | |TipperaryForYes |94 | |LeanneWoodfull |91 | |speakofIMELDA |90 | |john_mcguirk |89 | |Colmogorman |89 | |ZazaFL |88 | |DroghedaT4Y |87 | |IrelandStandUp |87 | |SpotlightEire |84 | |AmnestyIreland |82 | |StopViolenceIRL |81 | |Paul71 |81 | |CoraSherlock |79 | |WingnutParody |75 | |Berlinnaeus |75 | |RepealExpose |73 | |NualaDonnellan |73 | |ExposeMediaBias |72 | |SK4Repeal |71 | +----------------+-----------------------+ only showing top 50 rows
urls_df = sqlContext.sql("SELECT url, count(url) from (SELECT explode(urls) url FROM tweets) where not url like 'http://twitter.com%' group by url order by count(url) desc")
urls_df.show(50, truncate=False)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+ |url |count(url)| +-----------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+ |http://checktheregister.ie |569 | |http://undecided8.org |314 | |http://www.irishtimes.com/opinion/anti-abortion-posters-fail-to-take-account-of-life-1.3470187 |294 | |http://bit.ly/2keysma |292 | |http://www.checktheregister.ie |279 | |http://jrnl.ie/3986043t |163 | |http://crowdfund.togetherforyes.ie |150 | |http://www.checktheregister.ie/publicpages/default.aspx?uilang= |137 | |http://youtu.be/itsxbbkp-tq |135 | |http://adoption.ie/wp-content/uploads/2018/04/ara-position-paper-on-8th-amendment.pdf |131 | |http://www.irishtimes.com/news/politics/obstetricians-body-recommends-yes-vote-in-abortion-referendum-1.3473125 |127 | |http://www.thejournal.ie/together-for-yes-crowdfunding-3957637-apr2018/ |114 | |http://goo.gl/3qa2n5 |113 | |http://www.save8.ie/donate |109 | |http://twibbon.com/support/loveboth-vote-no/twitter |101 | |http://bit.ly/2jp6u0l |93 | |http://bit.ly/2itifbo |91 | |http://www.youtube.com/watch?v=itsxbbkp-tq&sns=tw |84 | |http://save8.ie |82 | |http://www.irishtimes.com/opinion/graham-linehan-men-must-play-their-part-in-repealing-the-eighth-1.3481645 |77 | |http://xytex.com |76 | |http://philippaandneil.wordpress.com |71 | |http://tinyurl.com/ybfp9gpz |71 | |http://trinitynews.ie/katie-ascoughs-cynical-worldview/ |70 | |http://goo.gl/7p2f9x |69 | |http://bit.ly/2rjqt0t |69 | |http://bit.ly/2hzhf4f |69 | |http://clarechampion.ie/i-was-abandoned-by-the-irish-state/ |67 | |http://www.togetherforyes.ie/donate |66 | |http://togetherforyes.causevox.com/ |66 | |http://youtu.be/cw_ylrol_70 |63 | |http://www.jpands.org/vol22no4/coleman.pdf |62 | |http://www.togetherforyes.ie/canvassing-information-contacts/ |61 | |http://youtu.be/8acbuqzkq80 |60 | |http://www.the-pool.com/health/wombs-etc/2018/17/caroline-o-donoghue-one-month-before-ireland-abortion-referendum |58 | |http://bit.ly/2rsxltw |58 | |http://www.liveaction.org/news/abortion-facility-misleads-women-deceiving-depictions-fetal-development/?utm_content=70471828&utm_medium=social&utm_source=twitter|58 | |http://www.irishtimes.com/news/ireland/irish-news/eighth-amendment-causing-uncertainty-for-doctors-gynaecologist-1.3478274 |57 | |http://www.togetherforyes.ie/register4yes/ |57 | |http://www.irishtimes.com/culture/music/u2-support-repeal-of-eighth-amendment-on-eve-of-new-tour-1.3481713 |56 | |http://bit.ly/2qcfkxx |55 | |http://youtu.be/s4pk1d0ob1w |52 | |http://unitedtrusts.com |52 | |http://www.irishtimes.com/news/social-affairs/heart-failure-patient-unable-to-get-abortion-as-life-not-at-immediate-risk-1.3475728 |52 | |http://www.thetimes.co.uk/article/my-options-were-to-feel-my-unborn-child-die-or-watch-her-die-jb8r5jqdz?sharetoken=1bd1e8800606f849d71f461b26aed059 |52 | |http://crowdfund.togetherforyes.ie/ |50 | |http://hometovote.com |49 | |http://chooselife2018.ie |48 | |http://www.pop.org/many-american-women-felt-pressured-abortions-study-finds/ |48 | |http://goo.gl/hhp1ds |48 | +-----------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+ only showing top 50 rows
tz_df = sqlContext.sql("SELECT user_time_zone, count(user_time_zone) FROM tweets group by user_time_zone order by count(user_time_zone) desc")
tz_df.show(10, truncate=False)
+--------------------------+---------------------+ |user_time_zone |count(user_time_zone)| +--------------------------+---------------------+ |Dublin |113521 | |Pacific Time (US & Canada)|37240 | |London |30191 | |Amsterdam |19888 | |Casablanca |15636 | |Europe/Dublin |8694 | |Eastern Time (US & Canada)|7565 | |Hawaii |4303 | |Central Time (US & Canada)|3470 | |Europe/London |1890 | +--------------------------+---------------------+ only showing top 10 rows
lang_df = sqlContext.sql("SELECT user_language, count(user_language) FROM tweets group by user_language order by count(user_language) desc")
lang_df.show(10, truncate=False)
+-------------+--------------------+ |user_language|count(user_language)| +-------------+--------------------+ |en |441308 | |en-gb |19521 | |ja |4473 | |en-GB |4193 | |es |2195 | |fr |1582 | |ga |992 | |de |878 | |it |812 | |ru |517 | +-------------+--------------------+ only showing top 10 rows
rt_df = sqlContext.sql("SELECT CONCAT('https://twitter.com/', retweeted_quoted_screen_name, '/status/', retweet_quoted_status_id), count(retweet_quoted_status_id) FROM tweets group by retweet_quoted_status_id, retweeted_quoted_screen_name order by count(retweet_quoted_status_id) desc")
rt_df.show(25, truncate=False)
+----------------------------------------------------------------------------------------------+-------------------------------+ |concat(https://twitter.com/, retweeted_quoted_screen_name, /status/, retweet_quoted_status_id)|count(retweet_quoted_status_id)| +----------------------------------------------------------------------------------------------+-------------------------------+ |https://twitter.com/Together4yes/status/993174221265174529 |4549 | |https://twitter.com/markohalloran/status/987318846980751360 |2251 | |https://twitter.com/campaignforleo/status/987646457045020672 |1958 | |https://twitter.com/amyhuberman/status/987303602514530304 |1614 | |https://twitter.com/Sarah_Hyland/status/993520504052092928 |1474 | |https://twitter.com/campaignforleo/status/988744931501133825 |1261 | |https://twitter.com/NursepollyRgn/status/985280763942916096 |1222 | |https://twitter.com/SimonHarrisTD/status/989998862164164609 |1221 | |https://twitter.com/Iam_here_2018/status/989056513753874433 |1128 | |https://twitter.com/RealJamesWoods/status/993575065332600834 |998 | |https://twitter.com/davidmcw/status/988357892481929216 |994 | |https://twitter.com/Stephanenny/status/986860375667888128 |942 | |https://twitter.com/itsclairekane/status/985071490709110784 |904 | |https://twitter.com/Together4yes/status/984890051837472768 |883 | |https://twitter.com/sineadgleeson/status/992389180461404161 |855 | |https://twitter.com/obianuju/status/993398334772703232 |841 | |https://twitter.com/aoifegracemoore/status/986295031408578560 |840 | |https://twitter.com/Cllr_Campbell/status/988738546612817920 |835 | |https://twitter.com/annakatclarke/status/990078608650723329 |815 | |https://twitter.com/Longford4Repeal/status/990262959472508933 |803 | |https://twitter.com/DervalORourke/status/985531080580231169 |746 | |https://twitter.com/campaignforleo/status/987644583440994304 |727 | |https://twitter.com/LdnIrishARC/status/988313775144136704 |678 | |https://twitter.com/adrianshanahan/status/988365011096625152 |671 | |https://twitter.com/LilaGraceRose/status/992155086049624064 |666 | +----------------------------------------------------------------------------------------------+-------------------------------+ only showing top 25 rows
from pyspark.ml.feature import RegexTokenizer, NGram, StopWordsRemover
from pyspark.sql.functions import sort_array, udf, explode
from pyspark.sql.types import ArrayType, StringType
# Text (using distinct)
text_df = tweets_df.select(explode("text").alias("text")).distinct()
# Tokenize
tokenizer = RegexTokenizer(pattern="([:\.!?,]|'s|’s)*\\s+[‘]*", inputCol="text", outputCol="words")
tokenized_df = tokenizer.transform(text_df)
# Stopwords
stop_words = StopWordsRemover.loadDefaultStopWords('english')
stop_words.extend(['rt', ' ', '-', '&', 'it’s', '', 'may', 'see', 'want', 'i’m', 'us', 'make', "we've", "you're", "you've", "don't", "i’ve", 'it', 'they’re', 'don’t', 'lets', 'add'])
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words", stopWords=stop_words)
filtered_df = remover.transform(tokenized_df)
# Remove hashtags and URLs and dupes
def clean(arr):
new_arr = set()
for item in arr:
add_to_arr = True
for startswith in ('#', 'http'):
if item.startswith(startswith):
add_to_arr = False
if add_to_arr:
new_arr.add(item)
return list(new_arr)
clean_udf = udf(lambda arr: clean(arr), ArrayType(StringType()))
clean_df = filtered_df.withColumn("clean_words", clean_udf(filtered_df.filtered_words))
# Sort the words
sorted_df = clean_df.select(sort_array('clean_words').alias('sorted_words'))
ngram = NGram(n=3, inputCol="sorted_words", outputCol="ngrams")
ngram_df = ngram.transform(sorted_df).select(explode('ngrams').alias('ngrams'))
ngram_df.groupBy('ngrams').count().orderBy('count', ascending=False).show(50, truncate=False)
+----------------------------------------------+-----+ |ngrams |count| +----------------------------------------------+-----+ |vote women yes |362 | |voting women yes |243 | |help please support |122 | |vote voting yes |117 | |8th abortion amendment |104 | |@josephamadigan @simonharristd @together4yes |99 | |care change compassion |88 | |@conmurphysport @gordonwdarcy @kevinmcgahern |85 | |@andyleeboxing @conmurphysport @gordonwdarcy |83 | |irish referendum urged |78 | |@gordonwdarcy @kevinmcgahern @richiesadlier |75 | |registered sure vote |74 | |please privilege right |74 | |perfect please privilege |73 | |life perfect please |72 | |@simonharristd @together4yes abortion |72 | |child every life |72 | |@kevinmcgahern @richiesadlier @together4yes |71 | |emigrants irish referendum |70 | |today vote yes |69 | |every life perfect |69 | |abortion emigrants irish |68 | |@campaignforleo @josephamadigan @simonharristd|67 | |people person rejected |65 | |little people person |65 | |voting woman yes |64 | |vote voting women |62 | |@ire201261 @irishredale1916 @ivorysiobhan |58 | |ireland life vote |58 | |@richardbrutontd @senatornoone @simonharristd |56 | |1 2 3 |55 | |together yes €500,000 |53 | |woman women yes |52 | |sometimes telling try |52 | |register today vote |51 | |form garda get |50 | |referendum register registered |49 | |oversimplify posters sometimes |49 | |posters sometimes telling |48 | |@nealerichmond @richardbrutontd @simonharristd|48 | |@ireland @ldnirishu4l @siobhka |48 | |neatly oversimplify posters |47 | |register registered sure |47 | |@nwci @orlanwci @repealeight |47 | |raise together yes |46 | |vote woman yes |46 | |8th @together4yes amendment |46 | |daily latest thanks |46 | |last night people |45 | |support vote yes |45 | +----------------------------------------------+-----+ only showing top 50 rows