Analysis of Tweets from Ireland 8th

Setup

In [1]:
import os
import pyspark
from pyspark.sql import SQLContext

# Add the elasticsearch-hadoop jar
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /home/jovyan/elasticsearch-hadoop-6.2.2.jar pyspark-shell'
conf = pyspark.SparkConf()

# Point to the master.
conf.setMaster("spark://tweetsets.library.gwu.edu:7101")
import os
import pyspark

# Add the elasticsearch-hadoop jar
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /home/jovyan/elasticsearch-hadoop-6.2.2.jar pyspark-shell'
conf = pyspark.SparkConf()

# Point to the master.
conf.setMaster("spark://tweetsets.library.gwu.edu:7101")
conf.setAppName("ireland-8th-analysis")
conf.set("spark.driver.bindAddress", "0.0.0.0")
# Don't hog all of the cores.
conf.set("spark.cores.max", "3")
# Specify a port for the block manager (which runs as part of the worker). The range 7003-7028 is set 
# to be open in the Spark worker container.
conf.set("spark.blockManager.port", "7003")

# create the context
sc = pyspark.SparkContext(conf=conf)

# Configure for ElasticSearch cluster and index.
es_conf = {"es.nodes": "tweetsets.library.gwu.edu",
           "es.port": "9200",
           "es.resource": "tweets-ba2157/doc",
           "es.read.field.as.array.include": "hashtags,text,urls"}

sqlContext = SQLContext(sc)
tweets_df = sqlContext.read.format("org.elasticsearch.spark.sql").options(**es_conf).load()
tweets_df.createOrReplaceTempView("tweets")

Count

In [2]:
tweets_df.count()
Out[2]:
478303

Top hashtags

In [3]:
hashtags_df = sqlContext.sql("SELECT hashtag, count(hashtag) from (SELECT explode(hashtags) hashtag FROM tweets) group by hashtag order by count(hashtag) desc")
hashtags_df.show(50, truncate=False)
+-------------------+--------------+
|hashtag            |count(hashtag)|
+-------------------+--------------+
|repealthe8th       |84179         |
|together4yes       |57253         |
|savethe8th         |43656         |
|8thref             |35935         |
|togetherforyes     |18763         |
|lovebothvoteno     |18290         |
|voteyes            |8245          |
|loveboth           |8109          |
|latelate           |5920          |
|men4yes            |5884          |
|votenotoabortion   |5732          |
|latelateshow       |5584          |
|voteno             |5324          |
|prolife            |4596          |
|hometovote         |4342          |
|lifecanvass        |3637          |
|trustwomen         |3483          |
|repeal             |3450          |
|ireland            |3192          |
|mybodymychoice     |2903          |
|私は黙らない0428         |2879          |
|repealth8th        |2614          |
|jointherebellion   |2602          |
|abortion           |2463          |
|repeal8th          |2258          |
|rtept              |1722          |
|repealtheeighth    |1601          |
|register4yes       |1445          |
|prochoice          |1423          |
|repealfacts        |1360          |
|standupforlife     |1322          |
|itstime            |1293          |
|8thamendment       |1283          |
|praytoendabortion  |1196          |
|womensayno2abortion|1050          |
|wakeupireland      |1049          |
|ourfuture          |1049          |
|savelives          |980           |
|menforyes          |957           |
|scotref            |921           |
|chooselife         |912           |
|yes                |899           |
|corksaysyes        |893           |
|tinylivesatstake   |863           |
|time4choice        |832           |
|istandwithnicola   |820           |
|repealmobile       |758           |
|votenoroadshow     |757           |
|loveboats          |736           |
|studentsforchoice  |646           |
+-------------------+--------------+
only showing top 50 rows

Top users by all tweet types

In [4]:
screen_name_df = sqlContext.sql("SELECT user_screen_name, count(user_screen_name) from tweets group by user_screen_name order by count(user_screen_name) desc")
screen_name_df.show(50, truncate=False)
+----------------+-----------------------+
|user_screen_name|count(user_screen_name)|
+----------------+-----------------------+
|kaydnan         |3449                   |
|Irishprolifer   |2783                   |
|ShannonBlue     |2433                   |
|mobyrne100      |2375                   |
|EmmaMurphy12150 |2278                   |
|Declan1497      |2225                   |
|MaryOGrady8     |2209                   |
|BernadetteComm1 |2164                   |
|Donnchadh32     |1823                   |
|EamonReilly_com |1727                   |
|Paul71          |1700                   |
|christi85573643 |1627                   |
|rosecaroline9   |1626                   |
|MeathRight2Life |1594                   |
|DLTogether4Yes  |1587                   |
|MandyGall7      |1559                   |
|laurathornton30 |1545                   |
|MaryThorn85     |1528                   |
|babydollirish2  |1474                   |
|ebt51           |1430                   |
|ExposeMediaBias |1419                   |
|WolfeTone15     |1397                   |
|IsabelCorcoran5 |1236                   |
|JanetOS_        |1164                   |
|renemccoll      |1103                   |
|seamus6346      |1079                   |
|ElaineYoung94   |1039                   |
|daraghnoel      |873                    |
|pnolan26        |872                    |
|PadraigSagart   |809                    |
|NursepollyRgn   |809                    |
|DrCollins10     |807                    |
|corkmankeane    |789                    |
|Together4yes    |773                    |
|marybuckley549  |764                    |
|8threfbot       |763                    |
|Berlinnaeus     |757                    |
|Thebfromtuam    |754                    |
|loveboth8       |741                    |
|GiveBackMy_Mind |730                    |
|renebatt26      |723                    |
|Savethe8thInfo  |721                    |
|theRallyforLife |718                    |
|TeilHarder      |716                    |
|Colmogorman     |700                    |
|paddylepage     |685                    |
|repeal_shield   |681                    |
|paddyearly      |679                    |
|firstlady10000  |678                    |
|IrelandStandUp  |664                    |
+----------------+-----------------------+
only showing top 50 rows

Top users by original tweets only

In [5]:
screen_name_orig_df = sqlContext.sql("SELECT user_screen_name, count(user_screen_name) from tweets where tweet_type='original' group by user_screen_name order by count(user_screen_name) desc")
screen_name_orig_df.show(50, truncate=False)
+----------------+-----------------------+
|user_screen_name|count(user_screen_name)|
+----------------+-----------------------+
|ebt51           |626                    |
|loveboth8       |546                    |
|Savethe8thInfo  |516                    |
|testisfidelis   |417                    |
|Together4yes    |287                    |
|Thebfromtuam    |263                    |
|LoveLifeLove8th |246                    |
|AllPassingThing |218                    |
|lifeinstitute   |209                    |
|JanetOS_        |183                    |
|MarieAFlaherty  |148                    |
|InHerIrishShoes |145                    |
|PacifistIreland |143                    |
|repeal8thfunds  |142                    |
|RosForChoice    |142                    |
|tvcritics       |135                    |
|StopRepealHate  |125                    |
|MandyGall7      |122                    |
|TarynDeVere     |122                    |
|paddyearly      |110                    |
|AnnieKatelynch  |109                    |
|rocknrollok     |109                    |
|LawlessRoisin   |106                    |
|TFYEastCork     |106                    |
|EmmaMurphy12150 |102                    |
|laurathornton30 |99                     |
|TheUSI          |99                     |
|Donnchadh32     |98                     |
|GendercideNews  |98                     |
|prolifecampaign |97                     |
|Irishprolifer   |94                     |
|TipperaryForYes |94                     |
|LeanneWoodfull  |91                     |
|speakofIMELDA   |90                     |
|john_mcguirk    |89                     |
|Colmogorman     |89                     |
|ZazaFL          |88                     |
|DroghedaT4Y     |87                     |
|IrelandStandUp  |87                     |
|SpotlightEire   |84                     |
|AmnestyIreland  |82                     |
|StopViolenceIRL |81                     |
|Paul71          |81                     |
|CoraSherlock    |79                     |
|WingnutParody   |75                     |
|Berlinnaeus     |75                     |
|RepealExpose    |73                     |
|NualaDonnellan  |73                     |
|ExposeMediaBias |72                     |
|SK4Repeal       |71                     |
+----------------+-----------------------+
only showing top 50 rows

Top URLs

In [6]:
urls_df = sqlContext.sql("SELECT url, count(url) from (SELECT explode(urls) url FROM tweets) where not url like 'http://twitter.com%' group by url order by count(url) desc")
urls_df.show(50, truncate=False)
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|url                                                                                                                                                              |count(url)|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|http://checktheregister.ie                                                                                                                                       |569       |
|http://undecided8.org                                                                                                                                            |314       |
|http://www.irishtimes.com/opinion/anti-abortion-posters-fail-to-take-account-of-life-1.3470187                                                                   |294       |
|http://bit.ly/2keysma                                                                                                                                            |292       |
|http://www.checktheregister.ie                                                                                                                                   |279       |
|http://jrnl.ie/3986043t                                                                                                                                          |163       |
|http://crowdfund.togetherforyes.ie                                                                                                                               |150       |
|http://www.checktheregister.ie/publicpages/default.aspx?uilang=                                                                                                  |137       |
|http://youtu.be/itsxbbkp-tq                                                                                                                                      |135       |
|http://adoption.ie/wp-content/uploads/2018/04/ara-position-paper-on-8th-amendment.pdf                                                                            |131       |
|http://www.irishtimes.com/news/politics/obstetricians-body-recommends-yes-vote-in-abortion-referendum-1.3473125                                                  |127       |
|http://www.thejournal.ie/together-for-yes-crowdfunding-3957637-apr2018/                                                                                          |114       |
|http://goo.gl/3qa2n5                                                                                                                                             |113       |
|http://www.save8.ie/donate                                                                                                                                       |109       |
|http://twibbon.com/support/loveboth-vote-no/twitter                                                                                                              |101       |
|http://bit.ly/2jp6u0l                                                                                                                                            |93        |
|http://bit.ly/2itifbo                                                                                                                                            |91        |
|http://www.youtube.com/watch?v=itsxbbkp-tq&sns=tw                                                                                                                |84        |
|http://save8.ie                                                                                                                                                  |82        |
|http://www.irishtimes.com/opinion/graham-linehan-men-must-play-their-part-in-repealing-the-eighth-1.3481645                                                      |77        |
|http://xytex.com                                                                                                                                                 |76        |
|http://philippaandneil.wordpress.com                                                                                                                             |71        |
|http://tinyurl.com/ybfp9gpz                                                                                                                                      |71        |
|http://trinitynews.ie/katie-ascoughs-cynical-worldview/                                                                                                          |70        |
|http://goo.gl/7p2f9x                                                                                                                                             |69        |
|http://bit.ly/2rjqt0t                                                                                                                                            |69        |
|http://bit.ly/2hzhf4f                                                                                                                                            |69        |
|http://clarechampion.ie/i-was-abandoned-by-the-irish-state/                                                                                                      |67        |
|http://www.togetherforyes.ie/donate                                                                                                                              |66        |
|http://togetherforyes.causevox.com/                                                                                                                              |66        |
|http://youtu.be/cw_ylrol_70                                                                                                                                      |63        |
|http://www.jpands.org/vol22no4/coleman.pdf                                                                                                                       |62        |
|http://www.togetherforyes.ie/canvassing-information-contacts/                                                                                                    |61        |
|http://youtu.be/8acbuqzkq80                                                                                                                                      |60        |
|http://www.the-pool.com/health/wombs-etc/2018/17/caroline-o-donoghue-one-month-before-ireland-abortion-referendum                                                |58        |
|http://bit.ly/2rsxltw                                                                                                                                            |58        |
|http://www.liveaction.org/news/abortion-facility-misleads-women-deceiving-depictions-fetal-development/?utm_content=70471828&utm_medium=social&utm_source=twitter|58        |
|http://www.irishtimes.com/news/ireland/irish-news/eighth-amendment-causing-uncertainty-for-doctors-gynaecologist-1.3478274                                       |57        |
|http://www.togetherforyes.ie/register4yes/                                                                                                                       |57        |
|http://www.irishtimes.com/culture/music/u2-support-repeal-of-eighth-amendment-on-eve-of-new-tour-1.3481713                                                       |56        |
|http://bit.ly/2qcfkxx                                                                                                                                            |55        |
|http://youtu.be/s4pk1d0ob1w                                                                                                                                      |52        |
|http://unitedtrusts.com                                                                                                                                          |52        |
|http://www.irishtimes.com/news/social-affairs/heart-failure-patient-unable-to-get-abortion-as-life-not-at-immediate-risk-1.3475728                               |52        |
|http://www.thetimes.co.uk/article/my-options-were-to-feel-my-unborn-child-die-or-watch-her-die-jb8r5jqdz?sharetoken=1bd1e8800606f849d71f461b26aed059             |52        |
|http://crowdfund.togetherforyes.ie/                                                                                                                              |50        |
|http://hometovote.com                                                                                                                                            |49        |
|http://chooselife2018.ie                                                                                                                                         |48        |
|http://www.pop.org/many-american-women-felt-pressured-abortions-study-finds/                                                                                     |48        |
|http://goo.gl/hhp1ds                                                                                                                                             |48        |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
only showing top 50 rows

Top timezones

In [7]:
tz_df = sqlContext.sql("SELECT user_time_zone, count(user_time_zone) FROM tweets group by user_time_zone order by count(user_time_zone) desc")
tz_df.show(10, truncate=False)
+--------------------------+---------------------+
|user_time_zone            |count(user_time_zone)|
+--------------------------+---------------------+
|Dublin                    |113521               |
|Pacific Time (US & Canada)|37240                |
|London                    |30191                |
|Amsterdam                 |19888                |
|Casablanca                |15636                |
|Europe/Dublin             |8694                 |
|Eastern Time (US & Canada)|7565                 |
|Hawaii                    |4303                 |
|Central Time (US & Canada)|3470                 |
|Europe/London             |1890                 |
+--------------------------+---------------------+
only showing top 10 rows

Top user languages

In [8]:
lang_df = sqlContext.sql("SELECT user_language, count(user_language) FROM tweets group by user_language order by count(user_language) desc")
lang_df.show(10, truncate=False)
+-------------+--------------------+
|user_language|count(user_language)|
+-------------+--------------------+
|en           |441308              |
|en-gb        |19521               |
|ja           |4473                |
|en-GB        |4193                |
|es           |2195                |
|fr           |1582                |
|ga           |992                 |
|de           |878                 |
|it           |812                 |
|ru           |517                 |
+-------------+--------------------+
only showing top 10 rows

Top retweets

In [9]:
rt_df = sqlContext.sql("SELECT CONCAT('https://twitter.com/', retweeted_quoted_screen_name, '/status/', retweet_quoted_status_id), count(retweet_quoted_status_id) FROM tweets group by retweet_quoted_status_id, retweeted_quoted_screen_name order by count(retweet_quoted_status_id) desc")
rt_df.show(25, truncate=False)
+----------------------------------------------------------------------------------------------+-------------------------------+
|concat(https://twitter.com/, retweeted_quoted_screen_name, /status/, retweet_quoted_status_id)|count(retweet_quoted_status_id)|
+----------------------------------------------------------------------------------------------+-------------------------------+
|https://twitter.com/Together4yes/status/993174221265174529                                    |4549                           |
|https://twitter.com/markohalloran/status/987318846980751360                                   |2251                           |
|https://twitter.com/campaignforleo/status/987646457045020672                                  |1958                           |
|https://twitter.com/amyhuberman/status/987303602514530304                                     |1614                           |
|https://twitter.com/Sarah_Hyland/status/993520504052092928                                    |1474                           |
|https://twitter.com/campaignforleo/status/988744931501133825                                  |1261                           |
|https://twitter.com/NursepollyRgn/status/985280763942916096                                   |1222                           |
|https://twitter.com/SimonHarrisTD/status/989998862164164609                                   |1221                           |
|https://twitter.com/Iam_here_2018/status/989056513753874433                                   |1128                           |
|https://twitter.com/RealJamesWoods/status/993575065332600834                                  |998                            |
|https://twitter.com/davidmcw/status/988357892481929216                                        |994                            |
|https://twitter.com/Stephanenny/status/986860375667888128                                     |942                            |
|https://twitter.com/itsclairekane/status/985071490709110784                                   |904                            |
|https://twitter.com/Together4yes/status/984890051837472768                                    |883                            |
|https://twitter.com/sineadgleeson/status/992389180461404161                                   |855                            |
|https://twitter.com/obianuju/status/993398334772703232                                        |841                            |
|https://twitter.com/aoifegracemoore/status/986295031408578560                                 |840                            |
|https://twitter.com/Cllr_Campbell/status/988738546612817920                                   |835                            |
|https://twitter.com/annakatclarke/status/990078608650723329                                   |815                            |
|https://twitter.com/Longford4Repeal/status/990262959472508933                                 |803                            |
|https://twitter.com/DervalORourke/status/985531080580231169                                   |746                            |
|https://twitter.com/campaignforleo/status/987644583440994304                                  |727                            |
|https://twitter.com/LdnIrishARC/status/988313775144136704                                     |678                            |
|https://twitter.com/adrianshanahan/status/988365011096625152                                  |671                            |
|https://twitter.com/LilaGraceRose/status/992155086049624064                                   |666                            |
+----------------------------------------------------------------------------------------------+-------------------------------+
only showing top 25 rows

Top trigrams (combinations of 3 words)

In [11]:
from pyspark.ml.feature import RegexTokenizer, NGram, StopWordsRemover
from pyspark.sql.functions import sort_array, udf, explode
from pyspark.sql.types import ArrayType, StringType


# Text (using distinct)
text_df = tweets_df.select(explode("text").alias("text")).distinct()

# Tokenize
tokenizer = RegexTokenizer(pattern="([:\.!?,]|'s|’s)*\\s+[‘]*", inputCol="text", outputCol="words")
tokenized_df = tokenizer.transform(text_df)

# Stopwords
stop_words = StopWordsRemover.loadDefaultStopWords('english')
stop_words.extend(['rt', ' ', '-', '&', 'it’s', '', 'may', 'see', 'want', 'i’m', 'us', 'make', "we've", "you're", "you've", "don't", "i’ve", 'it', 'they’re', 'don’t', 'lets', 'add'])
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words", stopWords=stop_words)
filtered_df = remover.transform(tokenized_df)

# Remove hashtags and URLs and dupes
def clean(arr):
    new_arr = set()
    for item in arr:
        add_to_arr = True
        for startswith in ('#', 'http'):
            if item.startswith(startswith):
                add_to_arr = False
        if add_to_arr:
            new_arr.add(item)
    return list(new_arr)

clean_udf = udf(lambda arr: clean(arr), ArrayType(StringType()))
clean_df = filtered_df.withColumn("clean_words", clean_udf(filtered_df.filtered_words))

# Sort the words
sorted_df = clean_df.select(sort_array('clean_words').alias('sorted_words'))

ngram = NGram(n=3, inputCol="sorted_words", outputCol="ngrams")
ngram_df = ngram.transform(sorted_df).select(explode('ngrams').alias('ngrams'))
ngram_df.groupBy('ngrams').count().orderBy('count', ascending=False).show(50, truncate=False)
+----------------------------------------------+-----+
|ngrams                                        |count|
+----------------------------------------------+-----+
|vote women yes                                |362  |
|voting women yes                              |243  |
|help please support                           |122  |
|vote voting yes                               |117  |
|8th abortion amendment                        |104  |
|@josephamadigan @simonharristd @together4yes  |99   |
|care change compassion                        |88   |
|@conmurphysport @gordonwdarcy @kevinmcgahern  |85   |
|@andyleeboxing @conmurphysport @gordonwdarcy  |83   |
|irish referendum urged                        |78   |
|@gordonwdarcy @kevinmcgahern @richiesadlier   |75   |
|registered sure vote                          |74   |
|please privilege right                        |74   |
|perfect please privilege                      |73   |
|life perfect please                           |72   |
|@simonharristd @together4yes abortion         |72   |
|child every life                              |72   |
|@kevinmcgahern @richiesadlier @together4yes   |71   |
|emigrants irish referendum                    |70   |
|today vote yes                                |69   |
|every life perfect                            |69   |
|abortion emigrants irish                      |68   |
|@campaignforleo @josephamadigan @simonharristd|67   |
|people person rejected                        |65   |
|little people person                          |65   |
|voting woman yes                              |64   |
|vote voting women                             |62   |
|@ire201261 @irishredale1916 @ivorysiobhan     |58   |
|ireland life vote                             |58   |
|@richardbrutontd @senatornoone @simonharristd |56   |
|1 2 3                                         |55   |
|together yes €500,000                         |53   |
|woman women yes                               |52   |
|sometimes telling try                         |52   |
|register today vote                           |51   |
|form garda get                                |50   |
|referendum register registered                |49   |
|oversimplify posters sometimes                |49   |
|posters sometimes telling                     |48   |
|@nealerichmond @richardbrutontd @simonharristd|48   |
|@ireland @ldnirishu4l @siobhka                |48   |
|neatly oversimplify posters                   |47   |
|register registered sure                      |47   |
|@nwci @orlanwci @repealeight                  |47   |
|raise together yes                            |46   |
|vote woman yes                                |46   |
|8th @together4yes amendment                   |46   |
|daily latest thanks                           |46   |
|last night people                             |45   |
|support vote yes                              |45   |
+----------------------------------------------+-----+
only showing top 50 rows