# Importing the classes from 'gydelt'
from gydelt.gydelt import GetData, ProcessData
# Create a GetData object
GD = GetData()
# Read the sample file (obtained from the GKG Exporter), asking for the
# 'Date' column to be parsed as dates
data = GD.read_from_file(path='sample data/fromGKG.txt', parse_dates=['Date'])
# Preview the first 5 rows
data.head(5)
Date | NumArticles | Counts | Themes | Locations | Persons | Organizations | ToneData | CAMEOEvents | Sources | SourceURLs | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-12-01 | 1 | NaN | DRONES;NATURAL_DISASTER;NATURAL_DISASTER_ICY;U... | 2#California, United States#US#USCA#36.17#-119... | steve chien;andrew thompson | artificial intelligence group;propulsion labor... | -1.16279069767442,1.62790697674419,2.790697674... | NaN | sify.com | http://www.sify.com/news/nasas-intelligent-und... |
1 | 2016-12-01 | 1 | SEIZE#85##1#Ireland#EI#EI#53#-8#EI; | TAX_FNCACT;TAX_FNCACT_CHILD;RAPE;WB_2024_ANTI_... | 1#Ireland#EI#EI#53#-8#EI;1#Germany#GM#GM#51#9#... | claudia peersman;awais rashid | p networks;german research centre for artifici... | -4.33815350389321,1.33481646273637,5.672969966... | 604376665,604109997 | wired.co.uk | http://www.wired.co.uk/article/ai-interpol-tra... |
2 | 2016-12-01 | 1 | NaN | EPU_ECONOMY_HISTORIC;ECON_STOCKMARKET;EDUCATIO... | 1#Japan#JA#JA#36#138#JA;4#Tokyo, Tokyo, Japan#... | tom foley | paxton center school;hampshire college in amhe... | 4.13533834586466,5.26315789473684,1.1278195488... | 604128888 | thelandmark.com | http://www.thelandmark.com/news/2016-12-01/Pax... |
3 | 2016-12-01 | 1 | NaN | UNGP_FORESTS_RIVERS_OCEANS;MEDIA_MSM;WB_566_EN... | 3#Moss Landing, California, United States#US#U... | steve chien;andrew thompson | propulsion laboratory;net enterprises;monterey... | -1.03626943005181,2.2020725388601,3.2383419689... | 604152212 | clarksvilleonline.com | http://www.clarksvilleonline.com/2016/12/01/na... |
4 | 2016-12-01 | 1 | NaN | DRONES;NATURAL_DISASTER;NATURAL_DISASTER_ICY;U... | 2#California, United States#US#USCA#36.17#-119... | andrew thompson;steve chien | artificial intelligence group;propulsion labor... | -1.30548302872063,2.088772845953,3.39425587467... | NaN | newkerala.com | http://www.newkerala.com/news/2016/fullnews-14... |
# Create a ProcessData object from the data frame read above
PD = ProcessData(data_frame=data)
# Call the wrapper function that pre-processes the whole data set
processed_data_1 = PD.pre_process()
# Preview the first 5 rows; the output shows the added columns
# (e.g. 'Countries', 'Tone', 'Positive Score', 'Negative Score', 'Polarity')
processed_data_1.head(5)
Time taken for pre-processing the data --> 0.49 seconds
Date | NumArticles | Counts | Themes | Locations | Persons | Organizations | ToneData | CAMEOEvents | Sources | SourceURLs | Countries | Tone | Positive Score | Negative Score | Polarity | Activity Reference Density | Self/Group Reference Density | Word Count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-12-01 | 1 | NaN | DRONES;NATURAL_DISASTER;NATURAL_DISASTER_ICY;U... | 2#California, United States#US#USCA#36.17#-119... | steve chien;andrew thompson | artificial intelligence group;propulsion labor... | -1.16279069767442,1.62790697674419,2.790697674... | NaN | sify.com | http://www.sify.com/news/nasas-intelligent-und... | United States | -1.162791 | 1.627907 | 2.790698 | 4.418605 | 24.186047 | 1.162791 | None |
1 | 2016-12-01 | 1 | SEIZE#85##1#Ireland#EI#EI#53#-8#EI; | TAX_FNCACT;TAX_FNCACT_CHILD;RAPE;WB_2024_ANTI_... | 1#Ireland#EI#EI#53#-8#EI;1#Germany#GM#GM#51#9#... | claudia peersman;awais rashid | p networks;german research centre for artifici... | -4.33815350389321,1.33481646273637,5.672969966... | 604376665,604109997 | wired.co.uk | http://www.wired.co.uk/article/ai-interpol-tra... | Ireland;France;Germany | -4.338154 | 1.334816 | 5.672970 | 7.007786 | 21.357063 | 0.556174 | None |
2 | 2016-12-01 | 1 | NaN | EPU_ECONOMY_HISTORIC;ECON_STOCKMARKET;EDUCATIO... | 1#Japan#JA#JA#36#138#JA;4#Tokyo, Tokyo, Japan#... | tom foley | paxton center school;hampshire college in amhe... | 4.13533834586466,5.26315789473684,1.1278195488... | 604128888 | thelandmark.com | http://www.thelandmark.com/news/2016-12-01/Pax... | Japan | 4.135338 | 5.263158 | 1.127820 | 6.390977 | 25.939850 | 0.751880 | None |
3 | 2016-12-01 | 1 | NaN | UNGP_FORESTS_RIVERS_OCEANS;MEDIA_MSM;WB_566_EN... | 3#Moss Landing, California, United States#US#U... | steve chien;andrew thompson | propulsion laboratory;net enterprises;monterey... | -1.03626943005181,2.2020725388601,3.2383419689... | 604152212 | clarksvilleonline.com | http://www.clarksvilleonline.com/2016/12/01/na... | United States;France | -1.036269 | 2.202073 | 3.238342 | 5.440415 | 25.259067 | 1.683938 | None |
4 | 2016-12-01 | 1 | NaN | DRONES;NATURAL_DISASTER;NATURAL_DISASTER_ICY;U... | 2#California, United States#US#USCA#36.17#-119... | andrew thompson;steve chien | artificial intelligence group;propulsion labor... | -1.30548302872063,2.088772845953,3.39425587467... | NaN | newkerala.com | http://www.newkerala.com/news/2016/fullnews-14... | United States | -1.305483 | 2.088773 | 3.394256 | 5.483029 | 25.587467 | 1.827676 | None |
# Flatten (one-hot encode) the 'Countries' column: each country becomes its
# own column (in the output shown, 1.0 where the country is mentioned and
# NaN otherwise)
processed_data_2 = PD.flat_column(columns=['Countries'])
# Preview the first 5 rows of the flattened data
processed_data_2.head(5)
Time taken for flattening the column(s) --> 0.58 seconds
Date | NumArticles | Counts | Themes | Locations | Persons | Organizations | ToneData | CAMEOEvents | Sources | ... | United Kingdom | United States | Uruguay | Uzbekistan | Venezuela | Vietnam | Western Sahara | Yemen | Zambia | Zimbabwe | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-12-01 | 1 | NaN | DRONES;NATURAL_DISASTER;NATURAL_DISASTER_ICY;U... | 2#California, United States#US#USCA#36.17#-119... | steve chien;andrew thompson | artificial intelligence group;propulsion labor... | -1.16279069767442,1.62790697674419,2.790697674... | NaN | sify.com | ... | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2016-12-01 | 1 | SEIZE#85##1#Ireland#EI#EI#53#-8#EI; | TAX_FNCACT;TAX_FNCACT_CHILD;RAPE;WB_2024_ANTI_... | 1#Ireland#EI#EI#53#-8#EI;1#Germany#GM#GM#51#9#... | claudia peersman;awais rashid | p networks;german research centre for artifici... | -4.33815350389321,1.33481646273637,5.672969966... | 604376665,604109997 | wired.co.uk | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 2016-12-01 | 1 | NaN | EPU_ECONOMY_HISTORIC;ECON_STOCKMARKET;EDUCATIO... | 1#Japan#JA#JA#36#138#JA;4#Tokyo, Tokyo, Japan#... | tom foley | paxton center school;hampshire college in amhe... | 4.13533834586466,5.26315789473684,1.1278195488... | 604128888 | thelandmark.com | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 2016-12-01 | 1 | NaN | UNGP_FORESTS_RIVERS_OCEANS;MEDIA_MSM;WB_566_EN... | 3#Moss Landing, California, United States#US#U... | steve chien;andrew thompson | propulsion laboratory;net enterprises;monterey... | -1.03626943005181,2.2020725388601,3.2383419689... | 604152212 | clarksvilleonline.com | ... | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 2016-12-01 | 1 | NaN | DRONES;NATURAL_DISASTER;NATURAL_DISASTER_ICY;U... | 2#California, United States#US#USCA#36.17#-119... | andrew thompson;steve chien | artificial intelligence group;propulsion labor... | -1.30548302872063,2.088772845953,3.39425587467... | NaN | newkerala.com | ... | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 154 columns
# How can flattening (one-hot encoding) be useful?
# Build boolean masks: True for rows whose 'India' / 'China' column equals 1
include_India = processed_data_2['India'] == 1
include_China = processed_data_2['China'] == 1
# It makes filtering much simpler (in the case of the data that GDELT provides):
# keep only the rows where both masks are True
required_data = processed_data_2[include_India & include_China]
# Report how many rows matched, then preview the first 5 of them
print('\n{} records have the mentions of both India and China'.format(required_data.shape[0]))
required_data.head(5)
67 records have the mentions of both India and China
Date | NumArticles | Counts | Themes | Locations | Persons | Organizations | ToneData | CAMEOEvents | Sources | ... | United Kingdom | United States | Uruguay | Uzbekistan | Venezuela | Vietnam | Western Sahara | Yemen | Zambia | Zimbabwe | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | 2016-12-03 | 1 | NaN | GENERAL_GOVERNMENT;SOVEREIGNTY;TAX_FNCACT;TAX_... | 1#Vietnam, Republic Of#VM#VM#16#106#VM;4#Dongh... | marco polo;longji dragon;zonghe guangbo jiaoto... | artificial intelligence laboratory;google;inst... | 1.01102104016759,2.36815739138355,1.3571363512... | NaN | wn.com | ... | NaN | 1.0 | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN |
138 | 2016-12-12 | 1 | NaN | TAX_FNCACT;TAX_FNCACT_LEADER;ENV_OIL;TAX_ECON_... | 1#China#CH#CH#35#105#CH;1#Mexico#MX#MX#23#-102... | unknown | broadcom;ericsson;ibm;facebook;visa;artificial... | 1.36570561456753,3.33839150227618,1.9726858877... | NaN | sharesinv.com | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
165 | 2016-12-14 | 1 | NaN | unknown | 3#University Of Louisville, Kentucky, United S... | deepmind alphago;demis hassabis;david kenny;jo... | houston methodist research institute;cybersecu... | 0.916380297823597,3.55097365406644,2.634593356... | NaN | techrepublic.com | ... | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
214 | 2016-12-19 | 1 | NaN | TECH_AUTOMATION;WB_1921_PRIVATE_SECTOR_DEVELOP... | 1#China#CH#CH#35#105#CH;2#New York, United Sta... | paul allen;jon talton;bill gates | boeing;allen institute for artificial intellig... | -0.900900900900901,2.5025025025025,3.403403403... | 609815768,609817195,609817718,609649509 | crosscut.com | ... | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
231 | 2016-12-21 | 1 | NaN | unknown | 1#United States#US#US#38#-97#US;1#China#CH#CH#... | baidu facebook;gabriele ketterl | ibm;united states artificial intelligence mach... | 0.809716599190283,3.96761133603239,3.157894736... | NaN | openpr.com | ... | 1.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 154 columns
# Also, since each component of 'ToneData' has been separated, visualization becomes a lot simpler
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline
# Resample 'Tone' (the first component of 'ToneData') to its monthly mean.
# The built-in Resampler.mean() is used instead of .apply(np.mean): it gives
# the same result, is the idiomatic form, and avoids the FutureWarning that
# newer pandas versions emit when a NumPy callable is passed to apply/agg.
# NOTE: pandas >= 2.2 prefers the alias 'ME' over 'M' for month-end frequency.
resampled = required_data[['Date', 'Tone']].resample(rule='M', on='Date').mean()
# Plot the monthly means as a line chart
resampled.plot(figsize=(10, 7), kind='line')
<matplotlib.axes._subplots.AxesSubplot at 0x1c5ce41e898>
# Save the data frame using the function in ProcessData; the output line
# that follows reports the path of the CSV file that was written
PD.save_data_frame()
# (or) save an explicitly chosen data frame through the GetData object:
# GD.save_data_frame(data_frame=processed_data_2)
Success: Data frame saved as - C:\Users\Mrinal Jain\gydelt\tutorial\Result(2017-08-29 14.52.09).csv