import numpy as np
import pandas as pd
import json
from requests_oauthlib import OAuth1Session
import time
import datetime
import configparser
from tqdm import tqdm_notebook
# http://bigdata.naist.jp/~ysuzuki/data/twitter/
# Load the labelled tweet table. The CSV carries no header row, so the
# column names are supplied explicitly; the bz2 compression is left for
# pandas to infer from the file extension.
raw_df = pd.read_csv(
    filepath_or_buffer='../data/tweets_open.csv.bz2',
    names=[
        'id',
        'genre_id',
        'status_id',
        'is_both',
        'is_positive',
        'is_negative',
        'is_neutral',
        'is_irrelevant',
    ],
    header=None,
)
raw_df.head()
id | genre_id | status_id | is_both | is_positive | is_negative | is_neutral | is_irrelevant | |
---|---|---|---|---|---|---|---|---|
0 | 10025 | 10000 | 522407718091366400 | 0 | 0 | 1 | 1 | 0.0 |
1 | 10026 | 10000 | 522407768003592192 | 0 | 0 | 1 | 0 | 0.0 |
2 | 10027 | 10000 | 522408018642628609 | 0 | 0 | 1 | 1 | 0.0 |
3 | 10028 | 10000 | 522408394871672832 | 0 | 0 | 0 | 1 | 0.0 |
4 | 10029 | 10000 | 522408454778929153 | 0 | 0 | 0 | 1 | 0.0 |
# Per-genre column totals. The is_* label flags look like 0/1 values, so
# their sums read as per-label row counts (TODO confirm); the summed id and
# status_id columns carry no meaning.
raw_df.groupby(by='genre_id').sum()
id | status_id | is_both | is_positive | is_negative | is_neutral | is_irrelevant | |
---|---|---|---|---|---|---|---|
genre_id | |||||||
10000 | 2.533529e+10 | 4.993758e+22 | 603.0 | 5650.0 | 9526.0 | 51404.0 | 18818.0 |
10001 | 8.759670e+09 | 1.036641e+22 | 479.0 | 1996.0 | 3060.0 | 3557.0 | 10027.0 |
10002 | 4.096559e+10 | 4.227049e+22 | 115.0 | 909.0 | 2128.0 | 12434.0 | 57611.0 |
10020 | 7.331238e+09 | 7.468428e+21 | 41.0 | 741.0 | 311.0 | 6894.0 | 4371.0 |
10021 | 1.559539e+11 | 5.147131e+22 | 343.0 | 3443.0 | 6074.0 | 44822.0 | 28622.0 |
10022 | 8.453269e+10 | 4.705838e+22 | 79.0 | 1499.0 | 933.0 | 13516.0 | 54459.0 |
10024 | 1.226563e+11 | 4.242347e+22 | 240.0 | 3881.0 | 3482.0 | 29866.0 | 35235.0 |
10025 | 1.278649e+11 | 4.487937e+22 | 49.0 | 949.0 | 1084.0 | 20417.0 | 47780.0 |
10026 | 1.080555e+11 | 4.715787e+22 | 75.0 | 744.0 | 4420.0 | 40787.0 | 25988.0 |
# Keep only genre 10025 (presumably the Roomba genre, going by the variable
# names -- TODO confirm) and only rows labelled exclusively positive or
# exclusively negative.
roomba_positive_df = raw_df.query(
    'genre_id==10025 and is_positive==1 and is_negative==0'
)  # 944 rows
roomba_negative_df = raw_df.query(
    'genre_id==10025 and is_positive==0 and is_negative==1'
)  # 1079 rows

# Stack positives first, then negatives, with a fresh 0..n-1 index.
roomba_df = pd.concat(
    [roomba_positive_df, roomba_negative_df],
    axis=0,
    ignore_index=True,
)
roomba_df.head()
id | genre_id | status_id | is_both | is_positive | is_negative | is_neutral | is_irrelevant | |
---|---|---|---|---|---|---|---|---|
0 | 1141159 | 10025 | 551949125961252864 | 0 | 1 | 0 | 1 | 0.0 |
1 | 1135337 | 10025 | 550852542322581507 | 0 | 1 | 0 | 0 | 0.0 |
2 | 1138242 | 10025 | 551413741560930304 | 0 | 1 | 0 | 0 | 0.0 |
3 | 1139529 | 10025 | 551692497269690368 | 0 | 1 | 0 | 0 | 0.0 |
4 | 1133634 | 10025 | 550318759898918913 | 0 | 1 | 0 | 0 | 0.0 |
# Status IDs to look up, in roomba_df row order.
tweet_ids = roomba_df['status_id'].values

# Parse the settings file holding the Twitter API credentials. read()
# returns the list of files it managed to parse.
conf = configparser.ConfigParser()
conf.read(filenames='../config/setting.ini')
['../config/setting.ini']
def get_tweet_text(tweet_id):
    """Fetch the text of a single tweet from the Twitter REST API v1.1.

    Args:
        tweet_id: Status ID of the tweet; sent as the ``id`` query
            parameter of ``statuses/show.json``.

    Returns:
        A ``(text, res)`` tuple. ``text`` is the ``'text'`` field of the
        JSON body when the response status is 200, otherwise ``np.nan``.
        ``res`` is the response object, returned in both cases so the
        caller can inspect its headers (e.g. rate-limit information).
    """
    url = 'https://api.twitter.com/1.1/statuses/show.json'
    params = {'id': tweet_id}
    credentials = conf['twitterapi']

    # A session is created per call; the ``with`` block closes it so its
    # connection pool is released instead of leaking one session per
    # tweet. The response body is already loaded when get() returns, so
    # ``res`` stays usable after the session is closed.
    with OAuth1Session(
        credentials['CONSUMER_KEY'],
        credentials['CONSUMER_SECRET'],
        credentials['TOKEN'],
        credentials['TOKEN_SECRET'],
    ) as twitter:
        res = twitter.get(url, params=params)

    if res.status_code == 200:
        return json.loads(res.text)['text'], res
    # Any non-200 status (deleted/protected tweet, rate limit, ...) yields
    # NaN so the caller can still append one entry per status ID.
    return np.nan, res
# Collect one entry per status ID (NaN where the lookup failed), pausing
# whenever the API reports that the current rate-limit window is used up.
tweet_texts = []
for tweet_id in tqdm_notebook(tweet_ids):
    text, res = get_tweet_text(tweet_id)
    tweet_texts.append(text)

    # Sleep once the remaining API quota is exhausted
    # (900 calls per 15-minute window).
    remaining = res.headers.get('x-rate-limit-remaining')
    if remaining is None:
        # The response carried no rate-limit header; report and move on.
        print('なんやろね')
        continue
    if int(remaining) <= 0:
        print(f'{datetime.datetime.now()}: 15分後に起きます。')
        # Full 15-minute window plus a 2-second safety margin.
        time.sleep(15 * 60 + 2)
        print(f'{datetime.datetime.now()}: 起きました。')
HBox(children=(IntProgress(value=0, max=2023), HTML(value='')))
2019-05-05 18:26:33.633980: 15分後に起きます。 2019-05-05 18:41:35.680097: 起きました。 2019-05-05 18:44:33.241502: 15分後に起きます。 2019-05-05 18:59:35.290670: 起きました。
# Sanity check: the fetch loop appends exactly one entry per status ID, so
# this count should equal the number of rows in roomba_df.
len(tweet_texts)
2023
# Attach the fetched texts as a new column (row order matches tweet_ids)
# and persist the result as a gzip-compressed CSV without the index.
roomba_df['tweet_text'] = tweet_texts
roomba_df.to_csv(
    path_or_buf='../data/roomba.csv.gz',
    index=False,
    compression='gzip',
)