TweetID（status_id）をキーにして、Twitter APIからツイート本文を取得する

In [1]:

import numpy as np
import pandas as pd
import json
from requests_oauthlib import OAuth1Session
import time
import datetime
import configparser
from tqdm import tqdm_notebook

In [5]:

# Dataset source: http://bigdata.naist.jp/~ysuzuki/data/twitter/
# The file is read with header=None, so the column names are supplied explicitly.
raw_df = pd.read_csv(
    '../data/tweets_open.csv.bz2',
    header=None,
    names=[
        'id',
        'genre_id',
        'status_id',
        'is_both',
        'is_positive',
        'is_negative',
        'is_neutral',
        'is_irrelevant',
    ],
)
raw_df.head()

Out[5]:

	id	genre_id	status_id	is_negative	is_neutral
0	10025	10000	522407718091366400	1	1
1	10026	10000	522407768003592192	1	0
2	10027	10000	522408018642628609	1	1
3	10028	10000	522408394871672832	0	1
4	10029	10000	522408454778929153	0	1

In [10]:

# Column sums per genre. For the 0/1 label columns this is the number of
# flagged tweets; the sums of the id columns carry no meaning.
raw_df.groupby(by='genre_id').sum()

Out[10]:

	id	status_id	is_both	is_positive	is_negative	is_neutral	is_irrelevant
genre_id
10000	2.533529e+10	4.993758e+22	603.0	5650.0	9526.0	51404.0	18818.0
10001	8.759670e+09	1.036641e+22	479.0	1996.0	3060.0	3557.0	10027.0
10002	4.096559e+10	4.227049e+22	115.0	909.0	2128.0	12434.0	57611.0
10020	7.331238e+09	7.468428e+21	41.0	741.0	311.0	6894.0	4371.0
10021	1.559539e+11	5.147131e+22	343.0	3443.0	6074.0	44822.0	28622.0
10022	8.453269e+10	4.705838e+22	79.0	1499.0	933.0	13516.0	54459.0
10024	1.226563e+11	4.242347e+22	240.0	3881.0	3482.0	29866.0	35235.0
10025	1.278649e+11	4.487937e+22	49.0	949.0	1084.0	20417.0	47780.0
10026	1.080555e+11	4.715787e+22	75.0	744.0	4420.0	40787.0	25988.0

ジャンル10025（ルンバ）のうち、ネガティブまたはポジティブのどちらか一方のラベルだけが付いたツイートのみ集める

In [14]:

# Restrict to genre 10025 and to tweets carrying exactly one polarity label.
is_roomba = raw_df['genre_id'] == 10025
only_positive = (raw_df['is_positive'] == 1) & (raw_df['is_negative'] == 0)
only_negative = (raw_df['is_positive'] == 0) & (raw_df['is_negative'] == 1)

roomba_positive_df = raw_df[is_roomba & only_positive]  # 944 rows (author's note)
roomba_negative_df = raw_df[is_roomba & only_negative]  # 1079 rows (author's note)

# Positive rows first, then negative rows; the index is renumbered from 0.
roomba_df = pd.concat(
    [roomba_positive_df, roomba_negative_df],
    axis=0,
    ignore_index=True,
)
roomba_df.head()

Out[14]:

	id	genre_id	status_id	is_positive	is_neutral
0	1141159	10025	551949125961252864	1	1
1	1135337	10025	550852542322581507	1	0
2	1138242	10025	551413741560930304	1	0
3	1139529	10025	551692497269690368	1	0
4	1133634	10025	550318759898918913	1	0

In [15]:

# Status ids in roomba_df row order; each one is looked up through the API below.
tweet_ids = roomba_df['status_id'].values

In [16]:

# Twitter API credentials are kept in an INI file outside the notebook.
# read() returns the list of files it parsed, which becomes the cell output.
conf = configparser.ConfigParser()
conf.read(filenames='../config/setting.ini')

Out[16]:

['../config/setting.ini']

In [17]:

def get_tweet_text(tweet_id):
    """Fetch the text of one tweet from the Twitter REST API v1.1.

    Calls ``statuses/show.json`` with the credentials stored under the
    ``twitterapi`` section of the module-level ``conf``.

    Parameters
    ----------
    tweet_id : int or str
        Status id, sent as the ``id`` query parameter.

    Returns
    -------
    tuple
        ``(text, res)``. ``text`` is the ``text`` field of the JSON payload
        when the response status is 200, otherwise ``np.nan`` (also when a
        200 payload has no ``text`` key). ``res`` is the response object,
        returned so the caller can inspect its headers.
    """
    url = 'https://api.twitter.com/1.1/statuses/show.json'
    params = {'id': tweet_id}

    # Use the session as a context manager so it is closed after the request;
    # otherwise every call would leave an open session behind.
    with OAuth1Session(
        conf['twitterapi']['CONSUMER_KEY'],
        conf['twitterapi']['CONSUMER_SECRET'],
        conf['twitterapi']['TOKEN'],
        conf['twitterapi']['TOKEN_SECRET'],
    ) as twitter:
        res = twitter.get(url, params=params)

    if res.status_code != 200:
        return np.nan, res

    # A missing 'text' key yields NaN like any other failed lookup, rather
    # than raising KeyError and aborting a long-running fetch loop.
    return json.loads(res.text).get('text', np.nan), res

In [18]:

tweet_texts = []
for tweet_id in tqdm_notebook(tweet_ids):
    text, res = get_tweet_text(tweet_id)
    tweet_texts.append(text)

    # Sleep once the remaining API quota is used up
    # (900 requests per 15-minute window).
    remaining = res.headers.get('x-rate-limit-remaining')
    if remaining is None:
        # The response carried no rate-limit header.
        print('なんやろね')
        continue

    if int(remaining) <= 0:
        print(f'{datetime.datetime.now()}: 15分後に起きます。')
        time.sleep(15 * 60 + 2)
        print(f'{datetime.datetime.now()}: 起きました。')

HBox(children=(IntProgress(value=0, max=2023), HTML(value='')))

2019-05-05 18:26:33.633980: 15分後に起きます。
2019-05-05 18:41:35.680097: 起きました。
2019-05-05 18:44:33.241502: 15分後に起きます。
2019-05-05 18:59:35.290670: 起きました。

In [19]:

# Number of collected entries; the loop above appends one per tweet id.
len(tweet_texts)

Out[19]:

In [20]:

# Attach the fetched texts as a new column, in the same row order as the
# status ids they were fetched for. Entries are np.nan where the API call
# did not return HTTP 200.
roomba_df['tweet_text'] = tweet_texts

In [22]:

# Save the labelled tweets and their texts as a gzip-compressed CSV,
# leaving out the index column.
roomba_df.to_csv(
    '../data/roomba.csv.gz',
    index=False,
    compression='gzip',
)

In [ ]: