from pymongo import MongoClient
from datetime import datetime
class DBConnector:
    """Build a MongoDB connection URI and hand out collection handles.

    A single ``MongoClient`` is created on first use and reused for every
    later ``mongoClient`` call until ``close_connection`` is called.
    """

    def __init__(self, db_name, usr, ip, psw):
        """Store the connection settings and build the connection URI.

        Args:
            db_name: Name of the database to connect to.
            usr: User name used for authentication.
            ip: Host (optionally ``host:port``) of the MongoDB server.
            psw: Password used for authentication.
        """
        # Local import keeps the class independent of the file-level imports.
        from urllib.parse import quote_plus

        self.db_name = db_name
        self.usr = usr
        self.psw = psw
        # Created lazily by mongoClient(); None means no client is open.
        self.mongo_client = None

        # Reserved characters (':', '/', '@', ...) in the credentials would
        # otherwise corrupt the URI, so they are percent-escaped.
        safe_usr = quote_plus(str(usr))
        safe_psw = quote_plus(str(psw))
        self.uri = 'mongodb://{0}:{1}@{2}/{3}'.format(
            safe_usr, safe_psw, ip, db_name)
        # Show where we connect to, but never echo the password.
        print('mongodb://{0}:***@{1}/{2}'.format(safe_usr, ip, db_name))

    def mongoClient(self, collection):
        """Return the handle for ``collection`` in the configured database.

        Args:
            collection: Name of the collection to access.

        Returns:
            The collection object obtained from the client by indexing with
            the database name and then the collection name.
        """
        # Reuse the existing client instead of opening a new one per call,
        # which would leave the previous client unclosed.
        if self.mongo_client is None:
            self.mongo_client = MongoClient(self.uri)
        db = self.mongo_client[self.db_name]
        return db[collection]

    def close_connection(self):
        """Close the client if one is open; do nothing otherwise."""
        if self.mongo_client is not None:
            self.mongo_client.close()
            # Forget the closed client so a later mongoClient() opens a new one.
            self.mongo_client = None
# Connection settings for the "twitter_public" database; the collection
# handle obtained here is what the queries in this file run against.
db_connector = DBConnector(
    db_name="twitter_public",
    usr="btw17_public_user",
    ip="10.6.13.55",
    psw="btw17_public",
)
c = "politicians_sample"
collection = db_connector.mongoClient(collection=c)
An example query that fetches the tweets created on 23 September 2017 from 19:00 (inclusive) to 22:00 (exclusive) is shown below.
from pprint import pprint

# Match documents whose createdAt is >= 19:00 and < 22:00 on 2017-09-23.
evening_window_filter = {
    "createdAt": {
        "$gte": datetime(2017, 9, 23, 19, 0, 0),
        "$lt": datetime(2017, 9, 23, 22, 0, 0),
    }
}
query = collection.find(evening_window_filter)

print("Print the first 5 elements")
pprint(list(query[0:5]))

# Number of documents matching the same filter.
collection.count_documents(evening_window_filter)
import pandas as pd
# Fetch every document in the 19:00-22:00 window of 2017-09-23 and load the
# results into a DataFrame.
query = collection.find(
    {
        "createdAt": {
            "$gte": datetime(2017, 9, 23, 19, 0, 0),
            "$lt": datetime(2017, 9, 23, 22, 0, 0),
        }
    }
)
tweets = pd.DataFrame(data=list(query))
# Bare expression; presumably there so a notebook cell displays the frame.
tweets
%matplotlib inline
# Aggregate per hour. value_counts() orders its result by frequency, so the
# result is re-sorted by the hourly period to keep the time axis chronological.
counts = tweets['createdAt'].dt.to_period('H').value_counts().sort_index()

# Plot one bar per hour. The title is set once below via set_title(), which
# also applies the font settings.
ax = counts.plot(kind='bar')
ax.tick_params(axis='x', labelsize=8)
ax.tick_params(axis='y', labelsize=10)
ax.set_xlabel('Time', fontsize=15)
ax.set_ylabel('Number of tweets' , fontsize=15)
ax.set_title('Number of Tweets Per Hour', fontsize=15, fontweight='bold')
# No-op final statement; presumably keeps a notebook cell from echoing the
# return value of the call above.
pass