import requests
import json
import pandas as pd
from pandas import DataFrame,Series
import numpy as np
import re
import datetime
from datetime import timedelta
from dateutil import relativedelta
import calendar
import getpass
import psycopg2
from sqlalchemy import create_engine
host = raw_input("Host Address: ")
Host Address: localhost
db_name = "pokemon_go"
username = "pokemon_go_role"
password = getpass.getpass()
········
years = xrange(2016, 2025)
start_date = str(datetime.date(years[0], 1, 1))
end_date = str(datetime.date(years[-1], 12, 31))
all_dates = pd.date_range(start_date, end_date, freq='D')
all_dates
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04', '2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08', '2016-01-09', '2016-01-10', ... '2024-12-22', '2024-12-23', '2024-12-24', '2024-12-25', '2024-12-26', '2024-12-27', '2024-12-28', '2024-12-29', '2024-12-30', '2024-12-31'], dtype='datetime64[ns]', length=3288, freq='D')
# Given a datetime timestamp, produce a datekey
def datetimeToDateKey(date):
date_str = str(date)
date_str = date_str.split(" ")[0]
date_str = re.sub('-', '', date_str)
return date_str
# Given a datetime timestamp, convert it to a string of just the date in YYYY-mm-dd format
def fullDate(date):
date_str = str(date)
date_str = date_str.split(" ")[0]
return date_str
# Returns the weekday as a number
def weekdayNum(date):
return date.isoweekday()
# Returns the weekday as a string in its full-length form
def weekdayStr(date):
return date.strftime("%A")
# Returns the weekday as a string in its abbreviated form
def weekdayStrAbv(date):
return date.strftime("%a")
# Returns the day of the month
def dayNumMonth(date):
return date.day
# Numbers each day, constantly increasing from the first day
def dayNumOverall(date):
day_one = all_dates[0]
date_diff = date - day_one
return date_diff.days + 1
# Returns the day of the month
def isWeekday(date):
if date.isoweekday() in range(1,6):
return "Weekday"
else:
return "Weekend"
# Returns the day of the month
def weekNum(date):
return date.isocalendar()[1]
def weekBeginDate(date):
dow = date.isoweekday()
week_start = date - timedelta(days=(dow - 1))
return week_start
def weekBeginDateKey(date):
week_begin_date = weekBeginDate(date)
date_key = datetimeToDateKey(week_begin_date)
return int(date_key)
# Numbers each day, constantly increasing from the first day
def weekNumOverall(date):
# Find the date that the first week in the entire data set starts
first_day = all_dates[0]
first_week_start = weekBeginDate(first_day)
# Find the date that starts the week of the current date
curr_week_start = weekBeginDate(date)
# Get the difference and find out how many weeks have passed
date_diff = curr_week_start - first_week_start
week_number = int(date_diff.days / 7.0 + 1.0)
return week_number
# Returns the weekday as a number
def monthNum(date):
return date.month
# Returns the weekday as a number
def monthNumOverall(date):
start_date = all_dates[0]
rel_date = relativedelta.relativedelta(date, start_date)
month_diff = rel_date.years * 12 + rel_date.months
return month_diff + 1
# Returns the month as a string in its full-length form
def monthStr(date):
return date.strftime("%B")
# Returns the month as a string in its abbreviated form
def monthStrAbv(date):
return date.strftime("%b")
# Returns the quarter in the year
def quarter(date):
month = date.month
quarter = month / 4 + 1
return quarter
# Returns the year as a string
def year(date):
return date.strftime("%Y")
# Returns the year and month as a concatenated string
def yearmo(date):
year = date.strftime("%Y")
month = date.strftime("%m")
return year + month
# Returns whether or not the date is the last day of the month
def isMonthEnd(date):
year = date.year
month = date.month
month_end = calendar.monthrange(year, month)[1]
if (month_end == date.day):
return "Month End"
else:
return "Not Month End"
# Use the date functions to make a dateframe
# Dates
date_dim = DataFrame(all_dates, columns=["full_date"])
date_dim["date_key"] = date_dim["full_date"].map(datetimeToDateKey)
date_dim = date_dim[['date_key', 'full_date']] # Reorder
# Days of Week
date_dim["day_of_week"] = date_dim["full_date"].map(weekdayNum)
date_dim["day_of_week_name"] = date_dim["full_date"].map(weekdayStr)
date_dim["day_of_week_name_abbrev"] = date_dim["full_date"].map(weekdayStrAbv)
date_dim["day_of_month"] = date_dim["full_date"].map(dayNumMonth)
date_dim["day_number_overall"] = date_dim["full_date"].map(dayNumOverall)
date_dim["day_number_overall"] = date_dim["full_date"].map(dayNumOverall)
date_dim["weekday_flag"] = date_dim["full_date"].map(isWeekday)
date_dim["week_number"] = date_dim["full_date"].map(weekNum)
date_dim["week_number_overall"] = date_dim["full_date"].map(weekNumOverall)
date_dim["week_begin_date"] = date_dim["full_date"].map(weekBeginDate)
date_dim["week_begin_date_key"] = date_dim["full_date"].map(weekBeginDateKey)
date_dim["month_number"] = date_dim["full_date"].map(monthNum)
date_dim["month_number_overall"] = date_dim["full_date"].map(monthNumOverall)
date_dim["month"] = date_dim["full_date"].map(monthStr)
date_dim["month_abbrev"] = date_dim["full_date"].map(monthStrAbv)
date_dim["quarter"] = date_dim["full_date"].map(quarter)
date_dim["year"] = date_dim["full_date"].map(year)
date_dim["year_month"] = date_dim["full_date"].map(yearmo)
date_dim["month_end_flag"] = date_dim["full_date"].map(isMonthEnd)
from datetime import time
import math
# 1440 minutes in a day
minutes = xrange(0,1440)
# Given a minute number, return the 12-hour time label
def time_label_12(min_num):
hours, minutes = divmod(int(min_num), 60)
timestamp = time(hour=hours, minute=minutes)
return str(time.strftime(timestamp, '%I:%M %p'))
# Given a minute number, return the 24-hour time label
def time_label_24(min_num):
hours, minutes = divmod(int(min_num), 60)
timestamp = time(hour=hours, minute=minutes)
return str(time.strftime(timestamp, '%H:%M'))
# Given a minute number, return the 15 minute interval it occures in
def time_interval_15_min(min_num):
return str(int(math.floor(min_num / 15.0)))
# Given a minute number, return the 30 minute interval it occures in
def time_interval_30_min(min_num):
return str(int(math.floor(min_num / 30.0)))
# Given a minute number, return the 60 minute interval it occures in
def time_interval_60_min(min_num):
return str(int(math.floor(min_num / 60.0)))
# Given a minute number, return the 12-hour time label
# with only hours (this takes up less space and is useful in some cases)
def label_hh(min_num):
hours, minutes = divmod(int(min_num), 60)
timestamp = time(hour=hours, minute=minutes)
return str(time.strftime(timestamp, '%I %p'))
# Given a minute number, return the 24-hour time label with just hours
def label_hh24(min_num):
hours, minutes = divmod(int(min_num), 60)
timestamp = time(hour=hours, minute=minutes)
return str(time.strftime(timestamp, '%H'))
# Given a minute number, return the 15 minute interval label for a 24-hour clock
def label_15_min_24(min_num):
interval_num = time_interval_15_min(min_num)
int_min_num = int(interval_num) * 15
hours, minutes = divmod(int_min_num, 60)
timestamp = time(hour=hours, minute=minutes)
return str(time.strftime(timestamp, '%H:%M'))
# Given a minute number, return the 30 minute interval label for a 24-hour clock
def label_30_min_24(min_num):
interval_num = time_interval_30_min(min_num)
int_min_num = int(interval_num) * 30
hours, minutes = divmod(int_min_num, 60)
timestamp = time(hour=hours, minute=minutes)
return str(time.strftime(timestamp, '%H:%M'))
# Given a minute number, return the 60 minute interval label for a 24-hour clock
def label_60_min_24(min_num):
interval_num = time_interval_60_min(min_num)
int_min_num = int(interval_num) * 60
hours, minutes = divmod(int_min_num, 60)
timestamp = time(hour=hours, minute=minutes)
return str(time.strftime(timestamp, '%H:%M'))
# Given a minute number, return the 15 minute interval label for a 12-hour clock
def label_15_min_12(min_num):
interval_num = time_interval_15_min(min_num)
int_min_num = int(interval_num) * 15
hours, minutes = divmod(int_min_num, 60)
timestamp = time(hour=hours, minute=minutes)
return str(time.strftime(timestamp, '%I:%M %p'))
# Given a minute number, return the 30 minute interval label for a 12-hour clock
def label_30_min_12(min_num):
interval_num = time_interval_30_min(min_num)
int_min_num = int(interval_num) * 30
hours, minutes = divmod(int_min_num, 60)
timestamp = time(hour=hours, minute=minutes)
return str(time.strftime(timestamp, '%I:%M %p'))
# Given a miute number, return the 60 minute interval label for a 12-hour clock
def label_60_min_12(min_num):
interval_num = time_interval_60_min(min_num)
int_min_num = int(interval_num) * 60
hours, minutes = divmod(int_min_num, 60)
timestamp = time(hour=hours, minute=minutes)
return str(time.strftime(timestamp, '%I:%M %p'))
# Given a minute, return just the minute portion
def minute_after_hour(min_num):
hours, minutes = divmod(int(min_num), 60)
timestamp = time(hour=hours, minute=minutes)
return str(int(time.strftime(timestamp, '%M')))
minute_after_hour(61)
'1'
time_dim = DataFrame(Series(minutes), columns=["time_key"])
time_dim["time_label_24"] = time_dim["time_key"].map(time_label_24)
time_dim["time_label_12"] = time_dim["time_key"].map(time_label_12)
time_dim["time_interval_15min"] = time_dim["time_key"].map(time_interval_15_min)
time_dim["time_interval_30min"] = time_dim["time_key"].map(time_interval_30_min)
time_dim["time_interval_60min"] = time_dim["time_key"].map(time_interval_60_min)
time_dim["label_hh"] = time_dim["time_key"].map(label_hh)
time_dim["label_hh24"] = time_dim["time_key"].map(label_hh24)
time_dim["label_15min_24"] = time_dim["time_key"].map(label_15_min_24)
time_dim["label_30min_24"] = time_dim["time_key"].map(label_30_min_24)
time_dim["label_60min_24"] = time_dim["time_key"].map(label_60_min_24)
time_dim["label_15min_12"] = time_dim["time_key"].map(label_15_min_12)
time_dim["label_30min_12"] = time_dim["time_key"].map(label_30_min_12)
time_dim["label_60min_12"] = time_dim["time_key"].map(label_60_min_12)
time_dim["minute_after_hour"] = time_dim["time_key"].map(minute_after_hour)
date_dim.head()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-71-243793b0a336> in <module>() ----> 1 date_dim.head() NameError: name 'date_dim' is not defined
time_dim.head(n=100)
time_key | time_label_24 | time_label_12 | time_interval_15min | time_interval_30min | time_interval_60min | label_hh | label_hh24 | label_15min_24 | label_30min_24 | label_60min_24 | label_15min_12 | label_30min_12 | label_60min_12 | minute_after_hour | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 00:00 | 12:00 AM | 0 | 0 | 0 | 12 AM | 00 | 00:00 | 00:00 | 00:00 | 12:00 AM | 12:00 AM | 12:00 AM | 0 |
1 | 1 | 00:01 | 12:01 AM | 0 | 0 | 0 | 12 AM | 00 | 00:00 | 00:00 | 00:00 | 12:00 AM | 12:00 AM | 12:00 AM | 1 |
2 | 2 | 00:02 | 12:02 AM | 0 | 0 | 0 | 12 AM | 00 | 00:00 | 00:00 | 00:00 | 12:00 AM | 12:00 AM | 12:00 AM | 2 |
3 | 3 | 00:03 | 12:03 AM | 0 | 0 | 0 | 12 AM | 00 | 00:00 | 00:00 | 00:00 | 12:00 AM | 12:00 AM | 12:00 AM | 3 |
4 | 4 | 00:04 | 12:04 AM | 0 | 0 | 0 | 12 AM | 00 | 00:00 | 00:00 | 00:00 | 12:00 AM | 12:00 AM | 12:00 AM | 4 |
5 | 5 | 00:05 | 12:05 AM | 0 | 0 | 0 | 12 AM | 00 | 00:00 | 00:00 | 00:00 | 12:00 AM | 12:00 AM | 12:00 AM | 5 |
6 | 6 | 00:06 | 12:06 AM | 0 | 0 | 0 | 12 AM | 00 | 00:00 | 00:00 | 00:00 | 12:00 AM | 12:00 AM | 12:00 AM | 6 |
7 | 7 | 00:07 | 12:07 AM | 0 | 0 | 0 | 12 AM | 00 | 00:00 | 00:00 | 00:00 | 12:00 AM | 12:00 AM | 12:00 AM | 7 |
8 | 8 | 00:08 | 12:08 AM | 0 | 0 | 0 | 12 AM | 00 | 00:00 | 00:00 | 00:00 | 12:00 AM | 12:00 AM | 12:00 AM | 8 |
9 | 9 | 00:09 | 12:09 AM | 0 | 0 | 0 | 12 AM | 00 | 00:00 | 00:00 | 00:00 | 12:00 AM | 12:00 AM | 12:00 AM | 9 |
10 | 10 | 00:10 | 12:10 AM | 0 | 0 | 0 | 12 AM | 00 | 00:00 | 00:00 | 00:00 | 12:00 AM | 12:00 AM | 12:00 AM | 10 |
11 | 11 | 00:11 | 12:11 AM | 0 | 0 | 0 | 12 AM | 00 | 00:00 | 00:00 | 00:00 | 12:00 AM | 12:00 AM | 12:00 AM | 11 |
12 | 12 | 00:12 | 12:12 AM | 0 | 0 | 0 | 12 AM | 00 | 00:00 | 00:00 | 00:00 | 12:00 AM | 12:00 AM | 12:00 AM | 12 |
13 | 13 | 00:13 | 12:13 AM | 0 | 0 | 0 | 12 AM | 00 | 00:00 | 00:00 | 00:00 | 12:00 AM | 12:00 AM | 12:00 AM | 13 |
14 | 14 | 00:14 | 12:14 AM | 0 | 0 | 0 | 12 AM | 00 | 00:00 | 00:00 | 00:00 | 12:00 AM | 12:00 AM | 12:00 AM | 14 |
15 | 15 | 00:15 | 12:15 AM | 1 | 0 | 0 | 12 AM | 00 | 00:15 | 00:00 | 00:00 | 12:15 AM | 12:00 AM | 12:00 AM | 15 |
16 | 16 | 00:16 | 12:16 AM | 1 | 0 | 0 | 12 AM | 00 | 00:15 | 00:00 | 00:00 | 12:15 AM | 12:00 AM | 12:00 AM | 16 |
17 | 17 | 00:17 | 12:17 AM | 1 | 0 | 0 | 12 AM | 00 | 00:15 | 00:00 | 00:00 | 12:15 AM | 12:00 AM | 12:00 AM | 17 |
18 | 18 | 00:18 | 12:18 AM | 1 | 0 | 0 | 12 AM | 00 | 00:15 | 00:00 | 00:00 | 12:15 AM | 12:00 AM | 12:00 AM | 18 |
19 | 19 | 00:19 | 12:19 AM | 1 | 0 | 0 | 12 AM | 00 | 00:15 | 00:00 | 00:00 | 12:15 AM | 12:00 AM | 12:00 AM | 19 |
20 | 20 | 00:20 | 12:20 AM | 1 | 0 | 0 | 12 AM | 00 | 00:15 | 00:00 | 00:00 | 12:15 AM | 12:00 AM | 12:00 AM | 20 |
21 | 21 | 00:21 | 12:21 AM | 1 | 0 | 0 | 12 AM | 00 | 00:15 | 00:00 | 00:00 | 12:15 AM | 12:00 AM | 12:00 AM | 21 |
22 | 22 | 00:22 | 12:22 AM | 1 | 0 | 0 | 12 AM | 00 | 00:15 | 00:00 | 00:00 | 12:15 AM | 12:00 AM | 12:00 AM | 22 |
23 | 23 | 00:23 | 12:23 AM | 1 | 0 | 0 | 12 AM | 00 | 00:15 | 00:00 | 00:00 | 12:15 AM | 12:00 AM | 12:00 AM | 23 |
24 | 24 | 00:24 | 12:24 AM | 1 | 0 | 0 | 12 AM | 00 | 00:15 | 00:00 | 00:00 | 12:15 AM | 12:00 AM | 12:00 AM | 24 |
25 | 25 | 00:25 | 12:25 AM | 1 | 0 | 0 | 12 AM | 00 | 00:15 | 00:00 | 00:00 | 12:15 AM | 12:00 AM | 12:00 AM | 25 |
26 | 26 | 00:26 | 12:26 AM | 1 | 0 | 0 | 12 AM | 00 | 00:15 | 00:00 | 00:00 | 12:15 AM | 12:00 AM | 12:00 AM | 26 |
27 | 27 | 00:27 | 12:27 AM | 1 | 0 | 0 | 12 AM | 00 | 00:15 | 00:00 | 00:00 | 12:15 AM | 12:00 AM | 12:00 AM | 27 |
28 | 28 | 00:28 | 12:28 AM | 1 | 0 | 0 | 12 AM | 00 | 00:15 | 00:00 | 00:00 | 12:15 AM | 12:00 AM | 12:00 AM | 28 |
29 | 29 | 00:29 | 12:29 AM | 1 | 0 | 0 | 12 AM | 00 | 00:15 | 00:00 | 00:00 | 12:15 AM | 12:00 AM | 12:00 AM | 29 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
70 | 70 | 01:10 | 01:10 AM | 4 | 2 | 1 | 01 AM | 01 | 01:00 | 01:00 | 01:00 | 01:00 AM | 01:00 AM | 01:00 AM | 10 |
71 | 71 | 01:11 | 01:11 AM | 4 | 2 | 1 | 01 AM | 01 | 01:00 | 01:00 | 01:00 | 01:00 AM | 01:00 AM | 01:00 AM | 11 |
72 | 72 | 01:12 | 01:12 AM | 4 | 2 | 1 | 01 AM | 01 | 01:00 | 01:00 | 01:00 | 01:00 AM | 01:00 AM | 01:00 AM | 12 |
73 | 73 | 01:13 | 01:13 AM | 4 | 2 | 1 | 01 AM | 01 | 01:00 | 01:00 | 01:00 | 01:00 AM | 01:00 AM | 01:00 AM | 13 |
74 | 74 | 01:14 | 01:14 AM | 4 | 2 | 1 | 01 AM | 01 | 01:00 | 01:00 | 01:00 | 01:00 AM | 01:00 AM | 01:00 AM | 14 |
75 | 75 | 01:15 | 01:15 AM | 5 | 2 | 1 | 01 AM | 01 | 01:15 | 01:00 | 01:00 | 01:15 AM | 01:00 AM | 01:00 AM | 15 |
76 | 76 | 01:16 | 01:16 AM | 5 | 2 | 1 | 01 AM | 01 | 01:15 | 01:00 | 01:00 | 01:15 AM | 01:00 AM | 01:00 AM | 16 |
77 | 77 | 01:17 | 01:17 AM | 5 | 2 | 1 | 01 AM | 01 | 01:15 | 01:00 | 01:00 | 01:15 AM | 01:00 AM | 01:00 AM | 17 |
78 | 78 | 01:18 | 01:18 AM | 5 | 2 | 1 | 01 AM | 01 | 01:15 | 01:00 | 01:00 | 01:15 AM | 01:00 AM | 01:00 AM | 18 |
79 | 79 | 01:19 | 01:19 AM | 5 | 2 | 1 | 01 AM | 01 | 01:15 | 01:00 | 01:00 | 01:15 AM | 01:00 AM | 01:00 AM | 19 |
80 | 80 | 01:20 | 01:20 AM | 5 | 2 | 1 | 01 AM | 01 | 01:15 | 01:00 | 01:00 | 01:15 AM | 01:00 AM | 01:00 AM | 20 |
81 | 81 | 01:21 | 01:21 AM | 5 | 2 | 1 | 01 AM | 01 | 01:15 | 01:00 | 01:00 | 01:15 AM | 01:00 AM | 01:00 AM | 21 |
82 | 82 | 01:22 | 01:22 AM | 5 | 2 | 1 | 01 AM | 01 | 01:15 | 01:00 | 01:00 | 01:15 AM | 01:00 AM | 01:00 AM | 22 |
83 | 83 | 01:23 | 01:23 AM | 5 | 2 | 1 | 01 AM | 01 | 01:15 | 01:00 | 01:00 | 01:15 AM | 01:00 AM | 01:00 AM | 23 |
84 | 84 | 01:24 | 01:24 AM | 5 | 2 | 1 | 01 AM | 01 | 01:15 | 01:00 | 01:00 | 01:15 AM | 01:00 AM | 01:00 AM | 24 |
85 | 85 | 01:25 | 01:25 AM | 5 | 2 | 1 | 01 AM | 01 | 01:15 | 01:00 | 01:00 | 01:15 AM | 01:00 AM | 01:00 AM | 25 |
86 | 86 | 01:26 | 01:26 AM | 5 | 2 | 1 | 01 AM | 01 | 01:15 | 01:00 | 01:00 | 01:15 AM | 01:00 AM | 01:00 AM | 26 |
87 | 87 | 01:27 | 01:27 AM | 5 | 2 | 1 | 01 AM | 01 | 01:15 | 01:00 | 01:00 | 01:15 AM | 01:00 AM | 01:00 AM | 27 |
88 | 88 | 01:28 | 01:28 AM | 5 | 2 | 1 | 01 AM | 01 | 01:15 | 01:00 | 01:00 | 01:15 AM | 01:00 AM | 01:00 AM | 28 |
89 | 89 | 01:29 | 01:29 AM | 5 | 2 | 1 | 01 AM | 01 | 01:15 | 01:00 | 01:00 | 01:15 AM | 01:00 AM | 01:00 AM | 29 |
90 | 90 | 01:30 | 01:30 AM | 6 | 3 | 1 | 01 AM | 01 | 01:30 | 01:30 | 01:00 | 01:30 AM | 01:30 AM | 01:00 AM | 30 |
91 | 91 | 01:31 | 01:31 AM | 6 | 3 | 1 | 01 AM | 01 | 01:30 | 01:30 | 01:00 | 01:30 AM | 01:30 AM | 01:00 AM | 31 |
92 | 92 | 01:32 | 01:32 AM | 6 | 3 | 1 | 01 AM | 01 | 01:30 | 01:30 | 01:00 | 01:30 AM | 01:30 AM | 01:00 AM | 32 |
93 | 93 | 01:33 | 01:33 AM | 6 | 3 | 1 | 01 AM | 01 | 01:30 | 01:30 | 01:00 | 01:30 AM | 01:30 AM | 01:00 AM | 33 |
94 | 94 | 01:34 | 01:34 AM | 6 | 3 | 1 | 01 AM | 01 | 01:30 | 01:30 | 01:00 | 01:30 AM | 01:30 AM | 01:00 AM | 34 |
95 | 95 | 01:35 | 01:35 AM | 6 | 3 | 1 | 01 AM | 01 | 01:30 | 01:30 | 01:00 | 01:30 AM | 01:30 AM | 01:00 AM | 35 |
96 | 96 | 01:36 | 01:36 AM | 6 | 3 | 1 | 01 AM | 01 | 01:30 | 01:30 | 01:00 | 01:30 AM | 01:30 AM | 01:00 AM | 36 |
97 | 97 | 01:37 | 01:37 AM | 6 | 3 | 1 | 01 AM | 01 | 01:30 | 01:30 | 01:00 | 01:30 AM | 01:30 AM | 01:00 AM | 37 |
98 | 98 | 01:38 | 01:38 AM | 6 | 3 | 1 | 01 AM | 01 | 01:30 | 01:30 | 01:00 | 01:30 AM | 01:30 AM | 01:00 AM | 38 |
99 | 99 | 01:39 | 01:39 AM | 6 | 3 | 1 | 01 AM | 01 | 01:30 | 01:30 | 01:00 | 01:30 AM | 01:30 AM | 01:00 AM | 39 |
100 rows × 15 columns
## Export each to a CSV first so that we can use the COPY command. It's substantially more efficient.
# date_dim.to_csv(path_or_buf="./date_dim.csv", index=False)
time_dim.to_csv(path_or_buf="./time_dim.csv", index=False)
We want to load the data from the CSV to save time, but we also want to programmatically create the table in postgres. This simply takes our dataframe, removes all the data but keeps the column names, and uses that to create the table. It also gives them appropriate data types (most of the time). Then, we fill the table with a copy command.
engine = create_engine('postgresql://' + username + ':' + password + '@' + host + '/' + db_name)
empty_date_dim = date_dim.copy()
empty_date_dim = empty_date_dim.drop(empty_date_dim.index[0:date_dim.shape[0]])
empty_date_dim.to_sql("date_dimension", engine, if_exists="replace", index=False)
empty_time_dim = time_dim.copy()
empty_time_dim = empty_time_dim.drop(empty_time_dim.index[0:time_dim.shape[0]])
empty_time_dim.to_sql("time_dimension", engine, if_exists="replace", index=False)
We have to create a new connection here. The above uses sqlalchemy, which then itself uses psycopg2, but we want to use psycopg2 directly.
connection_string = "dbname='" + db_name + "' "
connection_string += "user='" + username + "' "
connection_string += "host='" + host + "' "
connection_string += "password='" + password + "' "
# Set up a copy statement. The %s will be replaced later
sql_statement = """
COPY %s FROM STDIN WITH
CSV
HEADER
DELIMITER AS ','
"""
def load_file(conn, table_name, primary_key, file_object):
cursor = conn.cursor()
cursor.copy_expert(sql=sql_statement % table_name, file=file_object)
conn.commit()
# Add add primary key, index and then vacuum
cursor.execute("ALTER TABLE " + table_name + " ADD PRIMARY KEY (" + primary_key + ")")
cursor.execute("VACUUM VERBOSE ANALYZE " + table_name)
cursor.execute("CREATE INDEX ON " + table_name + " (" + primary_key + " ASC NULLS LAST);")
cursor.close()
date_dim_file = open("./date_dim.csv")
date_dim_file
<open file './date_dim.csv', mode 'r' at 0x1148f5f60>
conn = psycopg2.connect(connection_string)
conn.autocommit = True
try:
load_file(conn, table_name='date_dimension', primary_key="date_key", file_object=date_dim_file)
finally:
conn.close()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-57-270064a96dda> in <module>() 2 conn.autocommit = True 3 try: ----> 4 load_file(conn, table_name='date_dimension', primary_key="date_key", file_object=date_dim_file) 5 finally: 6 conn.close() NameError: name 'date_dim_file' is not defined
time_dim_file = open("./time_dim.csv")
time_dim_file
<open file './time_dim.csv', mode 'r' at 0x114aacdb0>
conn = psycopg2.connect(connection_string)
conn.autocommit = True
try:
load_file(conn, table_name='time_dimension', primary_key="time_key", file_object=time_dim_file)
finally:
conn.close()
pokemon_info_df = pd.read_csv(filepath_or_buffer="./dimension_table_csvs/pokemon_info.csv")
empty_pokemon_pk_info = pokemon_info_df.copy()
empty_pokemon_pk_info = empty_pokemon_pk_info.drop(empty_pokemon_pk_info.index[0:empty_pokemon_pk_info.shape[0]])
empty_pokemon_pk_info.to_sql("pokemon_info", engine, if_exists="replace", index=False)
pokemon_info_dim_file = open("./dimension_table_csvs/pokemon_info.csv")
pokemon_info_dim_file
<open file './dimension_table_csvs/pokemon_info.csv', mode 'r' at 0x1148b5ed0>
conn = psycopg2.connect(connection_string)
conn.autocommit = True
try:
load_file(conn, table_name='pokemon_info', primary_key="pokemon_id", file_object=pokemon_info_dim_file)
finally:
conn.close()