This notebook contains the Python-3 script to get IMDB movie data into a database from their downloadable datasets. It assumes you have psycopg2, which you can install from conda. I recommend making a new conda environment, so your root environment won't get corrupted. For example, the following command in a conda shell would create a new Python-3.6 environment with psycopg2.
$ conda create -n py36-imdb python=3.6.3 psycopg2
psycopg2 is a binding to the PostgreSQL C-API that uses the Python DB-API 2.0 specification. This means it uses the same syntax to connect and execute queries as the builtin sqlite3
Python module.
# imports
import csv
import gzip
import os
import urllib
import psycopg2, psycopg2.extras
import logging
# logging
logging.basicConfig()
LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.DEBUG)
# meta data about the URL, the data tarballs, and their schema
IMDB_datasets = 'https://datasets.imdbws.com/'
datasets = {
'title_basics': 'title.basics.tsv.gz', # information for titles
'title_crew': 'title.crew.tsv.gz', # the director and writer information for all the titles in IMDb
'title_episode': 'title.episode.tsv.gz', # the tv episode information
'title_principals': 'title.principals.tsv.gz',
'title_ratings': 'title.ratings.tsv.gz',
'name_basics': 'name.basics.tsv.gz'
}
DETAILS = {
'title_basics': [
('tconst', 'TEXT PRIMARY KEY'), ('titleType', 'TEXT'), ('primaryTitle', 'TEXT'),
('originalTitle', 'TEXT'), ('isAdult', 'INT'), ('startYear', 'INT'),
('endYear', 'INT'), ('runtimeMinutes', 'INT'), ('genres', 'TEXT')
],
}
# info to connect to an online database set up for this tutorial
DBNAME = 'breaking-bytes_imdb'
HOST = 'postgresql-breaking-bytes.alwaysdata.net'
PORT = 5432
USER = os.getenv('USER')
PASSWORD = os.getenv('PASSWORD')
# connect to the database and get a Python DB-API 2.0 "cursor"
conn = psycopg2.connect(dbname=DBNAME, user=USER, password=PASSWORD, host=HOST, port=PORT)
cur = conn.cursor()
# create title.basic table
EXPR = ('CREATE TABLE title_basics(%s, %s, %s, %s, %s, %s, %s, %s, %s);'
% tuple(' '.join(kv) for kv in DETAILS['title_basics']))
cur.execute(EXPR)
conn.commit()
# get the 'title.basics' dataset
with urllib.request.urlopen(IMDB_datasets + datasets['title_basics']) as http_response:
tsv = gzip.decompress(http_response.read())
# split the lines into rows
rows = tsv.decode('utf-8').split('\n')
LOGGER.debug(rows[0].split('\t')) # take a look at the headers
# skip the first row with the headers and the last row with only a newline
rows = rows[1:-1]
# split the rows into columns
records = [row.split('\t') for row in rows]
# replace IMDB's NULL, "\\N", which None
records = [tuple(None if r == '\\N' else r for r in record)
for record in records]
DEBUG:__main__:['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres']
# copy this data to the database
COUNT = len(records)
LOOPS = 100
CHUNKS = COUNT // LOOPS
idx = 0
for n in range(LOOPS):
jdx = idx + CHUNKS
LOGGER.debug('loop: %d, start: %d, stop: %d', n, idx, jdx)
psycopg2.extras.execute_values(cur, 'INSERT INTO title_basics VALUES %s;', records[idx:jdx])
conn.commit()
idx += CHUNKS
DEBUG:__main__:loop: 0, start: 0, stop: 48137 DEBUG:__main__:loop: 1, start: 48137, stop: 96274 DEBUG:__main__:loop: 2, start: 96274, stop: 144411 DEBUG:__main__:loop: 3, start: 144411, stop: 192548 DEBUG:__main__:loop: 4, start: 192548, stop: 240685 DEBUG:__main__:loop: 5, start: 240685, stop: 288822 DEBUG:__main__:loop: 6, start: 288822, stop: 336959 DEBUG:__main__:loop: 7, start: 336959, stop: 385096 DEBUG:__main__:loop: 8, start: 385096, stop: 433233 DEBUG:__main__:loop: 9, start: 433233, stop: 481370 DEBUG:__main__:loop: 10, start: 481370, stop: 529507 DEBUG:__main__:loop: 11, start: 529507, stop: 577644 DEBUG:__main__:loop: 12, start: 577644, stop: 625781 DEBUG:__main__:loop: 13, start: 625781, stop: 673918 DEBUG:__main__:loop: 14, start: 673918, stop: 722055 DEBUG:__main__:loop: 15, start: 722055, stop: 770192 DEBUG:__main__:loop: 16, start: 770192, stop: 818329 DEBUG:__main__:loop: 17, start: 818329, stop: 866466 DEBUG:__main__:loop: 18, start: 866466, stop: 914603 DEBUG:__main__:loop: 19, start: 914603, stop: 962740 DEBUG:__main__:loop: 20, start: 962740, stop: 1010877 DEBUG:__main__:loop: 21, start: 1010877, stop: 1059014 DEBUG:__main__:loop: 22, start: 1059014, stop: 1107151 DEBUG:__main__:loop: 23, start: 1107151, stop: 1155288 DEBUG:__main__:loop: 24, start: 1155288, stop: 1203425 DEBUG:__main__:loop: 25, start: 1203425, stop: 1251562 DEBUG:__main__:loop: 26, start: 1251562, stop: 1299699 DEBUG:__main__:loop: 27, start: 1299699, stop: 1347836 DEBUG:__main__:loop: 28, start: 1347836, stop: 1395973 DEBUG:__main__:loop: 29, start: 1395973, stop: 1444110 DEBUG:__main__:loop: 30, start: 1444110, stop: 1492247 DEBUG:__main__:loop: 31, start: 1492247, stop: 1540384 DEBUG:__main__:loop: 32, start: 1540384, stop: 1588521 DEBUG:__main__:loop: 33, start: 1588521, stop: 1636658 DEBUG:__main__:loop: 34, start: 1636658, stop: 1684795 DEBUG:__main__:loop: 35, start: 1684795, stop: 1732932 DEBUG:__main__:loop: 36, start: 1732932, stop: 1781069 DEBUG:__main__:loop: 37, start: 1781069, stop: 1829206 DEBUG:__main__:loop: 38, start: 1829206, stop: 1877343 DEBUG:__main__:loop: 39, start: 1877343, stop: 1925480 DEBUG:__main__:loop: 40, start: 1925480, stop: 1973617 DEBUG:__main__:loop: 41, start: 1973617, stop: 2021754 DEBUG:__main__:loop: 42, start: 2021754, stop: 2069891 DEBUG:__main__:loop: 43, start: 2069891, stop: 2118028 DEBUG:__main__:loop: 44, start: 2118028, stop: 2166165 DEBUG:__main__:loop: 45, start: 2166165, stop: 2214302 DEBUG:__main__:loop: 46, start: 2214302, stop: 2262439 DEBUG:__main__:loop: 47, start: 2262439, stop: 2310576 DEBUG:__main__:loop: 48, start: 2310576, stop: 2358713 DEBUG:__main__:loop: 49, start: 2358713, stop: 2406850 DEBUG:__main__:loop: 50, start: 2406850, stop: 2454987 DEBUG:__main__:loop: 51, start: 2454987, stop: 2503124 DEBUG:__main__:loop: 52, start: 2503124, stop: 2551261 DEBUG:__main__:loop: 53, start: 2551261, stop: 2599398 DEBUG:__main__:loop: 54, start: 2599398, stop: 2647535 DEBUG:__main__:loop: 55, start: 2647535, stop: 2695672 DEBUG:__main__:loop: 56, start: 2695672, stop: 2743809 DEBUG:__main__:loop: 57, start: 2743809, stop: 2791946 DEBUG:__main__:loop: 58, start: 2791946, stop: 2840083 DEBUG:__main__:loop: 59, start: 2840083, stop: 2888220 DEBUG:__main__:loop: 60, start: 2888220, stop: 2936357 DEBUG:__main__:loop: 61, start: 2936357, stop: 2984494 DEBUG:__main__:loop: 62, start: 2984494, stop: 3032631 DEBUG:__main__:loop: 63, start: 3032631, stop: 3080768 DEBUG:__main__:loop: 64, start: 3080768, stop: 3128905 DEBUG:__main__:loop: 65, start: 3128905, stop: 3177042 DEBUG:__main__:loop: 66, start: 3177042, stop: 3225179 DEBUG:__main__:loop: 67, start: 3225179, stop: 3273316 DEBUG:__main__:loop: 68, start: 3273316, stop: 3321453 DEBUG:__main__:loop: 69, start: 3321453, stop: 3369590 DEBUG:__main__:loop: 70, start: 3369590, stop: 3417727 DEBUG:__main__:loop: 71, start: 3417727, stop: 3465864 DEBUG:__main__:loop: 72, start: 3465864, stop: 3514001 DEBUG:__main__:loop: 73, start: 3514001, stop: 3562138 DEBUG:__main__:loop: 74, start: 3562138, stop: 3610275 DEBUG:__main__:loop: 75, start: 3610275, stop: 3658412 DEBUG:__main__:loop: 76, start: 3658412, stop: 3706549 DEBUG:__main__:loop: 77, start: 3706549, stop: 3754686 DEBUG:__main__:loop: 78, start: 3754686, stop: 3802823 DEBUG:__main__:loop: 79, start: 3802823, stop: 3850960 DEBUG:__main__:loop: 80, start: 3850960, stop: 3899097 DEBUG:__main__:loop: 81, start: 3899097, stop: 3947234 DEBUG:__main__:loop: 82, start: 3947234, stop: 3995371 DEBUG:__main__:loop: 83, start: 3995371, stop: 4043508 DEBUG:__main__:loop: 84, start: 4043508, stop: 4091645 DEBUG:__main__:loop: 85, start: 4091645, stop: 4139782 DEBUG:__main__:loop: 86, start: 4139782, stop: 4187919 DEBUG:__main__:loop: 87, start: 4187919, stop: 4236056 DEBUG:__main__:loop: 88, start: 4236056, stop: 4284193 DEBUG:__main__:loop: 89, start: 4284193, stop: 4332330 DEBUG:__main__:loop: 90, start: 4332330, stop: 4380467 DEBUG:__main__:loop: 91, start: 4380467, stop: 4428604 DEBUG:__main__:loop: 92, start: 4428604, stop: 4476741 DEBUG:__main__:loop: 93, start: 4476741, stop: 4524878 DEBUG:__main__:loop: 94, start: 4524878, stop: 4573015 DEBUG:__main__:loop: 95, start: 4573015, stop: 4621152 DEBUG:__main__:loop: 96, start: 4621152, stop: 4669289 DEBUG:__main__:loop: 97, start: 4669289, stop: 4717426 DEBUG:__main__:loop: 98, start: 4717426, stop: 4765563 DEBUG:__main__:loop: 99, start: 4765563, stop: 4813700
cur.close()
conn.close()
print(COUNT)
4813797