Running multiple queries in parallel across threads does not seem to yield much of an improvement.
Clearly I'm doing something wrong.
import sqlalchemy
import pandas as pd
import concurrent.futures
# Module-level engine shared by every thread; note it is rebound to a
# PostgreSQL engine further down, and query() reads it as a global.
engine = sqlalchemy.create_engine('mysql://root@localhost/indiatoday')
def query():
    """Fetch the distinct (ordering, cat_id) pairs as a DataFrame.

    Uses the module-level ``engine`` global, so the target database is
    whatever ``engine`` is bound to at call time.
    """
    sql = 'SELECT DISTINCT ordering, cat_id FROM jos_article_section'
    return pd.read_sql(sql, engine)
def parallel(workers=1, count=4, query_fn=None):
    """Run ``count`` copies of a query concurrently on a thread pool.

    Parameters
    ----------
    workers : int
        Number of threads in the pool.
    count : int
        How many times the query is submitted.
    query_fn : callable, optional
        Zero-argument callable returning a DataFrame.  Defaults to the
        module-level ``query``, preserving the original behavior.

    Returns
    -------
    int
        Total row count across all successful results; 0 if every
        submitted query raised.
    """
    if query_fn is None:
        query_fn = query
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        # A plain list suffices — the original mapped futures to an index
        # that was never used.
        futures = [executor.submit(query_fn) for _ in range(count)]
        for future in concurrent.futures.as_completed(futures):
            try:
                results.append(future.result())
            except Exception as exc:
                # Best-effort: report the failure and keep collecting.
                print('Exception: %s' % exc)
    if not results:
        # pd.concat raises ValueError on an empty list; treat "all
        # queries failed" as zero rows instead of crashing.
        return 0
    return len(pd.concat(results))
%timeit -r1 parallel(workers=1, count=4)
1 loops, best of 1: 8.75 s per loop
%timeit -r1 parallel(workers=4, count=4)
1 loops, best of 1: 7.14 s per loop
# Rebind the shared global engine to PostgreSQL; the query() defined
# just below picks it up at call time.
engine = sqlalchemy.create_engine('postgresql://gitlab@localhost/gitlabhq_production')
def query():
    """Load every row of the events table into a DataFrame.

    Reads the module-level ``engine`` global (PostgreSQL at this point
    in the session).
    """
    sql = 'SELECT * FROM events'
    return pd.read_sql(sql, engine)
%timeit -r1 parallel(workers=1, count=4)
1 loops, best of 1: 2.03 s per loop
%timeit -r1 parallel(workers=4, count=4)
1 loops, best of 1: 1.97 s per loop