#Load the ipython-sql magic command %load_ext sql # Normally use the following: # %sql postgresql://user:passwd@server/db # Use a workaround to not display password! with open("./.config") as conn_config: conn_info = conn_config.readline().strip() ip = get_ipython() ip.magic("%sql " + conn_info) %%sql -- Some quick tests of the capability of PL/Python on -- PostgreSQL and Greenplum DB. -- Create a schema to do some work in DROP SCHEMA plp CASCADE; CREATE SCHEMA plp; %%sql -- Simple SQL User Defined Function to get started CREATE FUNCTION plp.times2(INT) RETURNS INT AS $$ SELECT 2 * $1; $$ LANGUAGE sql; %%sql --Try it out SELECT plp.times2(10); %%sql -- Test using Python code CREATE FUNCTION plp.pymax (a integer, b integer) RETURNS integer AS $$ if a > b: return a return b $$ LANGUAGE plpythonu; %%sql --Test Python code SELECT plp.pymax(10, 5); %%sql -- Create a composite return type CREATE TYPE plp.named_value AS ( name text, value integer ); %%sql --Simple function which returns a composite object CREATE OR REPLACE FUNCTION plp.make_pair (name text, value integer) RETURNS named_value AS $$ return [ name, value ] # or alternatively, as tuple: return ( name, value ) # or as dict: return { "name": name, "value": value } # or as an object with attributes .name and .value $$ LANGUAGE plpythonu; %%sql --Try out the function -- (See https://en.wikipedia.org/wiki/Zozimus) SELECT plp.make_pair('Zozimus', 1); %%sql --Using NumPy inside a PL/Python function CREATE OR REPLACE FUNCTION plp.make_pair (name text) RETURNS named_value AS $$ import numpy as np a = np.arange(100) return [name, a[2]] $$ LANGUAGE plpythonu; %%sql --Try it out SELECT plp.make_pair('Horatio'); %%sql SELECT * FROM plp.make_pair('Horatio'); %%sql --Returning a set of results using SETOF CREATE OR REPLACE FUNCTION make_pair_sets (name text) RETURNS SETOF named_value AS $$ import numpy as np return ((name, i) for i in np.arange(3)) $$ LANGUAGE plpythonu; %%sql --Try it out SELECT * FROM make_pair_sets('Gerald'); %%sql --Set up some data to show parallelisation DROP TABLE IF EXISTS plp.test_data; CREATE TABLE plp.test_data AS SELECT 'a'::text AS name , generate_series(0,1000000)::float AS x , generate_series(0,1000000)/100.0 AS y DISTRIBUTED BY (name); INSERT INTO plp.test_data SELECT 'b'::text AS name , generate_series(0,1000000)::float AS x , sin(generate_series(0,1000000)/100.0) AS y; INSERT INTO plp.test_data SELECT 'c'::text AS name , generate_series(0,1000000)::float AS x , 100.0 + sin(generate_series(0,1000000)/100.0) AS y; %%sql -- Create a function to find the mean of some numbers DROP FUNCTION IF EXISTS plp.np_mean(double precision[]); CREATE OR REPLACE FUNCTION plp.np_mean(value_array double precision[]) RETURNS float AS $$ import numpy as np return np.mean(value_array) $$ LANGUAGE plpythonu; %%sql -- Need to pass the numbers as an array using array_agg SELECT plp.np_mean(array_agg(y)) FROM plp.test_data; %%sql -- Now try to do this for each type of data in parallel by grouping SELECT name, plp.np_mean(array_agg(y)) FROM plp.test_data GROUP BY name ORDER BY name; %%sql -- Now try do something even more interesting DROP FUNCTION IF EXISTS plp.linregr(double precision[]); CREATE OR REPLACE FUNCTION plp.linregr(x double precision[], y double precision[]) RETURNS float[] AS $$ from scipy import stats return stats.linregress(x, y) $$ LANGUAGE plpythonu; %%sql -- Do linear regression for all data SELECT plp.linregr(array_agg(x), array_agg(y)) FROM plp.test_data; %%sql -- Now do it separately for each 'name' SELECT name, plp.linregr(array_agg(x), array_agg(y)) FROM plp.test_data GROUP BY name ORDER BY name;