- This notebook shows how to connect database to Jupyter Notebook, and join multi tables, and write complex queries to analyze data. --by Lu Tang
# Imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sqlite3
import matplotlib.pyplot as plt
# Create the connection to the database, and see what tables we have
conn = sqlite3.connect('factbook.db')
tables = pd.read_sql("""SELECT *
FROM sqlite_master
WHERE type='table';""", conn)
tables
type | name | tbl_name | rootpage | sql | |
---|---|---|---|---|---|
0 | table | sqlite_sequence | sqlite_sequence | 3 | CREATE TABLE sqlite_sequence(name,seq) |
1 | table | facts | facts | 47 | CREATE TABLE "facts" ("id" INTEGER PRIMARY KEY... |
2 | table | cities | cities | 2 | CREATE TABLE cities (\n id integer prim... |
# List of cities
cities = pd.read_sql("""SELECT *
FROM cities;""", conn)
print(len(cities))
cities.head()
397
id | name | population | capital | facts_id | |
---|---|---|---|---|---|
0 | 1 | Oranjestad | 37000 | 1 | 216 |
1 | 2 | Saint John'S | 27000 | 1 | 6 |
2 | 3 | Abu Dhabi | 942000 | 1 | 184 |
3 | 4 | Dubai | 1978000 | 0 | 184 |
4 | 5 | Sharjah | 983000 | 0 | 184 |
print(cities.id.nunique())
print(cities.name.nunique())
# SELECT COUNT(DISTINCT(facts_id)) FROM cities;
print(cities.facts_id.nunique())
# several cities share the same facts
397 396 210
# view table of facts
facts = pd.read_sql("""SELECT *
FROM facts;""", conn)
print(len(facts))
facts.head()
261
id | code | name | area | area_land | area_water | population | population_growth | birth_rate | death_rate | migration_rate | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | af | Afghanistan | 652230.0 | 652230.0 | 0.0 | 32564342.0 | 2.32 | 38.57 | 13.89 | 1.51 |
1 | 2 | al | Albania | 28748.0 | 27398.0 | 1350.0 | 3029278.0 | 0.30 | 12.92 | 6.58 | 3.30 |
2 | 3 | ag | Algeria | 2381741.0 | 2381741.0 | 0.0 | 39542166.0 | 1.84 | 23.67 | 4.31 | 0.92 |
3 | 4 | an | Andorra | 468.0 | 468.0 | 0.0 | 85580.0 | 0.12 | 8.13 | 6.96 | 0.00 |
4 | 5 | ao | Angola | 1246700.0 | 1246700.0 | 0.0 | 19625353.0 | 2.78 | 38.78 | 11.49 | 0.46 |
# Inner Join
q="""SELECT *
FROM facts
INNER JOIN cities
ON cities.facts_id = facts.id"""
inner_join = pd.read_sql(q, conn)
print(len(inner_join))
# view joined table
inner_join.head()
397
id | code | name | area | area_land | area_water | population | population_growth | birth_rate | death_rate | migration_rate | id | name | population | capital | facts_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 216 | aa | Aruba | 180.0 | 180.0 | 0.0 | 112162 | 1.33 | 12.56 | 8.18 | 8.92 | 1 | Oranjestad | 37000 | 1 | 216 |
1 | 6 | ac | Antigua and Barbuda | 442.0 | 442.0 | 0.0 | 92436 | 1.24 | 15.85 | 5.69 | 2.21 | 2 | Saint John'S | 27000 | 1 | 6 |
2 | 184 | ae | United Arab Emirates | 83600.0 | 83600.0 | 0.0 | 5779760 | 2.58 | 15.43 | 1.97 | 12.36 | 3 | Abu Dhabi | 942000 | 1 | 184 |
3 | 184 | ae | United Arab Emirates | 83600.0 | 83600.0 | 0.0 | 5779760 | 2.58 | 15.43 | 1.97 | 12.36 | 4 | Dubai | 1978000 | 0 | 184 |
4 | 184 | ae | United Arab Emirates | 83600.0 | 83600.0 | 0.0 | 5779760 | 2.58 | 15.43 | 1.97 | 12.36 | 5 | Sharjah | 983000 | 0 | 184 |
# Not all the facts_id is in the cities table
inner_join['facts_id'].nunique()
210
# Left Join
q2="""SELECT *
FROM facts
LEFT JOIN cities
ON cities.facts_id = facts.id;"""
left_join = pd.read_sql(q2, conn)
print(len(left_join))
left_join.head()
448
id | code | name | area | area_land | area_water | population | population_growth | birth_rate | death_rate | migration_rate | id | name | population | capital | facts_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | af | Afghanistan | 652230.0 | 652230.0 | 0.0 | 32564342.0 | 2.32 | 38.57 | 13.89 | 1.51 | 6.0 | Kabul | 3097000.0 | 1.0 | 1 |
1 | 2 | al | Albania | 28748.0 | 27398.0 | 1350.0 | 3029278.0 | 0.30 | 12.92 | 6.58 | 3.30 | 10.0 | Tirana | 419000.0 | 1.0 | 2 |
2 | 3 | ag | Algeria | 2381741.0 | 2381741.0 | 0.0 | 39542166.0 | 1.84 | 23.67 | 4.31 | 0.92 | 7.0 | Algiers | 2916000.0 | 1.0 | 3 |
3 | 3 | ag | Algeria | 2381741.0 | 2381741.0 | 0.0 | 39542166.0 | 1.84 | 23.67 | 4.31 | 0.92 | 8.0 | Oran | 783000.0 | 0.0 | 3 |
4 | 4 | an | Andorra | 468.0 | 468.0 | 0.0 | 85580.0 | 0.12 | 8.13 | 6.96 | 0.00 | 12.0 | Andorra La Vella | 23000.0 | 1.0 | 4 |
- For left join, the order of FROM and LEFT JOIN matters
- FROM facts means every rows in the facts will be included, and some same facts have different cities, and some facts do not have cities at all.
# name subquery table as c, and from facts join c
q3="""
SELECT f.name country, c.name capital_city
FROM facts f
JOIN (
SELECT * FROM cities
WHERE capital = 1) c
ON c.facts_id = f.id;
"""
subquery_join = pd.read_sql(q3, conn)
print(len(subquery_join))
subquery_join.head()
208
country | capital_city | |
---|---|---|
0 | Aruba | Oranjestad |
1 | Antigua and Barbuda | Saint John'S |
2 | United Arab Emirates | Abu Dhabi |
3 | Afghanistan | Kabul |
4 | Algeria | Algiers |
# Build connection
conn=sqlite3.connect('chinook.db')
# view all tables
tables = pd.read_sql("""SELECT *
FROM sqlite_master
WHERE type='table';""", conn)
tables
type | name | tbl_name | rootpage | sql | |
---|---|---|---|---|---|
0 | table | album | album | 2 | CREATE TABLE [album]\n(\n [album_id] INTEGE... |
1 | table | artist | artist | 3 | CREATE TABLE [artist]\n(\n [artist_id] INTE... |
2 | table | customer | customer | 4 | CREATE TABLE [customer]\n(\n [customer_id] ... |
3 | table | employee | employee | 5 | CREATE TABLE [employee]\n(\n [employee_id] ... |
4 | table | genre | genre | 6 | CREATE TABLE [genre]\n(\n [genre_id] INTEGE... |
5 | table | invoice | invoice | 7 | CREATE TABLE [invoice]\n(\n [invoice_id] IN... |
6 | table | invoice_line | invoice_line | 8 | CREATE TABLE [invoice_line]\n(\n [invoice_l... |
7 | table | media_type | media_type | 9 | CREATE TABLE [media_type]\n(\n [media_type_... |
8 | table | playlist | playlist | 10 | CREATE TABLE [playlist]\n(\n [playlist_id] ... |
9 | table | playlist_track | playlist_track | 11 | CREATE TABLE [playlist_track]\n(\n [playlis... |
10 | table | track | track | 13 | CREATE TABLE [track]\n(\n [track_id] INTEGE... |
# Join three tables
q_1="""
SELECT t.track_id, t.name track_name, m.name track_type, il.unit_price, il.quantity
FROM invoice_line il
JOIN track t
ON t.track_id = il.track_id
JOIN media_type m
ON m.media_type_id=t.media_type_id
WHERE il.invoice_id = 4"""
table_1=pd.read_sql(q_1, conn)
table_1
track_id | track_name | track_type | unit_price | quantity | |
---|---|---|---|---|---|
0 | 3448 | Lamentations of Jeremiah, First Set \ Incipit ... | Protected AAC audio file | 0.99 | 1 |
1 | 2560 | Violent Pornography | MPEG audio file | 0.99 | 1 |
2 | 3336 | War Pigs | Purchased AAC audio file | 0.99 | 1 |
3 | 829 | Let's Get Rocked | MPEG audio file | 0.99 | 1 |
4 | 1872 | Attitude | MPEG audio file | 0.99 | 1 |
5 | 748 | Dealer | MPEG audio file | 0.99 | 1 |
6 | 1778 | You're What's Happening (In The World Today) | MPEG audio file | 0.99 | 1 |
7 | 2514 | Spoonman | MPEG audio file | 0.99 | 1 |
# Join five tables
q_2= """
SELECT
il.track_id,
t.name track_name,
ar.name artist_name,
mt.name track_type,
il.unit_price,
il.quantity
FROM invoice_line il
INNER JOIN track t ON t.track_id = il.track_id
INNER JOIN media_type mt ON mt.media_type_id = t.media_type_id
INNER JOIN album al ON al.album_id = t.album_id
INNER JOIN artist ar on ar.artist_id = al.artist_id
WHERE il.invoice_id = 4;"""
table_2=pd.read_sql(q_2, conn)
table_2
track_id | track_name | artist_name | track_type | unit_price | quantity | |
---|---|---|---|---|---|---|
0 | 3448 | Lamentations of Jeremiah, First Set \ Incipit ... | The King's Singers | Protected AAC audio file | 0.99 | 1 |
1 | 2560 | Violent Pornography | System Of A Down | MPEG audio file | 0.99 | 1 |
2 | 3336 | War Pigs | Cake | Purchased AAC audio file | 0.99 | 1 |
3 | 829 | Let's Get Rocked | Def Leppard | MPEG audio file | 0.99 | 1 |
4 | 1872 | Attitude | Metallica | MPEG audio file | 0.99 | 1 |
5 | 748 | Dealer | Deep Purple | MPEG audio file | 0.99 | 1 |
6 | 1778 | You're What's Happening (In The World Today) | Marvin Gaye | MPEG audio file | 0.99 | 1 |
7 | 2514 | Spoonman | Soundgarden | MPEG audio file | 0.99 | 1 |
invoice_line table contains each individual song from each customer purchase, it contains information about which songs are purchased the most. We can use the table to find out which artists are purchased the most.
sub_q = """SELECT
t.track_id,
ar.name artist_name
FROM track t
INNER JOIN album al ON al.album_id = t.album_id
INNER JOIN artist ar ON ar.artist_id = al.artist_id"""
sub_table=pd.read_sql(sub_q, conn)
print(len(sub_table))
sub_table.head()
3503
track_id | artist_name | |
---|---|---|
0 | 1 | AC/DC |
1 | 6 | AC/DC |
2 | 7 | AC/DC |
3 | 8 | AC/DC |
4 | 9 | AC/DC |
q_3= """
SELECT
ta.artist_name artist, il.quantity
FROM invoice_line il
INNER JOIN (
SELECT
t.track_id,
ar.name artist_name
FROM track t
INNER JOIN album al ON al.album_id = t.album_id
INNER JOIN artist ar ON ar.artist_id = al.artist_id
) ta
ON ta.track_id = il.track_id;"""
table_3=pd.read_sql(q_3, conn)
print(len(table_3))
print(table_3.artist.nunique())
table_3.head()
4757 130
artist | quantity | |
---|---|---|
0 | Guns N' Roses | 1 |
1 | Guns N' Roses | 1 |
2 | Guns N' Roses | 1 |
3 | Guns N' Roses | 1 |
4 | Guns N' Roses | 1 |
# Join table with subquery and group by
q_4="""
SELECT
ta.artist_name artist,
COUNT(*) tracks_purchased
FROM invoice_line il
INNER JOIN (
SELECT
t.track_id,
ar.name artist_name
FROM track t
INNER JOIN album al ON al.album_id = t.album_id
INNER JOIN artist ar ON ar.artist_id = al.artist_id
) ta
ON ta.track_id = il.track_id
GROUP BY 1
ORDER BY 2 DESC;"""
table_4 = pd.read_sql(q_4, conn)
print(len(table_4))
table_4.head(10)
130
artist | tracks_purchased | |
---|---|---|
0 | Queen | 192 |
1 | Jimi Hendrix | 187 |
2 | Nirvana | 130 |
3 | Red Hot Chili Peppers | 130 |
4 | Pearl Jam | 129 |
5 | AC/DC | 124 |
6 | Guns N' Roses | 124 |
7 | Foo Fighters | 121 |
8 | The Rolling Stones | 117 |
9 | Metallica | 106 |
q_5="""
SELECT al.title album, ar.name artist, COUNT(*) tracks_purchased
FROM invoice_line il
JOIN track t ON t.track_id=il.track_id
JOIN album al ON al.album_id= t.album_id
JOIN artist ar ON ar.artist_id=al.artist_id
GROUP BY 1
ORDER BY 3 DESC;"""
table_5 = pd.read_sql(q_5, conn)
print(len(table_5))
table_5.head()
251
album | artist | tracks_purchased | |
---|---|---|---|
0 | Are You Experienced? | Jimi Hendrix | 187 |
1 | Faceless | Godsmack | 96 |
2 | Mezmerize | System Of A Down | 93 |
3 | Get Born | JET | 90 |
4 | The Doors | The Doors | 83 |
# same as q_5
q_6="""
SELECT
ta.album_title album,
ta.artist_name artist,
COUNT(*) tracks_purchased
FROM invoice_line il
INNER JOIN (
SELECT
t.track_id,
al.title album_title,
ar.name artist_name
FROM track t
INNER JOIN album al ON al.album_id = t.album_id
INNER JOIN artist ar ON ar.artist_id = al.artist_id
) ta
ON ta.track_id = il.track_id
GROUP BY 1, 2
ORDER BY 3 DESC;"""
table_6 = pd.read_sql(q_6, conn)
print(len(table_6))
table_6.head()
251
album | artist | tracks_purchased | |
---|---|---|---|
0 | Are You Experienced? | Jimi Hendrix | 187 |
1 | Faceless | Godsmack | 96 |
2 | Mezmerize | System Of A Down | 93 |
3 | Get Born | JET | 90 |
4 | The Doors | The Doors | 83 |
# view the table employee
q_7="""
SELECT *
FROM employee"""
table_7=pd.read_sql(q_7, conn)
table_7
employee_id | last_name | first_name | title | reports_to | birthdate | hire_date | address | city | state | country | postal_code | phone | fax | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Adams | Andrew | General Manager | NaN | 1962-02-18 00:00:00 | 2016-08-14 00:00:00 | 11120 Jasper Ave NW | Edmonton | AB | Canada | T5K 2N1 | +1 (780) 428-9482 | +1 (780) 428-3457 | andrew@chinookcorp.com |
1 | 2 | Edwards | Nancy | Sales Manager | 1.0 | 1958-12-08 00:00:00 | 2016-05-01 00:00:00 | 825 8 Ave SW | Calgary | AB | Canada | T2P 2T3 | +1 (403) 262-3443 | +1 (403) 262-3322 | nancy@chinookcorp.com |
2 | 3 | Peacock | Jane | Sales Support Agent | 2.0 | 1973-08-29 00:00:00 | 2017-04-01 00:00:00 | 1111 6 Ave SW | Calgary | AB | Canada | T2P 5M5 | +1 (403) 262-3443 | +1 (403) 262-6712 | jane@chinookcorp.com |
3 | 4 | Park | Margaret | Sales Support Agent | 2.0 | 1947-09-19 00:00:00 | 2017-05-03 00:00:00 | 683 10 Street SW | Calgary | AB | Canada | T2P 5G3 | +1 (403) 263-4423 | +1 (403) 263-4289 | margaret@chinookcorp.com |
4 | 5 | Johnson | Steve | Sales Support Agent | 2.0 | 1965-03-03 00:00:00 | 2017-10-17 00:00:00 | 7727B 41 Ave | Calgary | AB | Canada | T3B 1Y7 | 1 (780) 836-9987 | 1 (780) 836-9543 | steve@chinookcorp.com |
5 | 6 | Mitchell | Michael | IT Manager | 1.0 | 1973-07-01 00:00:00 | 2016-10-17 00:00:00 | 5827 Bowness Road NW | Calgary | AB | Canada | T3B 0C5 | +1 (403) 246-9887 | +1 (403) 246-9899 | michael@chinookcorp.com |
6 | 7 | King | Robert | IT Staff | 6.0 | 1970-05-29 00:00:00 | 2017-01-02 00:00:00 | 590 Columbia Boulevard West | Lethbridge | AB | Canada | T1K 5N8 | +1 (403) 456-9986 | +1 (403) 456-8485 | robert@chinookcorp.com |
7 | 8 | Callahan | Laura | IT Staff | 6.0 | 1968-01-09 00:00:00 | 2017-03-04 00:00:00 | 923 7 ST NW | Lethbridge | AB | Canada | T1H 1Y8 | +1 (403) 467-3351 | +1 (403) 467-8772 | laura@chinookcorp.com |
# Recursive Joins
q_8="""
SELECT
e1.first_name || " " || e1.last_name employee_name,
e1.title employee_title,
e2.first_name || " " || e2.last_name supervisor_name,
e2.title supervisor_title
FROM employee e1
LEFT JOIN employee e2 ON e1.reports_to = e2.employee_id
ORDER BY 1;"""
table_8=pd.read_sql(q_8, conn)
table_8
employee_name | employee_title | supervisor_name | supervisor_title | |
---|---|---|---|---|
0 | Andrew Adams | General Manager | None | None |
1 | Jane Peacock | Sales Support Agent | Nancy Edwards | Sales Manager |
2 | Laura Callahan | IT Staff | Michael Mitchell | IT Manager |
3 | Margaret Park | Sales Support Agent | Nancy Edwards | Sales Manager |
4 | Michael Mitchell | IT Manager | Andrew Adams | General Manager |
5 | Nancy Edwards | Sales Manager | Andrew Adams | General Manager |
6 | Robert King | IT Staff | Michael Mitchell | IT Manager |
7 | Steve Johnson | Sales Support Agent | Nancy Edwards | Sales Manager |
sub="""
SELECT *
FROM invoice i
JOIN customer c
ON c.customer_id = i.customer_id;"""
customer_invoice = pd.read_sql(sub, conn)
# to avoid truncated output
pd.options.display.max_columns = 150
customer_invoice.head()
invoice_id | customer_id | invoice_date | billing_address | billing_city | billing_state | billing_country | billing_postal_code | total | customer_id | first_name | last_name | company | address | city | state | country | postal_code | phone | fax | support_rep_id | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 18 | 2017-01-03 00:00:00 | 627 Broadway | New York | NY | USA | 10012-2612 | 15.84 | 18 | Michelle | Brooks | None | 627 Broadway | New York | NY | USA | 10012-2612 | +1 (212) 221-3546 | +1 (212) 221-4679 | michelleb@aol.com | 3 |
1 | 2 | 30 | 2017-01-03 00:00:00 | 230 Elgin Street | Ottawa | ON | Canada | K2P 1L7 | 9.90 | 30 | Edward | Francis | None | 230 Elgin Street | Ottawa | ON | Canada | K2P 1L7 | +1 (613) 234-3322 | None | edfrancis@yachoo.ca | 3 |
2 | 3 | 40 | 2017-01-05 00:00:00 | 8, Rue Hanovre | Paris | None | France | 75002 | 1.98 | 40 | Dominique | Lefebvre | None | 8, Rue Hanovre | Paris | None | France | 75002 | +33 01 47 42 71 71 | None | dominiquelefebvre@gmail.com | 4 |
3 | 4 | 18 | 2017-01-06 00:00:00 | 627 Broadway | New York | NY | USA | 10012-2612 | 7.92 | 18 | Michelle | Brooks | None | 627 Broadway | New York | NY | USA | 10012-2612 | +1 (212) 221-3546 | +1 (212) 221-4679 | michelleb@aol.com | 3 |
4 | 5 | 27 | 2017-01-07 00:00:00 | 1033 N Park Ave | Tucson | AZ | USA | 85719 | 16.83 | 27 | Patrick | Gray | None | 1033 N Park Ave | Tucson | AZ | USA | 85719 | +1 (520) 622-4200 | None | patrick.gray@aol.com | 4 |
# join customer and invoice tabel, groupby customer name and create a new column
q_9="""
SELECT
c.first_name || " " || c.last_name customer_name,
COUNT(i.invoice_id) number_of_purchases,
SUM(i.total) total_spent,
CASE
WHEN sum(i.total) < 40 THEN 'small spender'
WHEN sum(i.total) > 100 THEN 'big spender'
ELSE 'regular'
END
AS customer_category
FROM invoice i
INNER JOIN customer c ON i.customer_id = c.customer_id
GROUP BY 1 ORDER BY 1;"""
table_9=pd.read_sql(q_9, conn)
table_9.head()
customer_name | number_of_purchases | total_spent | customer_category | |
---|---|---|---|---|
0 | Aaron Mitchell | 8 | 70.29 | regular |
1 | Alexandre Rocha | 10 | 69.30 | regular |
2 | Astrid Gruber | 9 | 69.30 | regular |
3 | Bjørn Hansen | 9 | 72.27 | regular |
4 | Camille Bernard | 9 | 79.20 | regular |
# view the table playlist
q="""
SELECT *
FROM playlist_track"""
playlist_track=pd.read_sql(q,conn)
print(playlist_track.track_id.nunique())
print(playlist_track.playlist_id.nunique())
# total length of playlist is 18, means not all playlist is in playlist_track.
3503 14
# (inner) join three tables
join_from_track="""
SELECT *
FROM track t
JOIN playlist_track pt ON t.track_id=pt.track_id
JOIN playlist p ON p.playlist_id=pt.playlist_id;"""
join_from_track=pd.read_sql(join_from_track,conn)
print(len(join_from_track))
join_from_track.head()
8715
track_id | name | album_id | media_type_id | genre_id | composer | milliseconds | bytes | unit_price | playlist_id | track_id | playlist_id | name | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3402 | Band Members Discuss Tracks from "Revelations" | 271 | 3 | 23 | None | 294294 | 61118891 | 0.99 | 1 | 3402 | 1 | Music |
1 | 3389 | Revelations | 271 | 2 | 23 | None | 252376 | 4111051 | 0.99 | 1 | 3389 | 1 | Music |
2 | 3390 | One and the Same | 271 | 2 | 23 | None | 217732 | 3559040 | 0.99 | 1 | 3390 | 1 | Music |
3 | 3391 | Sound of a Gun | 271 | 2 | 23 | None | 260154 | 4234990 | 0.99 | 1 | 3391 | 1 | Music |
4 | 3392 | Until We Fall | 271 | 2 | 23 | None | 230758 | 3766605 | 0.99 | 1 | 3392 | 1 | Music |
# same as above, this is to show, for (inner) join, table order does not matter
join_from_playlist="""
SELECT *
FROM playlist p
JOIN playlist_track pt ON p.playlist_id=pt.playlist_id
JOIN track t ON t.track_id=pt.track_id;"""
join_from_playlist=pd.read_sql(join_from_playlist,conn)
print(len(join_from_playlist))
join_from_playlist.head()
8715
playlist_id | name | playlist_id | track_id | track_id | name | album_id | media_type_id | genre_id | composer | milliseconds | bytes | unit_price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Music | 1 | 3402 | 3402 | Band Members Discuss Tracks from "Revelations" | 271 | 3 | 23 | None | 294294 | 61118891 | 0.99 |
1 | 1 | Music | 1 | 3389 | 3389 | Revelations | 271 | 2 | 23 | None | 252376 | 4111051 | 0.99 |
2 | 1 | Music | 1 | 3390 | 3390 | One and the Same | 271 | 2 | 23 | None | 217732 | 3559040 | 0.99 |
3 | 1 | Music | 1 | 3391 | 3391 | Sound of a Gun | 271 | 2 | 23 | None | 260154 | 4234990 | 0.99 |
4 | 1 | Music | 1 | 3392 | 3392 | Until We Fall | 271 | 2 | 23 | None | 230758 | 3766605 | 0.99 |
# left join 3 tables
LEFT_join_from_track="""
SELECT *
FROM track t
LEFT JOIN playlist_track pt ON t.track_id=pt.track_id
LEFT JOIN playlist p ON p.playlist_id=pt.playlist_id;"""
LEFT_join_from_track=pd.read_sql(LEFT_join_from_track,conn)
print(len(LEFT_join_from_track))
LEFT_join_from_track.head()
8715
track_id | name | album_id | media_type_id | genre_id | composer | milliseconds | bytes | unit_price | playlist_id | track_id | playlist_id | name | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | For Those About To Rock (We Salute You) | 1 | 1 | 1 | Angus Young, Malcolm Young, Brian Johnson | 343719 | 11170334 | 0.99 | 1 | 1 | 1 | Music |
1 | 1 | For Those About To Rock (We Salute You) | 1 | 1 | 1 | Angus Young, Malcolm Young, Brian Johnson | 343719 | 11170334 | 0.99 | 8 | 1 | 8 | Music |
2 | 1 | For Those About To Rock (We Salute You) | 1 | 1 | 1 | Angus Young, Malcolm Young, Brian Johnson | 343719 | 11170334 | 0.99 | 17 | 1 | 17 | Heavy Metal Classic |
3 | 2 | Balls to the Wall | 2 | 2 | 1 | None | 342562 | 5510424 | 0.99 | 1 | 2 | 1 | Music |
4 | 2 | Balls to the Wall | 2 | 2 | 1 | None | 342562 | 5510424 | 0.99 | 8 | 2 | 8 | Music |
# this is to show different order in table can have different result for left join
LEFT_join_from_playlist="""
SELECT *
FROM playlist p
LEFT JOIN playlist_track pt ON pt.playlist_id = p.playlist_id
LEFT JOIN track t ON t.track_id = pt.track_id"""
LEFT_join_from_playlist=pd.read_sql(LEFT_join_from_playlist,conn)
print(len(LEFT_join_from_playlist))
LEFT_join_from_playlist.head()
8719
playlist_id | name | playlist_id | track_id | track_id | name | album_id | media_type_id | genre_id | composer | milliseconds | bytes | unit_price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Music | 1.0 | 1.0 | 1.0 | For Those About To Rock (We Salute You) | 1.0 | 1.0 | 1.0 | Angus Young, Malcolm Young, Brian Johnson | 343719.0 | 11170334.0 | 0.99 |
1 | 1 | Music | 1.0 | 2.0 | 2.0 | Balls to the Wall | 2.0 | 2.0 | 1.0 | None | 342562.0 | 5510424.0 | 0.99 |
2 | 1 | Music | 1.0 | 3.0 | 3.0 | Fast As a Shark | 3.0 | 2.0 | 1.0 | F. Baltes, S. Kaufman, U. Dirkscneider & W. Ho... | 230619.0 | 3990994.0 | 0.99 |
3 | 1 | Music | 1.0 | 4.0 | 4.0 | Restless and Wild | 3.0 | 2.0 | 1.0 | F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. D... | 252051.0 | 4331779.0 | 0.99 |
4 | 1 | Music | 1.0 | 5.0 | 5.0 | Princess of the Dawn | 3.0 | 2.0 | 1.0 | Deaffy & R.A. Smith-Diesel | 375418.0 | 6290521.0 | 0.99 |
- The result has 8719 rows, instead of 8715 rows for left_join_from_track
sub_join_from_track
sub="""
SELECT
p.playlist_id,
p.name AS playlist_name,
t.name AS track_name,
(t.milliseconds / 1000) length_seconds
FROM track t
JOIN playlist_track pt ON t.track_id=pt.track_id
JOIN playlist p ON p.playlist_id=pt.playlist_id;"""
sub=pd.read_sql(sub,conn)
print(len(sub))
sub.head()
8715
playlist_id | playlist_name | track_name | length_seconds | |
---|---|---|---|---|
0 | 1 | Music | Band Members Discuss Tracks from "Revelations" | 294 |
1 | 1 | Music | Revelations | 252 |
2 | 1 | Music | One and the Same | 217 |
3 | 1 | Music | Sound of a Gun | 260 |
4 | 1 | Music | Until We Fall | 230 |
# name subquery using WITH and AS
q="""
WITH sub AS
(SELECT
p.playlist_id,
p.name AS playlist_name,
t.name AS track_name,
(t.milliseconds / 1000) length_seconds
FROM track t
JOIN playlist_track pt ON t.track_id=pt.track_id
JOIN playlist p ON p.playlist_id=pt.playlist_id
)
SELECT
playlist_id,
playlist_name,
COUNT(playlist_id) number_of_tracks,
SUM(length_seconds) length_seconds
FROM sub
GROUP BY 1
ORDER BY 1
"""
table_sub=pd.read_sql(q,conn)
print(len(table_sub))
table_sub
# GROUP BY 1 and GROUP BY 1,2 are the same
# COUNT(playlist_id), same as COUNT(track_name)
14
playlist_id | playlist_name | number_of_tracks | length_seconds | |
---|---|---|---|---|
0 | 1 | Music | 3290 | 876049 |
1 | 3 | TV Shows | 213 | 500987 |
2 | 5 | 90’s Music | 1477 | 397970 |
3 | 8 | Music | 3290 | 876049 |
4 | 9 | Music Videos | 1 | 294 |
5 | 10 | TV Shows | 213 | 500987 |
6 | 11 | Brazilian Music | 39 | 9464 |
7 | 12 | Classical | 75 | 21736 |
8 | 13 | Classical 101 - Deep Cuts | 25 | 6742 |
9 | 14 | Classical 101 - Next Steps | 25 | 7565 |
10 | 15 | Classical 101 - The Basics | 25 | 7429 |
11 | 16 | Grunge | 15 | 4114 |
12 | 17 | Heavy Metal Classic | 26 | 8189 |
13 | 18 | On-The-Go 1 | 1 | 197 |
sub_LEFT_from_track
sub_LEFT_from_track="""
SELECT
p.playlist_id,
p.name AS playlist_name,
t.name AS track_name,
(t.milliseconds / 1000) length_seconds
FROM track t
LEFT JOIN playlist_track pt ON t.track_id=pt.track_id
LEFT JOIN playlist p ON p.playlist_id=pt.playlist_id;"""
sub_LEFT_from_track=pd.read_sql(sub_LEFT_from_track,conn)
print(len(sub_LEFT_from_track))
sub_LEFT_from_track.head()
8715
playlist_id | playlist_name | track_name | length_seconds | |
---|---|---|---|---|
0 | 1 | Music | For Those About To Rock (We Salute You) | 343 |
1 | 8 | Music | For Those About To Rock (We Salute You) | 343 |
2 | 17 | Heavy Metal Classic | For Those About To Rock (We Salute You) | 343 |
3 | 1 | Music | Balls to the Wall | 342 |
4 | 8 | Music | Balls to the Wall | 342 |
q="""
WITH sub_LEFT AS
(SELECT
p.playlist_id,
p.name AS playlist_name,
t.name AS track_name,
(t.milliseconds / 1000) length_seconds
FROM track t
LEFT JOIN playlist_track pt ON t.track_id=pt.track_id
LEFT JOIN playlist p ON p.playlist_id=pt.playlist_id
)
SELECT
playlist_id,
playlist_name,
COUNT(track_name) number_of_tracks,
SUM(length_seconds) length_seconds
FROM sub_LEFT
GROUP BY 1, 2
ORDER BY 1
"""
table_sub_LEFT_from_track=pd.read_sql(q,conn)
print(len(table_sub_LEFT_from_track))
table_sub_LEFT_from_track
14
playlist_id | playlist_name | number_of_tracks | length_seconds | |
---|---|---|---|---|
0 | 1 | Music | 3290 | 876049 |
1 | 3 | TV Shows | 213 | 500987 |
2 | 5 | 90’s Music | 1477 | 397970 |
3 | 8 | Music | 3290 | 876049 |
4 | 9 | Music Videos | 1 | 294 |
5 | 10 | TV Shows | 213 | 500987 |
6 | 11 | Brazilian Music | 39 | 9464 |
7 | 12 | Classical | 75 | 21736 |
8 | 13 | Classical 101 - Deep Cuts | 25 | 6742 |
9 | 14 | Classical 101 - Next Steps | 25 | 7565 |
10 | 15 | Classical 101 - The Basics | 25 | 7429 |
11 | 16 | Grunge | 15 | 4114 |
12 | 17 | Heavy Metal Classic | 26 | 8189 |
13 | 18 | On-The-Go 1 | 1 | 197 |
sub_LEFT_from_playlist
sub_LEFT_from_playlist="""
SELECT
p.playlist_id,
p.name playlist_name,
t.name track_name,
(t.milliseconds / 1000) length_seconds
FROM playlist p
LEFT JOIN playlist_track pt ON pt.playlist_id = p.playlist_id
LEFT JOIN track t ON t.track_id = pt.track_id"""
sub_LEFT_from_playlist=pd.read_sql(sub_LEFT_from_playlist, conn)
print(len(sub_LEFT_from_playlist))
sub_LEFT_from_playlist.head()
8719
playlist_id | playlist_name | track_name | length_seconds | |
---|---|---|---|---|
0 | 1 | Music | For Those About To Rock (We Salute You) | 343.0 |
1 | 1 | Music | Balls to the Wall | 342.0 |
2 | 1 | Music | Fast As a Shark | 230.0 |
3 | 1 | Music | Restless and Wild | 252.0 |
4 | 1 | Music | Princess of the Dawn | 375.0 |
q="""
WITH playlist_info AS
(
SELECT
p.playlist_id,
p.name playlist_name,
t.name track_name,
(t.milliseconds / 1000) length_seconds
FROM playlist p
LEFT JOIN playlist_track pt ON pt.playlist_id = p.playlist_id
LEFT JOIN track t ON t.track_id = pt.track_id
)
SELECT
playlist_id,
playlist_name,
COUNT(track_name) number_of_tracks,
SUM(length_seconds) length_seconds
FROM playlist_info
GROUP BY 1,2
ORDER BY 1;"""
table_sub_LEFT_from_playlist=pd.read_sql(q, conn)
print(len(table_sub_LEFT_from_playlist))
table_sub_LEFT_from_playlist
18
playlist_id | playlist_name | number_of_tracks | length_seconds | |
---|---|---|---|---|
0 | 1 | Music | 3290 | 876049.0 |
1 | 2 | Movies | 0 | NaN |
2 | 3 | TV Shows | 213 | 500987.0 |
3 | 4 | Audiobooks | 0 | NaN |
4 | 5 | 90’s Music | 1477 | 397970.0 |
5 | 6 | Audiobooks | 0 | NaN |
6 | 7 | Movies | 0 | NaN |
7 | 8 | Music | 3290 | 876049.0 |
8 | 9 | Music Videos | 1 | 294.0 |
9 | 10 | TV Shows | 213 | 500987.0 |
10 | 11 | Brazilian Music | 39 | 9464.0 |
11 | 12 | Classical | 75 | 21736.0 |
12 | 13 | Classical 101 - Deep Cuts | 25 | 6742.0 |
13 | 14 | Classical 101 - Next Steps | 25 | 7565.0 |
14 | 15 | Classical 101 - The Basics | 25 | 7429.0 |
15 | 16 | Grunge | 15 | 4114.0 |
16 | 17 | Heavy Metal Classic | 26 | 8189.0 |
17 | 18 | On-The-Go 1 | 1 | 197.0 |
CREATE VIEW (create a table can be reused), view can not be run on Jupyter notebook
q="""
SELECT c.*, i.*
FROM invoice i
INNER JOIN customer c ON i.customer_id = c.customer_id
GROUP BY 1
HAVING SUM(i.total) > 90
"""
i_c_90=pd.read_sql(q,conn)
print(len(i_c_90))
i_c_90
18
customer_id | first_name | last_name | company | address | city | state | country | postal_code | phone | fax | support_rep_id | invoice_id | customer_id | invoice_date | billing_address | billing_city | billing_state | billing_country | billing_postal_code | total | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Luís | Gonçalves | Embraer - Empresa Brasileira de Aeronáutica S.A. | Av. Brigadeiro Faria Lima, 2170 | São José dos Campos | SP | Brazil | 12227-000 | +55 (12) 3923-5555 | +55 (12) 3923-5566 | luisg@embraer.com.br | 3 | 16 | 1 | 2017-01-26 00:00:00 | Av. Brigadeiro Faria Lima, 2170 | São José dos Campos | SP | Brazil | 12227-000 | 8.91 |
1 | 3 | François | Tremblay | None | 1498 rue Bélanger | Montréal | QC | Canada | H2G 1A7 | +1 (514) 721-4711 | None | ftremblay@gmail.com | 3 | 31 | 3 | 2017-02-21 00:00:00 | 1498 rue Bélanger | Montréal | QC | Canada | H2G 1A7 | 19.80 |
2 | 5 | František | Wichterlová | JetBrains s.r.o. | Klanova 9/506 | Prague | None | Czech Republic | 14700 | +420 2 4172 5555 | +420 2 4172 5555 | frantisekw@jetbrains.com | 4 | 78 | 5 | 2017-05-29 00:00:00 | Klanova 9/506 | Prague | None | Czech Republic | 14700 | 8.91 |
3 | 6 | Helena | Holý | None | Rilská 3174/6 | Prague | None | Czech Republic | 14300 | +420 2 4177 0449 | None | hholy@gmail.com | 5 | 112 | 6 | 2017-08-31 00:00:00 | Rilská 3174/6 | Prague | None | Czech Republic | 14300 | 16.83 |
4 | 13 | Fernanda | Ramos | None | Qe 7 Bloco G | Brasília | DF | Brazil | 71020-677 | +55 (61) 3363-5547 | +55 (61) 3363-7855 | fernadaramos4@uol.com.br | 4 | 73 | 13 | 2017-05-18 00:00:00 | Qe 7 Bloco G | Brasília | DF | Brazil | 71020-677 | 14.85 |
5 | 17 | Jack | Smith | Microsoft Corporation | 1 Microsoft Way | Redmond | WA | USA | 98052-8300 | +1 (425) 882-8080 | +1 (425) 882-8081 | jacksmith@microsoft.com | 5 | 98 | 17 | 2017-07-23 00:00:00 | 1 Microsoft Way | Redmond | WA | USA | 98052-8300 | 9.90 |
6 | 20 | Dan | Miller | None | 541 Del Medio Avenue | Mountain View | CA | USA | 94040-111 | +1 (650) 644-3358 | None | dmiller@comcast.com | 4 | 17 | 20 | 2017-01-28 00:00:00 | 541 Del Medio Avenue | Mountain View | CA | USA | 94040-111 | 10.89 |
7 | 21 | Kathy | Chase | None | 801 W 4th Street | Reno | NV | USA | 89503 | +1 (775) 223-7665 | None | kachase@hotmail.com | 5 | 44 | 21 | 2017-03-20 00:00:00 | 801 W 4th Street | Reno | NV | USA | 89503 | 11.88 |
8 | 22 | Heather | Leacock | None | 120 S Orange Ave | Orlando | FL | USA | 32801 | +1 (407) 999-7788 | None | hleacock@gmail.com | 4 | 43 | 22 | 2017-03-19 00:00:00 | 120 S Orange Ave | Orlando | FL | USA | 32801 | 6.93 |
9 | 30 | Edward | Francis | None | 230 Elgin Street | Ottawa | ON | Canada | K2P 1L7 | +1 (613) 234-3322 | None | edfrancis@yachoo.ca | 3 | 2 | 30 | 2017-01-03 00:00:00 | 230 Elgin Street | Ottawa | ON | Canada | K2P 1L7 | 9.90 |
10 | 34 | João | Fernandes | None | Rua da Assunção 53 | Lisbon | None | Portugal | None | +351 (213) 466-111 | None | jfernandes@yahoo.pt | 4 | 35 | 34 | 2017-02-25 00:00:00 | Rua da Assunção 53 | Lisbon | None | Portugal | None | 4.95 |
11 | 37 | Fynn | Zimmermann | None | Berger Straße 10 | Frankfurt | None | Germany | 60316 | +49 069 40598889 | None | fzimmermann@yahoo.de | 3 | 40 | 37 | 2017-03-13 00:00:00 | Berger Straße 10 | Frankfurt | None | Germany | 60316 | 9.90 |
12 | 42 | Wyatt | Girard | None | 9, Place Louis Barthou | Bordeaux | None | France | 33000 | +33 05 56 96 96 96 | None | wyatt.girard@yahoo.fr | 3 | 12 | 42 | 2017-01-21 00:00:00 | 9, Place Louis Barthou | Bordeaux | None | France | 33000 | 3.96 |
13 | 46 | Hugh | O'Reilly | None | 3 Chatham Street | Dublin | Dublin | Ireland | None | +353 01 6792424 | None | hughoreilly@apple.ie | 3 | 30 | 46 | 2017-02-18 00:00:00 | 3 Chatham Street | Dublin | Dublin | Ireland | None | 10.89 |
14 | 50 | Enrique | Muñoz | None | C/ San Bernardo 85 | Madrid | None | Spain | 28015 | +34 914 454 454 | None | enrique_munoz@yahoo.es | 5 | 41 | 50 | 2017-03-17 00:00:00 | C/ San Bernardo 85 | Madrid | None | Spain | 28015 | 7.92 |
15 | 53 | Phil | Hughes | None | 113 Lupus St | London | None | United Kingdom | SW1V 3EN | +44 020 7976 5722 | None | phil.hughes@gmail.com | 3 | 23 | 53 | 2017-02-09 00:00:00 | 113 Lupus St | London | None | United Kingdom | SW1V 3EN | 9.90 |
16 | 57 | Luis | Rojas | None | Calle Lira, 198 | Santiago | None | Chile | None | +56 (0)2 635 4444 | None | luisrojas@yahoo.cl | 5 | 47 | 57 | 2017-03-25 00:00:00 | Calle Lira, 198 | Santiago | None | Chile | None | 5.94 |
17 | 58 | Manoj | Pareek | None | 12,Community Centre | Delhi | None | India | 110017 | +91 0124 39883988 | None | manoj.pareek@rediff.com | 3 | 33 | 58 | 2017-02-21 00:00:00 | 12,Community Centre | Delhi | None | India | 110017 | 3.96 |