In this project we will be working with a modified version of a database called Chinook. The Chinook database contains information about a fictional digital music shop - a smaller version of iTunes.
The database contains information about the artists, songs, and albums from the music shop, as well as information on the shop's employees, customers, and their purchases. This information is contained in eleven tables. Here is a schema diagram for the Chinook database:
%%capture
%load_ext sql
%sql sqlite:///chinook.db
# Connect to the database
# Import python libraries
import sqlite3
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%%sql
/*# Database overview */
SELECT name, type
FROM sqlite_master
WHERE type = 'table'
* sqlite:///chinook.db Done.
name | type |
---|---|
album | table |
artist | table |
customer | table |
employee | table |
genre | table |
invoice | table |
invoice_line | table |
media_type | table |
playlist | table |
playlist_track | table |
track | table |
Here we will display the following data for each country where customers have bought tracks in the store:
Because there are several countries with only one customer we will group these into a row named "Other" and keep it on the bottom of the table after sorting.
%%sql
/*# Display aggregate sales data per country */
WITH country_countrynew AS
(
SELECT country,
CASE
WHEN COUNT(DISTINCT customer_id) <= 1 THEN 'Other'
ELSE country
END
AS country_new
FROM customer
GROUP BY country
)
SELECT country_new AS country,
COUNT(DISTINCT c.customer_id) AS customers,
ROUND(SUM(i.total), 2) AS sales,
ROUND(SUM(i.total) / COUNT(DISTINCT c.customer_id), 2) AS avg_sales_per_customer,
ROUND(SUM(i.total) / COUNT(DISTINCT i.invoice_id), 2) AS avg_sales_per_order
FROM customer c
INNER JOIN invoice as i ON i.customer_id = c.customer_id
INNER JOIN country_countrynew AS ccn ON ccn.country = c.country
GROUP BY country_new
ORDER BY (country_new='Other'), sales DESC;
* sqlite:///chinook.db Done.
country | customers | sales | avg_sales_per_customer | avg_sales_per_order |
---|---|---|---|---|
USA | 13 | 1040.49 | 80.04 | 7.94 |
Canada | 8 | 535.59 | 66.95 | 7.05 |
Brazil | 5 | 427.68 | 85.54 | 7.01 |
France | 5 | 389.07 | 77.81 | 7.78 |
Germany | 4 | 334.62 | 83.66 | 8.16 |
Czech Republic | 2 | 273.24 | 136.62 | 9.11 |
United Kingdom | 3 | 245.52 | 81.84 | 8.77 |
Portugal | 2 | 185.13 | 92.57 | 6.38 |
India | 2 | 183.15 | 91.58 | 8.72 |
Other | 15 | 1094.94 | 73.0 | 7.45 |
# Create variable to be passed into pd.read_sql
query_country = """WITH country_countrynew AS
(
SELECT country,
CASE
WHEN COUNT(DISTINCT customer_id) <= 1 THEN 'Other'
ELSE country
END
AS country_new
FROM customer
GROUP BY country
)
SELECT country_new AS country,
COUNT(DISTINCT c.customer_id) AS customers,
ROUND(SUM(i.total), 2) AS sales,
ROUND(SUM(i.total) / COUNT(DISTINCT c.customer_id), 2) AS avg_sales_per_customer,
ROUND(SUM(i.total) / COUNT(DISTINCT i.invoice_id), 2) AS avg_sales_per_order
FROM customer c
INNER JOIN invoice as i ON i.customer_id = c.customer_id
INNER JOIN country_countrynew AS ccn ON ccn.country = c.country
GROUP BY country_new
ORDER BY (country_new='Other'), sales DESC;"""
# Connect to database
conn = sqlite3.connect('chinook.db')
# Read into dataframe
country_stats = pd.read_sql_query(query_country, conn)
# Reset index and create list of countries to pass as x-axis labels
country_stats = country_stats.set_index('country', drop=True)
country_list = country_stats.index.tolist()
# Plot
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10,10))
fig.subplots_adjust(hspace=0.4)
plt.rcParams['figure.dpi'] = 460
ax1 = plt.subplot(2,2,1)
ax1 = sns.barplot(x=country_list, y='sales', data=country_stats, color="steelblue")
ax1.set_title('Sales By Country (USD)', fontsize=13)
ax1.set_ylabel('')
ax1.tick_params(labelsize=9)
plt.xticks(rotation = 45, ha='right')
sns.despine(left=True)
sns.set(style="whitegrid")
ax2 = plt.subplot(2,2,2)
ax2 = sns.barplot(x=country_list, y='avg_sales_per_customer', data=country_stats, color="steelblue")
ax2.set_title('Average Sales Per Customer (USD)', fontsize=13)
ax2.set_ylabel('')
ax2.set_ylim([50,140])
ax2.tick_params(labelsize=9)
plt.xticks(rotation = 45, ha='right')
sns.despine(left=True)
sns.set(style="whitegrid")
ax3 = plt.subplot(2,2,3)
ax3 = sns.barplot(x=country_list, y='avg_sales_per_order', data=country_stats, color="steelblue")
ax3.set_title('Average Sales Per Order (USD)', fontsize=13)
ax3.set_ylabel('')
ax3.set_ylim([5,10])
ax3.tick_params(labelsize=9)
plt.xticks(rotation = 45, ha='right')
sns.despine(left=True)
sns.set(style="whitegrid")
ax4 = plt.subplot(2,2,4)
ax4 = sns.barplot(x=country_list, y='customers', data=country_stats, color="steelblue")
ax4.set_title('Number Of Customers', fontsize=13)
ax4.set_ylabel('')
ax4.tick_params(labelsize=9)
plt.xticks(rotation = 45, ha='right')
sns.despine(left=True)
sns.set(style="whitegrid")
Next, will look at the follwing popularity metrics:
The Chinook record store has just signed a deal with a new record label, and they are considering three albums that will be added to the store from a list of four:
Since the record label specializes on artists from the USA we are interested in finding out which of these genres sell best in across the country.
%%sql
/*# Display all genres in the USA sorted by the number of tracks sold */
WITH usa_tracks AS
(
SELECT t.genre_id AS genre_id, il.invoice_line_id
FROM track as t
INNER JOIN invoice_line as il ON il.track_id=t.track_id
INNER JOIN invoice as i ON i.invoice_id = il.invoice_id
WHERE i.billing_country = 'USA'
)
SELECT g.name,
COUNT(usa.genre_id) AS tracks_sold,
ROUND(CAST(COUNT(usa.genre_id) AS FLOAT)/ (SELECT COUNT(genre_id)
FROM usa_tracks),2) AS percentage
FROM usa_tracks AS usa
INNER JOIN genre as g ON g.genre_id = usa.genre_id
GROUP BY g.name
ORDER BY tracks_sold DESC;
* sqlite:///chinook.db Done.
name | tracks_sold | percentage |
---|---|---|
Rock | 561 | 0.53 |
Alternative & Punk | 130 | 0.12 |
Metal | 124 | 0.12 |
R&B/Soul | 53 | 0.05 |
Blues | 36 | 0.03 |
Alternative | 35 | 0.03 |
Pop | 22 | 0.02 |
Latin | 22 | 0.02 |
Hip Hop/Rap | 20 | 0.02 |
Jazz | 14 | 0.01 |
Easy Listening | 13 | 0.01 |
Reggae | 6 | 0.01 |
Electronica/Dance | 5 | 0.0 |
Classical | 4 | 0.0 |
Heavy Metal | 3 | 0.0 |
Soundtrack | 2 | 0.0 |
TV Shows | 1 | 0.0 |
# Create variable to pass into pd.read_sql
query_genre = """ WITH usa_tracks AS
(
SELECT t.genre_id AS genre_id, il.invoice_line_id
FROM track as t
INNER JOIN invoice_line as il ON il.track_id=t.track_id
INNER JOIN invoice as i ON i.invoice_id = il.invoice_id
WHERE i.billing_country = 'USA'
)
SELECT g.name,
COUNT(usa.genre_id) AS tracks_sold,
ROUND(CAST(COUNT(usa.genre_id) AS FLOAT)/ (SELECT COUNT(genre_id)
FROM usa_tracks),2) AS percentage
FROM usa_tracks AS usa
INNER JOIN genre as g ON g.genre_id = usa.genre_id
GROUP BY g.name
ORDER BY tracks_sold DESC;"""
# Connect to database
conn = sqlite3.connect('chinook.db')
# Read into dataframe
genre = pd.read_sql_query(query_genre, conn)
# Label genres with less than 20 tracks sold to 'All Others' for pie chart
genre['name'] = genre['name'].mask(genre['tracks_sold'] <20, 'All Others')
# Group, sort, and slice for the pie chart
genre = genre.groupby('name').sum()
genre = genre.sort_values(by='tracks_sold', ascending=False)
genre = genre.iloc[:,0]
genre
name Rock 561 Alternative & Punk 130 Metal 124 R&B/Soul 53 All Others 48 Blues 36 Alternative 35 Latin 22 Pop 22 Hip Hop/Rap 20 Name: tracks_sold, dtype: int64
# Get 'All Others' to bottom of series for pie chart
genre = genre.drop('All Others')
temp = pd.Series(48)
temp = temp.rename({0:'All Others'})
genre = genre.append(temp)
explode = (0, 0.15, 0, 0, 0, 0, 0, 0.2, 0.2, 0)
# Plot
fig, ax = plt.subplots(figsize=(14,14))
genre.plot(kind='pie', fontsize=18, explode=explode, startangle=40, labeldistance=1.05)
# Plot Aesthetics
ax.set_ylabel('')
ax.set_title('Top Selling Genres (USA)', fontsize=24)
Text(0.5, 1.0, 'Top Selling Genres (USA)')
%%sql
/*# Breakdown of how many distinct tracks in each playlist */
SELECT playlist_id,
COUNT(DISTINCT track_id)
FROM playlist_track group by playlist_id ;
* sqlite:///chinook.db Done.
playlist_id | COUNT(DISTINCT track_id) |
---|---|
1 | 3290 |
3 | 213 |
5 | 1477 |
8 | 3290 |
9 | 1 |
10 | 213 |
11 | 39 |
12 | 75 |
13 | 25 |
14 | 25 |
15 | 25 |
16 | 15 |
17 | 26 |
18 | 1 |
%%sql
/*# Display total isntances of an artist across all playlists */
WITH artist_trackid AS
(
SELECT a.name artist_name,
t.track_id
FROM artist a
INNER JOIN album alb ON alb.artist_id = a.artist_id
INNER JOIN track t ON t.album_id = alb.album_id
)
SELECT at.artist_name,
COUNT(pt.playlist_id) tracks_in_playlists
FROM artist_trackid AS at
INNER JOIN playlist_track AS pt ON pt.track_id=at.track_id
GROUP BY at.artist_name
ORDER BY COUNT(pt.playlist_id) DESC
LIMIT 10;
* sqlite:///chinook.db Done.
artist_name | tracks_in_playlists |
---|---|
Iron Maiden | 516 |
U2 | 333 |
Metallica | 296 |
Led Zeppelin | 252 |
Deep Purple | 226 |
Lost | 184 |
Pearl Jam | 177 |
Faith No More | 145 |
Eric Clapton | 145 |
Lenny Kravitz | 143 |
# Create variable to pass into pd.read_sql
query_playlists = """ WITH artist_trackid AS
(
SELECT a.name artist_name,
t.track_id
FROM artist a
INNER JOIN album alb ON alb.artist_id = a.artist_id
INNER JOIN track t ON t.album_id = alb.album_id
)
SELECT at.artist_name,
COUNT(pt.playlist_id) tracks_in_playlists
FROM artist_trackid AS at
INNER JOIN playlist_track AS pt ON pt.track_id=at.track_id
GROUP BY at.artist_name
ORDER BY COUNT(pt.playlist_id) DESC
LIMIT 10;"""
# Connect to database
conn = sqlite3.connect('chinook.db')
# Read into dataframe
playlists = pd.read_sql_query(query_playlists, conn)
# Plot
fig, ax = plt.subplots(figsize=(4,2))
ax = sns.barplot(x='artist_name', y='tracks_in_playlists', data=playlists, color="steelblue")
# Plot Aesthetics
ax.set_title('Number Of Appearances In Playlists', fontsize=8)
ax.set_ylabel('')
ax.set_xlabel('')
ax.tick_params(labelsize=5)
plt.xticks(rotation = 45, ha='right')
sns.despine(left=True)
sns.set(style="whitegrid")
plt.rcParams['figure.dpi'] = 460
To answer this question, we will have to distriguish between the entire inventory of tracks in the track
table and the distinct instances of tracks from the invoice_line
table.
%%sql
/*# Display the total amount of tracks available v. the total tracks purchased */
WITH all_and_purchased AS
(
SELECT t.track_id all_tracks, il.track_id AS purchased_tracks
FROM track AS t
LEFT JOIN invoice_line il ON il.track_id=t.track_id
)
SELECT COUNT(DISTINCT a.all_tracks) total_tracks,
COUNT(DISTINCT a.purchased_tracks) tracks_purchased,
COUNT(DISTINCT a.all_tracks) - COUNT(DISTINCT a.purchased_tracks) not_purchased,
ROUND(COUNT(DISTINCT a.purchased_tracks) / CAST(COUNT(DISTINCT a.all_tracks) AS float), 2)percent_purchased
FROM all_and_purchased AS a;
* sqlite:///chinook.db Done.
total_tracks | tracks_purchased | not_purchased | percent_purchased |
---|---|---|---|
3503 | 1806 | 1697 | 0.52 |
# Create variable to pass into pd.read_sql
query_purchased = """ WITH all_and_purchased AS
(
SELECT t.track_id all_tracks, il.track_id AS purchased_tracks
FROM track AS t
LEFT JOIN invoice_line il ON il.track_id=t.track_id
)
SELECT COUNT(DISTINCT a.all_tracks) total_tracks,
COUNT(DISTINCT a.purchased_tracks) tracks_purchased,
COUNT(DISTINCT a.all_tracks) - COUNT(DISTINCT a.purchased_tracks) not_purchased,
ROUND(COUNT(DISTINCT a.purchased_tracks) / CAST(COUNT(DISTINCT a.all_tracks) AS float), 2)percent_purchased
FROM all_and_purchased AS a;"""
# Connect to database
conn = sqlite3.connect('chinook.db')
# Read into dataframe
purchased = pd.read_sql_query(query_purchased, conn)
purchased
total_tracks | tracks_purchased | not_purchased | percent_purchased | |
---|---|---|---|---|
0 | 3503 | 1806 | 1697 | 0.52 |
# Transform dataframe and cast it to a list for plt.pie
purchased = purchased.T
purchased = purchased.iloc[[1,2],0]
purchased_list = purchased.tolist()
# Plot
plt.pie(purchased_list,
labels=('Purchased', 'Not Purchased'),
explode=(0,0.03),
startangle=90,
autopct='%1.1f%%',
textprops={'fontsize': 4})
# Plot Aesthetics
plt.title('Tracks Purchased v. Not Purchased', fontsize=5)
fig = plt.gcf()
fig.set_size_inches(2,2)
%%sql
/*# Display artists with zero units sold */
SELECT ar.name artist_name,
g.name genre,
COUNT(il.track_id) units_sold
FROM track t
LEFT JOIN invoice_line il ON il.track_id = t.track_id
INNEr JOIN album al ON al.album_id = t.album_id
INNER JOIN artist ar ON ar.artist_id = al.artist_id
INNER JOIN genre as g ON g.genre_id = t.genre_id
GROUP BY artist_name
HAVING units_sold = 0
ORDER BY units_sold;
* sqlite:///chinook.db Done.
artist_name | genre | units_sold |
---|---|---|
Aaron Copland & London Symphony Orchestra | Classical | 0 |
Academy of St. Martin in the Fields Chamber Ensemble & Sir Neville Marriner | Classical | 0 |
Academy of St. Martin in the Fields, John Birch, Sir Neville Marriner & Sylvia McNair | Classical | 0 |
Academy of St. Martin in the Fields, Sir Neville Marriner & Thurston Dart | Classical | 0 |
Adrian Leaper & Doreen de Feis | Classical | 0 |
Alberto Turco & Nova Schola Gregoriana | Classical | 0 |
Anne-Sophie Mutter, Herbert Von Karajan & Wiener Philharmoniker | Classical | 0 |
Antal Doráti & London Symphony Orchestra | Classical | 0 |
Aquaman | TV Shows | 0 |
BackBeat | Rock And Roll | 0 |
Barry Wordsworth & BBC Concert Orchestra | Classical | 0 |
Battlestar Galactica (Classic) | Sci Fi & Fantasy | 0 |
Berliner Philharmoniker & Hans Rosbaud | Classical | 0 |
Berliner Philharmoniker, Claudio Abbado & Sabine Meyer | Classical | 0 |
Boston Symphony Orchestra & Seiji Ozawa | Classical | 0 |
Britten Sinfonia, Ivor Bolton & Lesley Garrett | Classical | 0 |
C. Monteverdi, Nigel Rogers - Chiaroscuro; London Baroque; London Cornett & Sackbu | Classical | 0 |
Charles Dutoit & L'Orchestre Symphonique de Montréal | Classical | 0 |
Chicago Symphony Chorus, Chicago Symphony Orchestra & Sir Georg Solti | Classical | 0 |
Chicago Symphony Orchestra & Fritz Reiner | Classical | 0 |
Choir Of Westminster Abbey & Simon Preston | Classical | 0 |
Chor der Wiener Staatsoper, Herbert Von Karajan & Wiener Philharmoniker | Classical | 0 |
Cláudio Zoli | Latin | 0 |
Dennis Chambers | Jazz | 0 |
Edo de Waart & San Francisco Symphony | Classical | 0 |
Elis Regina | Latin | 0 |
Emanuel Ax, Eugene Ormandy & Philadelphia Orchestra | Classical | 0 |
English Concert & Trevor Pinnock | Classical | 0 |
Equale Brass Ensemble, John Eliot Gardiner & Munich Monteverdi Orchestra and Choir | Classical | 0 |
Eugene Ormandy | Classical | 0 |
Felix Schmidt, London Symphony Orchestra & Rafael Frühbeck de Burgos | Classical | 0 |
Frank Zappa & Captain Beefheart | Rock | 0 |
Fretwork | Classical | 0 |
Gene Krupa | Jazz | 0 |
Gerald Moore | Classical | 0 |
Göteborgs Symfoniker & Neeme Järvi | Classical | 0 |
Habib Koité and Bamada | World | 0 |
Herbert Von Karajan, Mirella Freni & Wiener Philharmoniker | Classical | 0 |
Heroes | TV Shows | 0 |
Hilary Hahn, Jeffrey Kahane, Los Angeles Chamber Orchestra & Margaret Batjer | Classical | 0 |
João Suplicy | World | 0 |
Julian Bream | Classical | 0 |
Kent Nagano and Orchestre de l'Opéra de Lyon | Classical | 0 |
Leonard Bernstein & New York Philharmonic | Classical | 0 |
Les Arts Florissants & William Christie | Classical | 0 |
London Symphony Orchestra & Sir Charles Mackerras | Classical | 0 |
Martin Roscoe | Classical | 0 |
Mela Tenenbaum, Pro Musica Prague & Richard Kapp | Classical | 0 |
Michael Tilson Thomas & San Francisco Symphony | Classical | 0 |
Michele Campanella | Classical | 0 |
Mônica Marianno | World | 0 |
Nash Ensemble | Classical | 0 |
O Terço | Rock | 0 |
Orchestra of The Age of Enlightenment | Classical | 0 |
Orchestre Révolutionnaire et Romantique & John Eliot Gardiner | Classical | 0 |
Otto Klemperer & Philharmonia Orchestra | Classical | 0 |
Paul D'Ianno | Rock | 0 |
Philharmonia Orchestra & Sir Neville Marriner | Classical | 0 |
Philip Glass Ensemble | Soundtrack | 0 |
R.E.M. Feat. Kate Pearson | Alternative & Punk | 0 |
Richard Marlow & The Choir of Trinity College, Cambridge | Classical | 0 |
Roger Norrington, London Classical Players | Classical | 0 |
Royal Philharmonic Orchestra & Sir Thomas Beecham | Classical | 0 |
Sergei Prokofiev & Yuri Temirkanov | Classical | 0 |
Sir Georg Solti & Wiener Philharmoniker | Classical | 0 |
Sir Georg Solti, Sumi Jo & Wiener Philharmoniker | Opera | 0 |
Smashing Pumpkins | Alternative & Punk | 0 |
Stevie Ray Vaughan & Double Trouble | Blues | 0 |
Terry Bozzio, Tony Levin & Steve Stevens | Rock | 0 |
The Office | TV Shows | 0 |
The Tea Party | Alternative & Punk | 0 |
Ton Koopman | Classical | 0 |
Toquinho & Vinícius | Bossa Nova | 0 |
Various Artists | Pop | 0 |
Seventy-four artists have zero units sold. Most of these tracks are classical music.
Since the company purchases tracks from record companies to place in the store, half of the inventory (part of the working capital of the business) is stagnant and not producing a return.
Although we do not know the financial arrangement between the company and the record label, we can offer recommandations based on two possible cases:
If the company has paid a flat fee to host these tracks in the store, this is a sunk cost. Moving forward, Chinook may want to stop signing these artists and focus on more popular genres.
If the company pays the record label a percentage of sales, there is no real harm in keeping the tracks for sale in the store.
In either case, the company should consider finding a way to get these artists in front of the eyes of the customers. Chinook could place suggestions for these artists on the webpage as the customer is purchasing other tracks or on the 'view cart' page. This would increase exposure for the low-selling artists.
Any given track sold by Chinook is either protected or not protected. This refers to the Digital Rights Managment properties of the track. If it is protected, the purchaser is not able to make copies and is restricted to listening the the platform from which they bought it (iTunes for example). An unprotected track allows the purchaser to make copies and listen to it wherever they wish.
In Chinook protected tracks are identified by the following media_type_id
's:
Non-Protected tracks are identified by these media_type_id
's:
To find out whether the protected status of a tracks impacts the sales of the track, we will complete two steps:
First we will look at the entire inventory of tracks and determine the percentage of protected v. non-protected. Then we will retreive the same percentage breakdown of tracks that have actaully sold (track_id
from the invoice_line
table) and compare the two.
%%sql
/*# Display percentage breakdown from the entire track inventory */
WITH type_totals AS
(
SELECT CASE
WHEN mt.name LIKE 'Protected%' THEN 'Protected'
ELSE 'Non-Protected'
END
AS type
FROM track t
INNER JOIN media_type mt ON mt.media_type_id = t.media_type_id
)
SELECT type,
COUNT(type) AS total,
ROUND(COUNT(type) * 100.0 / (SELECT COUNT(*) FROM type_totals), 2) AS percentage
FROM type_totals
GROUP BY type;
* sqlite:///chinook.db Done.
type | total | percentage |
---|---|---|
Non-Protected | 3052 | 87.13 |
Protected | 451 | 12.87 |
%%sql
/*# Display percentage breakdown from just the tracks that have sold */
WITH type_totals AS
(
SELECT CASE
WHEN mt.name LIKE 'Protected%' THEN 'Protected'
ELSE 'Non-Protected'
END
AS type
FROM invoice_line il
INNER JOIN track t ON t.track_id = il.track_id
INNER JOIN media_type mt ON mt.media_type_id = t.media_type_id
)
SELECT type,
COUNT(type) AS total,
ROUND(COUNT(type) * 100.0 / (SELECT COUNT(*) FROM type_totals), 2) AS percentage
FROM type_totals
GROUP BY type;
* sqlite:///chinook.db Done.
type | total | percentage |
---|---|---|
Non-Protected | 4315 | 90.71 |
Protected | 442 | 9.29 |
# Create variable for sold tracks to pass into pd.read_sql
query_protected_sold = """ WITH type_totals AS
(
SELECT CASE
WHEN mt.name LIKE 'Protected%' THEN 'Protected'
ELSE 'Non-Protected'
END
AS type
FROM invoice_line il
INNER JOIN track t ON t.track_id = il.track_id
INNER JOIN media_type mt ON mt.media_type_id = t.media_type_id
)
SELECT type,
COUNT(type) AS total,
ROUND(COUNT(type) * 100.0 / (SELECT COUNT(*) FROM type_totals), 2) AS percentage
FROM type_totals
GROUP BY type;"""
# Create variable for all tracks to pass into pd.read_sql
query_protected_all = """WITH type_totals AS
(
SELECT CASE
WHEN mt.name LIKE 'Protected%' THEN 'Protected'
ELSE 'Non-Protected'
END
AS type
FROM track t
INNER JOIN media_type mt ON mt.media_type_id = t.media_type_id
)
SELECT type,
COUNT(type) AS total,
ROUND(COUNT(type) * 100.0 / (SELECT COUNT(*) FROM type_totals), 2) AS percentage
FROM type_totals
GROUP BY type;"""
# Connect to database
conn = sqlite3.connect('chinook.db')
# Read into dataframe
protected_sold = pd.read_sql_query(query_protected_sold, conn)
protected_all = pd.read_sql_query(query_protected_all, conn)
# Transform dataframe and cast it to a list for plt.pie
protected_sold = protected_sold.set_index('type')
protected_sold = protected_sold.iloc[[0,1],0]
list_sold = protected_sold.tolist()
# Transform dataframe and cast it to a list for plt.pie
protected_all = protected_all.set_index('type')
protected_all = protected_all.iloc[[0,1],0]
list_all = protected_all.tolist()
# Plot
fig, (ax1, ax2) = plt.subplots(1,2)
patches, texts, autotexts = ax1.pie(list_all,
explode=(0,0.03),
startangle=90,
autopct='%1.1f%%',
textprops={'fontsize': 10})
patches, texts, autotexts = ax2.pie(list_sold,
explode=(0,0.03),
startangle=90,
autopct='%1.1f%%',
textprops={'fontsize': 10})
# Plot Aesthetics
ax1.set_title("All Tracks")
ax2.set_title("Sold Tracks")
fig.suptitle("Protected v. Not Protected", fontsize=16)
fig.legend(protected_sold.index, loc='upper left', prop={'size': 8})
plt.tight_layout()
The store is setup in a way that allows customer to make purchases in one of the two ways:
The store does not let customers purchase a whole album and then add individual tracks to that same purchase (unless they do that by choosing each track manually). When customers purchase albums they are charged the same price as if they had purchased each of those tracks separately.
Management are currently considering changing their purchasing strategy to save money. The strategy they are considering is to purchase only the most popular tracks from each album from record companies, instead of purchasing every track from an album.
We have been asked to find out what percentage of purchases are individual tracks vs whole albums, so that management can use this data to understand the effect this decision might have on overall revenue.
We are not going to consider the following uncommon situations:
Since our analysis is concerned with maximizing revenue we can safely ignore albums consisting of only a few tracks. The company has previously done analysis to confirm that the second situation does not happen often, so we can ignore this.
In order to answer the question, we're going to have to identify whether each invoice has all the tracks from an album. We can do this by getting the list of tracks from an invoice and comparing it to the list of tracks from an album. We can find the album to compare the purchase to by looking up the album that one of the purchased tracks belongs to. It doesn't matter which track we pick, since if it's an album purchase, that album will be the same for all tracks.
%%sql
/*# This will serve as a reference table. */
/*# It has every every invoice_id with the album_id from one track in the order. */
CREATE VIEW album_per_invoice AS
SELECT il.invoice_id,
t.album_id
FROM invoice_line il
LEFT JOIN track t ON t.track_id = il.track_id
GROUP BY il.invoice_id;
* sqlite:///chinook.db Done.
[]
%%sql
SELECT * FROM album_per_invoice LIMIT 5;
* sqlite:///chinook.db Done.
invoice_id | album_id |
---|---|
1 | 91 |
2 | 322 |
3 | 203 |
4 | 314 |
5 | 163 |
We are going to take our above album_per_invoice
view and append a new column that tells us whether the customer bought an entire album or just individual tracks. In order to do this we are going to look at the tracks purchased on every invoice_id
(tracks from the invoice_line
table) and compare that to all the tracks an that particular album (tracks from the track
table).
%%sql
/*# Appending new column with the label 'Full Album' or 'Individual Tracks' */
CREATE VIEW invoice_purchase_type AS
SELECT ai.invoice_id,
CASE
WHEN (SELECT il.track_id
FROM invoice_line il
WHERE il.invoice_id = ai.invoice_id
EXCEPT
SELECT t.track_id
FROM track t
WHERE t.album_id = ai.album_id) IS NULL
AND (SELECT t.track_id
FROM track t
WHERE t.album_id = ai.album_id
EXCEPT
SELECT il.track_id
FROM invoice_line il
WHERE il.invoice_id = ai.invoice_id) IS NULL
THEN "Full Album"
ELSE "Individual Tracks"
END AS purchase_type
FROM album_per_invoice ai;
* sqlite:///chinook.db Done.
[]
%%sql
SELECT * FROM invoice_purchase_type LIMIT 5;
* sqlite:///chinook.db Done.
invoice_id | purchase_type |
---|---|
1 | Full Album |
2 | Individual Tracks |
3 | Individual Tracks |
4 | Individual Tracks |
5 | Full Album |
%%sql
SELECT purchase_type,
COUNT(invoice_id) AS total,
ROUND(COUNT(invoice_id) * 100.0 / (SELECT COUNT(*) FROM invoice_purchase_type), 2) AS percentage
FROM invoice_purchase_type
GROUP BY purchase_type;
* sqlite:///chinook.db Done.
purchase_type | total | percentage |
---|---|---|
Full Album | 114 | 18.57 |
Individual Tracks | 500 | 81.43 |
# Create variable to pass into pd.read_sql
query_album = """SELECT purchase_type,
COUNT(invoice_id) AS total,
ROUND(COUNT(invoice_id) * 100.0 / (SELECT COUNT(*) FROM invoice_purchase_type), 2) AS percentage
FROM invoice_purchase_type
GROUP BY purchase_type;"""
# Connect to database
conn = sqlite3.connect('chinook.db')
# Read into dataframe
album = pd.read_sql_query(query_album, conn)
# Transform dataframe and cast it to a list
# album = album.T
album = album.set_index('purchase_type')
album = album.iloc[[0,1],0]
album_list = album.tolist()
# Plot
plt.pie(album_list,
labels=album.index,
explode=(0,0.03),
startangle=90,
autopct='%1.1f%%',
textprops={'fontsize': 3})
# Plot Aesthetics
plt.tight_layout()
plt.title('Album v. Individual Tracks', fontsize=4)
fig = plt.gcf()
fig.set_size_inches(2,2)
Based on this we would recommend to managment that they move forward with their plan to start purchasing the most popular tracks from an album instead of the entire album. Over 80% of customers would likely not be impacted by this change. At the same time the company would be reducing its purchasing expenses.
The impact to album-buying cusomters could be mitigated by keeping an inventory of albums from popular artists, genres, or record labels only.
Each customer for the Chinook store gets assigned to a sales support agent. We will analyze the purchases of customers belonging to each employee to see if any sales support agent is performing either better or worse than the others.
%%sql
/*# Display employee sales data as well as hire date */
WITH employee_sales AS
(
SELECT c.support_rep_id,
ROUND(SUM(i.total), 2) AS rep_total,
COUNT(c.customer_id) total_customers
FROM invoice AS i
INNER JOIN customer AS c ON c.customer_id=i.customer_id
GROUP BY c.support_rep_id
)
SELECT e.first_name||" "||e.last_name AS employee_name,
es.rep_total,
e.hire_date
FROM employee_sales AS es
INNER JOIN employee AS e ON e.employee_id=es.support_rep_id;
* sqlite:///chinook.db Done.
employee_name | rep_total | hire_date |
---|---|---|
Jane Peacock | 1731.51 | 2017-04-01 00:00:00 |
Margaret Park | 1584.0 | 2017-05-03 00:00:00 |
Steve Johnson | 1393.92 | 2017-10-17 00:00:00 |
# Create variable to pass into pd.read_sql
query_employee = """WITH employee_sales AS
(
SELECT c.support_rep_id,
ROUND(SUM(i.total), 2) AS rep_total,
COUNT(c.customer_id) total_customers
FROM invoice AS i
INNER JOIN customer AS c ON c.customer_id=i.customer_id
GROUP BY c.support_rep_id
)
SELECT e.first_name||" "||e.last_name AS employee_name,
es.rep_total,
e.hire_date
FROM employee_sales AS es
INNER JOIN employee AS e ON e.employee_id=es.support_rep_id;"""
# Connect to database
conn = sqlite3.connect('chinook.db')
# Read into dataframe
employee = pd.read_sql_query(query_employee, conn)
# Plot
ax = sns.barplot(x='employee_name', y='rep_total', data=employee, color="steelblue")
# Plot Aesthetics
ax.set_title('Employee Sales', fontsize=12)
ax.set_ylabel('')
ax.set_xlabel('')
ax.tick_params(labelsize=7)
ax.set_ylim([1100,1800])
plt.xticks(rotation = 45, ha='right')
sns.despine(left=True)
sns.set(style="whitegrid")
plt.figure(figsize=(4, 3))
<Figure size 1840x1380 with 0 Axes>
<Figure size 1840x1380 with 0 Axes>
Let's see how many customers each sales rep has:
%%sql
SELECT e.first_name||" "||e.last_name AS employee_name,
COUNT(c.customer_id) AS total_customers
FROM customer AS c
INNER JOIN employee AS e ON e.employee_id=c.support_rep_id
GROUP BY employee_name;
* sqlite:///chinook.db Done.
employee_name | total_customers |
---|---|
Jane Peacock | 21 |
Margaret Park | 20 |
Steve Johnson | 18 |