#!/usr/bin/env python # coding: utf-8 # # Answering Business Questions Using SQL # In this project We'll be working with a modified version of a database called Chinook. The Chinook database is provided as a SQLite database file called chinook.db # # Objective # The aim of this project analysis is to get a approximate answer or understanding of the following questions or points: # # # To achive that we are using analysis techniques applied to sql, as we said the database has SQLite engine. # # Conclusion # After the analysis we conclude that: # # # Database Info # The Chinook database is provided as a SQLite database file called chinook.db . The Chinook database contains information about a fictional digital music shop - kind of like a mini-iTunes store. # # The Chinook database contains information about the artists, songs, and albums from the music shop, as well as information on the shop's employees, customers, and the customers purchases. This information is contained in eleven tables. You can find more about Chinook here. # # # ### Relational Model # ![title](db_relational_model.svg) # ### Connection # In[4]: get_ipython().run_cell_magic('capture', '', '%load_ext sql\n%sql sqlite:///chinook.db\n') # ### Table Overview # In[2]: get_ipython().run_cell_magic('sql', '', 'SELECT\n name,\n type\nFROM sqlite_master\nWHERE type IN ("table","view");\n') # # Top 3 Genres sold in USA # In[14]: get_ipython().run_cell_magic('sql', '', "SELECT \n il.invoice_id, \n g.name AS genre\n FROM invoice_line il\n INNER JOIN track t ON il.track_id = t.track_id\n INNER JOIN genre g ON t.genre_id = g.genre_id\n WHERE \n genre LIKE '%Hip%' \n OR genre LIKE '%Punk%' \n OR genre LIKE '%Pop%' \n OR genre LIKE '%Blues%';\n") # In[28]: get_ipython().run_cell_magic('sql', '', "WITH \n invoice_usa AS \n (\n SELECT\n c.country,\n i.invoice_id,\n i.total \n FROM customer c\n INNER JOIN invoice i on c.customer_id = i.customer_id\n WHERE c.country = 'USA'\n ),\n top_genres AS \n ( \n SELECT \n il.invoice_id, \n g.name AS genre\n FROM invoice_line il\n INNER JOIN track t ON il.track_id = t.track_id\n INNER JOIN genre g ON t.genre_id = g.genre_id\n WHERE \n genre LIKE '%Hip%' \n OR genre LIKE '%Punk%' \n OR genre LIKE '%Pop%' \n OR genre LIKE '%Blues%'\n )\n \n\nSELECT \n tg.genre AS Genre,\n ROUND(SUM(iu.total), 0) Tracks_Sold,\n ROUND((SUM(iu.total) / (SELECT SUM(total) FROM invoice_usa)), 2) AS Percentage,\n iu.country AS Country\n FROM invoice_usa iu\n INNER JOIN top_genres tg ON iu.invoice_id = tg.invoice_id\nGROUP BY 1\nORDER BY 2 DESC\n \n\n \n") # # Employee Sales Performance # Each customer for the Chinook store gets assigned to a sales support agent within the company when they first make a purchase, let us analyze the purchases of customers belonging to each employee to see if any sales support agent is performing either better or worse than the others. # In[4]: get_ipython().run_cell_magic('sql', '', 'SELECT * \n FROM employee\nLIMIT 3;\n') # In[36]: get_ipython().run_cell_magic('sql', '', 'WITH \n sales_employees AS\n (\n SELECT \n e.employee_id,\n e.first_name || " " || e.last_name AS employee_name,\n e.title,\n e.hire_date, \n e.reports_to\n FROM employee e\n WHERE e.title = \'Sales Support Agent\'\n ),\n sales_performance AS\n (\n SELECT \n se.employee_id,\n COUNT(c.customer_id) AS customer_number,\n COUNT(DISTINCT(c.country)) AS number_countries,\n ROUND(SUM(i.total), 2) AS total_sales\n FROM customer c\n INNER JOIN invoice i ON i.customer_id = c.customer_id\n INNER JOIN sales_employees se ON se.employee_id = c.support_rep_id\n GROUP BY 1\n ORDER BY 3 DESC\n ),\n sales_managers AS\n (\n SELECT \n e.employee_id,\n e.first_name || " " || e.last_name AS employee_name,\n e.title,\n e.hire_date, \n e.reports_to\n FROM employee e\n WHERE e.title = \'Sales Manager\'\n )\n \nSELECT \n se.employee_name,\n se.title,\n se.hire_date,\n sm.employee_name AS manager,\n sp.total_sales,\n ROUND((sp.total_sales / (SELECT SUM(total_sales) FROM sales_performance)),2) \'sales_%\', \n sp.customer_number,\n sp.number_countries\n FROM sales_performance sp\n INNER JOIN sales_employees se ON se.employee_id = sp.employee_id\n INNER JOIN sales_managers sm ON sm.employee_id = se.reports_to\nORDER BY 5 DESC, 6 DESC;\n') # **Jane Peacock** has the maximum total sales but we should take into consideration that he is the oldest one on the department. **Margaret Park** has an extra number of **customer & country** also **Steve Johnson the newest** one have customers more countries. # Another observation is that the **Top 3 Sales support agents** are in the same team or at least report to **Nancy Edwards** whose job to keep them on track and in a healthy competitive environment may be essential. # # Sales Performance By Country # We are going to make sales analysis for customers from each different country (countries with only one customer, will be grouped in "Other" category). The analysis will include the following points: # # In[52]: get_ipython().run_cell_magic('sql', '', "\nWITH \n customers_info AS\n (\n SELECT\n c.customer_id, \n c.country,\n i.total,\n i.invoice_id\n FROM customer c\n INNER JOIN invoice i ON i.customer_id = c.customer_id\n GROUP BY 1\n ORDER BY 1 \n ),\n country_sales AS\n (\n SELECT\n COUNT(ci.customer_id) AS number_of_customers,\n COUNT(DISTINCT(ci.invoice_id)) AS number_of_orders,\n ROUND(SUM(ci.total), 2) AS total_sales,\n CASE \n WHEN COUNT(DISTINCT(ci.customer_id)) > 1\n THEN ci.country\n ELSE 'Other'\n END AS country\n FROM customers_info ci\n GROUP BY ci.country\n ORDER BY 1 DESC\n \n ),\n country_sales_summary AS\n (\n SELECT \n cs.country,\n SUM(cs.number_of_customers) AS total_customers,\n SUM(cs.number_of_orders) AS total_orders,\n SUM(cs.total_sales) AS total_sales,\n CASE\n WHEN cs.country = 'Other' THEN 1\n ELSE 0\n END AS sort\n FROM country_sales cs\n GROUP BY 1\n ORDER BY sort ASC, total_sales DESC\n )\n\n\nSELECT \n country,\n total_customers,\n total_orders,\n ROUND(total_sales, 2) AS total_sales,\n ROUND((total_sales / total_customers), 2) AS average_sales_per_customer,\n ROUND((total_sales / total_orders), 2) AS average_order_value\nFROM country_sales_summary;\n\n") # # # Album vs Individual Tracks # In[29]: get_ipython().run_cell_magic('sql', '', "WITH \n invoice_info AS\n (\n SELECT \n t.album_id, \n il.track_id, \n il.invoice_id\n FROM track t\n INNER JOIN invoice_line il ON il.track_id = t.track_id\n GROUP BY 3\n )\n\nSELECT \n whole_album,\n count(distinct invoice_id) number_invoices, \n cast(count(distinct invoice_id) as Float)/\n (SELECT count(distinct invoice_id) FROM invoice) perc_inv\nFROM \n (\n SELECT inv.*, \n CASE\n WHEN \n (\n SELECT t2.track_id /* whole album */\n FROM track t2\n WHERE t2.album_id = inv.album_id\n \n EXCEPT\n \n SELECT il2.track_id /* only purchased tracks */\n FROM invoice_line il2\n WHERE il2.invoice_id = inv.invoice_id\n ) IS NULL\n \n AND\n \n (\n SELECT il2.track_id\n FROM invoice_line il2\n WHERE il2.invoice_id = inv.invoice_id\n \n EXCEPT\n \n SELECT t2.track_id\n FROM track t2\n WHERE t2.album_id = inv.album_id \n ) IS NULL\n THEN 'Yes'\n ELSE 'No'\n END AS whole_album\n FROM invoice_info AS inv\n )\nGROUP BY whole_album\n \n") # # Top Artist in playlists # In[16]: get_ipython().run_cell_magic('sql', '', '\nWITH \n play_lists_tracks AS \n (\n SELECT\n pl.playlist_id,\n pl.track_id,\n t.album_id,\n al.artist_id\n FROM playlist_track pl\n INNER JOIN track t ON t.track_id = pl.track_id\n INNER JOIN album al ON al.album_id = t.album_id\n ORDER BY al.artist_id \n )\n\n \nSELECT \n ar.artist_id AS ARTIST_ID, \n ar.name AS ARTIST_NAME,\n COUNT(plt.track_id) AS PLAYLIST_APPEARANCE\n FROM artist ar\n INNER JOIN play_lists_tracks plt ON plt.artist_id = ar.artist_id \nGROUP BY ar.artist_id\nORDER BY 3 DESC\nLIMIT 3;\n') # The artist that most appeared in playlistst is **Iron Maiden** followed by **U2 and Metallica**. # # Tracks Purchased VS Not Purchased # In[20]: get_ipython().run_cell_magic('sql', '', '\nWITH\n purchase_info AS\n (\n SELECT\n t.track_id,\n CASE\n WHEN \n (\n t.track_id IN (SELECT il.track_id FROM invoice_line il) \n ) \n THEN "Yes"\n ELSE "No" \n END AS purchased \n FROM track t\n )\nSELECT \n pi.purchased,\n COUNT(pi.track_id) AS number_of_tracks,\n ROUND((CAST(COUNT(pi.track_id) AS FLOAT) * 100) / (SELECT COUNT(*) FROM track t), 2) AS percentage\n FROM purchase_info pi\nGROUP BY 1\n') # Only the **51.56** of the tracks have been purchased. # # Is the range of tracks in the store reflective of their sales popularity? # For each genre we will extract the total percentage of tracks in store and the percentage sold. # In[65]: get_ipython().run_cell_magic('sql', '', 'WITH \n number_of_tracks_in_store AS \n (\n SELECT COUNT(t.track_id) AS total\n FROM track t\n ), \n number_of_tracks_sold AS\n (\n SELECT COUNT(il.track_id) AS total\n FROM invoice_line il\n ), \n store_tracks AS\n (\n SELECT \n t.genre_id,\n g.name AS genre_name,\n COUNT(t.track_id) total_tracks_in_store,\n ROUND(CAST(COUNT(t.track_id)AS FLOAT)*100/(SELECT COUNT(t.track_id)FROM track t),2) AS percentage_of_tracks_in_store \n FROM track t\n INNER JOIN genre g ON g.genre_id =t.genre_id\n GROUP BY 2 \n ),\n store_tracks_purchased AS\n (\n SELECT \n t.genre_id,\n g.name AS genre_name,\n COUNT(il.track_id) total_tracks_sold,\n ROUND(CAST(COUNT(il.track_id)AS FLOAT)*100/(SELECT COUNT(il.track_id)FROM invoice_line il),2) AS percentage_of_tracks_sold\n FROM invoice_line il \n INNER JOIN track t ON il.track_id =t.track_id\n INNER JOIN genre g ON g.genre_id = t.genre_id\n GROUP BY 2 \n )\n \n \nSELECT \n st.genre_name AS genre,\n st.percentage_of_tracks_in_store,\n stp.percentage_of_tracks_sold\n FROM store_tracks st\n INNER JOIN store_tracks_purchased stp ON stp.genre_id = st.genre_id\nORDER BY 3 DESC\n \n') # Firstly, we categorised the tracks by their genre to represent the range of tracks that are available. From the results, we can see that the top 4 genres that have the highest percentage of tracks available are also the same 4 genres that have the highest percentages of sales. These genres are Rock, Metal, Alternative & Punk and Latin. While the rest of the range of tracks in store are pretty reflective of their sales popularity, there are a couple of genres that we should take note of to reduce loss to the Chinook Store. # # Do protected vs non-protected media types have an effect on popularity? # For answering this question, first let's see the **media_type** content. # In[5]: get_ipython().run_cell_magic('sql', '', 'SELECT *\nFROM media_type\n') # From the previous table we can group the media types as it follows: # # **Non Protected Files** # # # **Protected Files** # # # In[30]: get_ipython().run_cell_magic('sql', '', 'WITH in_store_track AS\n (\n SELECT COUNT(t.track_id) tracks_in_store,\n t.media_type_id,\n mt.name\n FROM track t\n INNER JOIN media_type mt ON mt.media_type_id = t.media_type_id\n GROUP BY 3\n ),\n track_sold AS\n (\n \n SELECT COUNT(il.track_id) tracks_sold,\n t.media_type_id,\n mt.name\n FROM invoice_line il\n INNER JOIN track t ON t.track_id = il.track_id\n INNER JOIN media_type mt ON mt.media_type_id = t.media_type_id\n GROUP BY 3\n ),\n \n store_vs_sold AS\n (\n SELECT it.name media_type,\n it.tracks_in_store,\n ts.tracks_sold\n FROM in_store_track it\n INNER JOIN track_sold ts On ts.media_type_id = it.media_type_id\n ),\n protected_vs_nonprotected AS\n (\n SELECT SUM(ss.tracks_in_store) tracks_in_store,\n SUM(ss.tracks_sold) tracks_sold,\n CASE \n WHEN\n ( \n ss.media_type like "%Protected%"\n )THEN "Protected"\n ELSE "Non_protected"\n END AS Media_type\n FROM store_vs_sold ss\n GROUP BY 3\n )\nSELECT pn.Media_type,\n pn.tracks_in_store,\n ROUND(CAST(pn.tracks_in_store AS FLOAT)*100/(SELECT COUNT(t.track_id) FROM track t),2) PERCENTAGE_TRACKS_in_STORE,\n pn.tracks_sold,\n ROUND(CAST(pn.tracks_sold AS FLOAT)*100/(SELECT COUNT(il.track_id) FROM invoice_line il),2) PERCENTAGE_TRACKS_SOLD\n \nFROM protected_vs_nonprotected pn\n') # From the previous results table, non-protected media type is the most popular in sales with a percentage of 87.13%. # # Conclusion # After the analysis we conclude that: #