#!/usr/bin/env python # coding: utf-8 # ### Advanced SQLAlchemy Queries # In[1]: # import # ### Using MySql # ``` # Import the create_engine function from the sqlalchemy library. # Create an engine to the census database by concatenating the following strings and passing them to create_engine(): # 'mysql+pymysql://' # 'student:datacamp' # '@courses.csrrinzqubik.us-east-1.rds.amazonaws.com' # ':3306/census' # Use the table_names() method on engine to print the table names. # ``` # In[2]: # # Import create_engine function # from sqlalchemy import create_engine # # Create an engine to the census database # engine = create_engine('mysql+pymysql://student:datacamp@courses.csrrinzqubik.us-east-1.rds.amazonaws.com:3306/census') # # Use the `table_names()` method on the engine to print the table names # print(engine.table_names()) # ``` # Define a select statement to return i) the state and ii) the difference in population count between 2008 and 2000 labeled as pop_change. Store the statement as stmt. The state column is given by census.columns.state and the 2008 population count column by census.columns.pop2008. # Use the group_by() method on stmt to group by the state. Do so by passing it census.columns.state. # Use the group_by() method on stmt to order the population changes ('pop_change') in descending order. Do so by passing it desc('pop_change'). # Use the limit() method to return only 5 records. Do so by passing it the desired number of records. # Use the connection to execute stmt and fetch all the records store as results. # The print statement has already been written for you. Hit 'Submit Answer' to view the results! # ``` # In[3]: # # Build query to return state names by population difference from 2008 to 2000: stmt # stmt = select([census.columns.state, (census.columns.pop2008-census.columns.pop2000).label('pop_change')]) # # Append group by for the state: stmt # stmt = stmt.group_by(census.columns.state) # # Append order by for pop_change descendingly: stmt # stmt = stmt.order_by(desc('pop_change')) # # Return only 5 results: stmt # stmt = stmt.limit(5) # # Use connection to execute the statement and fetch all results # results = connection.execute(stmt).fetchall() # # Print the state and population change for each record # for result in results: # print('{}-{}'.format(result.state, result.pop_change)) # ``` # Import case, cast, and Float from sqlalchemy. # Build an expression female_pop2000to calculate female population in 2000. To achieve this: # Use case() inside func.sum() # Make the first argument of case() a list containing a tuple of i) a boolean checking that census.columns.sex is equal to 'F' and ii) the column census.columns.pop2000. # Use cast() to cast an expression to calculate total population in 2000 to Float. # Build a query to calculate the percentage of females in 2000. # Execute the query by passing stmt to connection.execute(). Apply the scalar() method to it and store the result as percent_female. # Print percent_female. # ``` # In[4]: # # import case, cast and Float from sqlalchemy # from sqlalchemy import case, cast, Float # # Build an expression to calculatae female population in 2000 # female_pop2000 = func.sum( # case([ # (census.columns.sex == 'F', census.columns.pop2000) # ], else_=0)) # # Cast an expression to calculate total population in 2000 to Float # total_pop2000 = cast(func.sum(census.columns.pop2000), Float) # # Build a query to calculate the percentage of females in 2000: stmt # stmt = select([female_pop2000 / total_pop2000* 100]) # # Execute the query and store the scalar result: percent_female # percent_female = connection.execute(stmt).scalar() # # Print the percentage # print(percent_female) # ``` # Build a statement to join the census and state_fact tables and select the pop2000 column from the first and the abbreviation column from the second. # Execute the statement to get the first result and save it as result. # Hit submit to loop over the keys of the result object, and print the key and value for each! # ``` # In[5]: # # Build a statement to join census and state_fact tables: stmt # stmt = select([census.columns.pop2000, state_fact.columns.abbreviation]) # # Execute the statement and get the first result: result # result = connection.execute(stmt).first() # # Loop over the keys in the result object and print the key and value # for key in result.keys(): # print(key, getattr(result, key)) # ```Build a statement to select ALL the columns from the census and state_fact tables. To select ALL the columns from two tables employees and sales, for example, you would use stmt = select([employees, sales]). # Append a select_from to stmt to join the census table to the state_fact table by the state column in census and the name column in the state_fact table. # Execute the statement to get the first result and save it as result. This code is alrady written. # Hit submit to loop over the keys of the result object, and print the key and value for each!``` # In[7]: # # Build a statement to select the census and state_fact tables: stmt # stmt = select([census, state_fact]) # # Add a select_from clause that wraps a join for the census and state_fact # # tables where the census state column and state_fact name column match # stmt = stmt.select_from( # census.join(state_fact, census.columns.state == state_fact.columns.name)) # # Execute the statement and get the first result: result # result = connection.execute(stmt).first() # # Loop over the keys in the result object and print the key and value # for key in result.keys(): # print(key, getattr(result, key)) # ```Build a statement to select from the census table the following: # the state column, # the sum of the pop2008 column and # the census_division_name column. # Append a select_from() to stmt in order to join the census and state_fact tables by the state and name columns. # Append a group_by to stmt in order to group by the name column from the state_fact table. # Execute the statement to get all the records and save it as results. # Hit submit to loop over the results object and print each record. # ``` # In[8]: # # Build a statement to select the state, sum of 2008 population and census # # division name: stmt # stmt = select([ # census.columns.state, # func.sum(census.columns.pop2008), # state_fact.columns.census_division_name # ]) # # Append select_from to join the census and state_fact tables by the census state and state_fact name columns # stmt = stmt.select_from( # census.join(state_fact, census.columns.state == state_fact.columns.name) # ) # # Append a group by for the state_fact name column # stmt = stmt.group_by(state_fact.columns.name) # # Execute the statement and get the results: results # results = connection.execute(stmt).fetchall() # # Loop over the the results object and print each record. # for record in results: # print(record) # ``` # Save an alias of the employees table as managers. To do so, apply the method alias() to employees. # Build a query to select the employee name and their manager's name. You can use label to label the name column of employees as 'employee'. # Append a where clause to stmt to match where the mgr column of the employees table corresponds to the id column of the managers table. # Append an order by clause to stmt so that it is ordered by the name column of the managers table. # Execute the statement and store all the results. This code is already written. Hit submit to print the names of the managers and all their employees. # ``` # In[9]: # # Make an alias of the employees table: managers # managers = employees.alias() # # Build a query to select manager's and their employees names: stmt # stmt = select( # [managers.columns.name.label('manager'), # employees.columns.name.label('employee')] # ) # # Append where to match manager ids with employees managers: stmt # stmt = stmt.where(managers.columns.id==employees.columns.mgr) # # Append order by managers name: stmt # stmt = stmt.order_by(managers.columns.name) # # Execute statement: results # results = connection.execute(satmt).fetchall() # # Print records # for record in results: # print(record) # ``` # Save an alias of the employees table as managers. # Build a query to select the manager's name and the count of the number of their employees. The function func.count() has been imported and will be useful! # Append a where clause that filters for records where the manager id and employee mgr are equal. # Use a group_by() clause to group the query by the name column of the managers table. # Execute the statement and store all the results. Print the names of the managers and their employees. This code has already been written so hit submit and check out the results! # ``` # In[10]: # # Make an alias of the employees table: managers # managers = employees.alias() # # Build a query to select managers and counts of their employees: stmt # stmt = select([managers.columns.name, func.count(employees.columns.id)]) # # Append a where clause that ensures the manager id and employee mgr are equal # stmt = stmt.where(managers.columns.id==employees.columns.mgr) # # Group by Managers Name # stmt = stmt.group_by(managers.columns.name) # # Execute statement: results # results = connection.execute(stmt).fetchall() # # print manager # for record in results: # print(record)a # ``` # Use a while loop that checks if there are more_results. # Inside the loop, apply the method fetchmany() to results_proxy to get 50 records at a time and store those records as partial_results. # After fetching the records, if partial_results is an empty list (that is, if it is equal to []), set more_results to False. # Loop over the partial_results and, if row.state is a key in the state_count dictionary, increment state_count[row.state] by 1; otherwise set state_count[row.state] to 1. # After the while loop, close the ResultProxy results_proxy. # Hit 'Submit' to print state_count. # ``` # In[ ]: # # Start a while loop checking for more results # while more_results: # # Fetch the first 50 results from the ResultProxy: partial_results # partial_results = results_proxy.fetchmany(50) # # if empty list, set more_results to False # if partial_results == []: # more_results = False # # Loop over the fetched records and increment the count for the state: state_count # for row in partial_results: # if row.state in state_count: # state_count[row.state] = state_count[row.state]+1 # else: # state_count[row.state] =1 # # Close the ResultProxy, and thus the connection # results_proxy.close() # # Print the count by state # print(state_count)