#!/usr/bin/env python # coding: utf-8 # # INTRODUCTION # In this project, we'll use SQL in Jupyter Notebook to explore and analyze data from the [CIA World Factbook](https://www.cia.gov/library/publications/the-world-factbook/), a compendium of statistics about all of the countries on Earth. The Factbook contains demographic information like: # - population - The population as of 2015. # - population_growth - The annual population growth rate, as a percentage. # - area - The total land and water area. # In[1]: get_ipython().run_cell_magic('capture', '', '%load_ext sql\n%sql sqlite:///factbook.db\n') # ## Overview of the data # We'll begin by getting a sense of what the data looks like. # In[2]: get_ipython().run_cell_magic('sql', '', 'SELECT *\n FROM facts\n LIMIT 5;\n') # Here are the descriptions for some of the columns: # - name - The name of the country. # - name - The name of the country. # - area - The total land and sea area of the country. # - population - The country's population. # - population_growth- The country's population growth as a percentage. # - birth_rate - The country's birth rate, or the number of births a year per 1,000 people. # - death_rate - The country's death rate, or the number of death a year per 1,000 people. # - area- The country's total area (both land and water). # - area_land - The country's land area in [square kilometers](https://www.cia.gov/library/publications/the-world-factbook/rankorder/2147rank.html). # - area_water - The country's waterarea in square kilometers. # Let's start by calculating some summary statistics and see what they tell us # ## Summary Statistics # In[3]: get_ipython().run_cell_magic('sql', '', 'SELECT MIN(population),\n MAX(population),\n MIN(population_growth),\n MAX(population_growth)\n FROM facts;\n \n') # A few things stick out from the summary statistics in the last screen: # - There's a country with a population of 0 # - There's a country with a population of 7256490011 (or more than 7.2 billion people) # Let's use subqueries to zoom in on just these countries without using the specific values. # # Exploring Outliers # In[4]: get_ipython().run_cell_magic('sql', '', 'SELECT *\n FROM facts\n WHERE population ==\n (SELECT MIN(population) \n FROM facts);\n') # It seems like the table contains a row for Antarctica, which explains the population of 0. This seems to match the CIA Factbook [page for Antarctica:](https://www.cia.gov/library/publications/the-world-factbook/geos/ay.html) # ](https://s3.amazonaws.com/dq-content/257/fb_antarctica.png) # In[5]: get_ipython().run_cell_magic('sql', '', 'SELECT *\n FROM facts\n WHERE population ==\n (SELECT MAX(population) \n FROM facts);\n') # We also see that the table contains a row for the whole world, which explains the maximum population of over 7.2 billion we found earlier. # Now that we know this, we should recalculate the summary statistics we calculated earlier, while excluding the row for the whole world. # # Summary Statistics Revisited # In[6]: get_ipython().run_cell_magic('sql', '', 'SELECT MIN(population),\n MAX(population),\n MIN(population_growth),\n MAX(population_growth)\n FROM facts\nWHERE name!="World"\n \n') # There's a country whose population closes in on 1.4 billion! # # Exploring Average Population and Area # Let's explore density. Density depends on the population and the country's area. Let's look at the average values for these two columns. # We should take care of discarding the row for the whole planet. # In[7]: get_ipython().run_cell_magic('sql', '', 'SELECT AVG(population) AS AVG_POPULATION,\n AVG(area) AS AVG_AREA\n FROM facts\nWHERE name!="World"\n \n') # We see that the average population is around 32 million and the average area is 555 thousand square kilometers. # # Finding Densely Populated Countries # To finish, we'll build on the query above to find countries that are densely populated. We'll identify countries that have: # - Above average values for population. # - Below average values for area. # In[8]: get_ipython().run_cell_magic('sql', '', 'SELECT *\n FROM facts\n WHERE population > (SELECT AVG(population) \n FROM facts) \n AND area < (SELECT AVG(area) \n FROM facts);\n \n') # Some of these countries are generally known to be densely populated, so we have confidence in our results! # # Finding country with the Most population and Highest growth rate # In[9]: get_ipython().run_cell_magic('sql', '', 'SELECT name,MAX(population) AS Highest_population\n FROM facts\n WHERE name <> "World"\n\n \n') # From the analysis we can see that ***China*** has the highest population with over ***1.3 billion people*** # In[10]: get_ipython().run_cell_magic('sql', '', 'SELECT name,MAX(population_growth) AS highest_growth_rate\n FROM facts\n WHERE name <> "World"\n') # *** South Sudan*** has the highest growth rate # # Finding countries with # - the highest water to land mass ratio # - more water than land # In[11]: get_ipython().run_cell_magic('sql', '', 'SELECT *, CAST(area_water AS FLOAT)/CAST(area_land AS FLOAT)AS water_to_land_ratio\n \n FROM facts\n WHERE name <> "World" \n ORDER BY water_to_land_ratio DESC\n LIMIT 5;\n') # From the above, ***British indian ocean*** has the highest water to land mass ratio # In[12]: get_ipython().run_cell_magic('sql', '', 'SELECT *, CAST(area_water AS FLOAT) > CAST(area_land AS FLOAT) AS water_greaterThan_land\n \n FROM facts\n WHERE name <> "World" \n ORDER BY water_greaterThan_land DESC\n LIMIT 5;\n') # ***British indian ocean*** has more water than land # #### FINDING THE COUNTRIES THAT WOULD ADD MOST PEOPLE TO # # #### THEIR COUNTRY NEXT YEAR # In[13]: get_ipython().run_cell_magic('sql', '', 'SELECT name AS country,population,CAST(population_growth*population AS FLOAT) AS NEXTYR_POP \n FROM facts\n WHERE name <> "World" \n ORDER BY NEXTYR_POP DESC\n LIMIT 7;\n') # ***India*** will add more people to their population next year # #### COUNTRIES WITH HIGHER DEATH RATE THAN BIRTHRATE # In[14]: get_ipython().run_cell_magic('sql', '', 'SELECT *, CAST(death_rate AS FLOAT)/CAST(birth_rate AS FLOAT) AS death_to_birthrate\n \n FROM facts\n WHERE name <> "World" \n ORDER BY death_to_birthrate DESC\n LIMIT 5;\n') # ***Bulgaria*** has the highest death rate than birth rate # ### countries with the highest population to area ratio # In[15]: get_ipython().run_cell_magic('sql', '', 'SELECT name AS country,population,CAST(population/area AS FLOAT) AS population_to_area \n FROM facts\n WHERE name <> "World" \n ORDER BY population_to_area DESC\n LIMIT 7;\n') # ***Macau*** has the highest death rate than birth rate # In[ ]: