# One time one line installation of ipython-sql
!conda install -yc conda-forge ipython-sql
Collecting package metadata (current_repodata.json): ...working... done Solving environment: ...working... done # All requested packages already installed.
In this project we'll work with data from the CIA World Factbook a compendium of statistics about all of the countries on Earth. The Factbook contains demographic information like the following:
For this project we'll be using SQL to conduct a basic analysis of the data in this database.
%%capture
%load_ext sql
%sql sqlite:///factbook.db
%%sql
SELECT *
FROM sqlite_master
WHERE type='table';
* sqlite:///factbook.db Done.
type | name | tbl_name | rootpage | sql |
---|---|---|---|---|
table | sqlite_sequence | sqlite_sequence | 3 | CREATE TABLE sqlite_sequence(name,seq) |
table | facts | facts | 47 | CREATE TABLE "facts" ("id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, "code" varchar(255) NOT NULL, "name" varchar(255) NOT NULL, "area" integer, "area_land" integer, "area_water" integer, "population" integer, "population_growth" float, "birth_rate" float, "death_rate" float, "migration_rate" float) |
Checking the first 5 rows of the facts table
%%sql
SELECT *
FROM facts
LIMIT 5
* sqlite:///factbook.db Done.
id | code | name | area | area_land | area_water | population | population_growth | birth_rate | death_rate | migration_rate |
---|---|---|---|---|---|---|---|---|---|---|
1 | af | Afghanistan | 652230 | 652230 | 0 | 32564342 | 2.32 | 38.57 | 13.89 | 1.51 |
2 | al | Albania | 28748 | 27398 | 1350 | 3029278 | 0.3 | 12.92 | 6.58 | 3.3 |
3 | ag | Algeria | 2381741 | 2381741 | 0 | 39542166 | 1.84 | 23.67 | 4.31 | 0.92 |
4 | an | Andorra | 468 | 468 | 0 | 85580 | 0.12 | 8.13 | 6.96 | 0.0 |
5 | ao | Angola | 1246700 | 1246700 | 0 | 19625353 | 2.78 | 38.78 | 11.49 | 0.46 |
%%sql
SELECT
MIN(population) AS 'Min_Pop',
MAX(population) AS 'Max_Pop',
MIN(population_growth) AS 'Min_Pop_growth',
MAX(population_growth) AS 'Max_Pop_growth',
FROM facts;
* sqlite:///factbook.db (sqlite3.OperationalError) near "FROM": syntax error [SQL: SELECT MIN(population) AS 'Min_Pop', MAX(population) AS 'Max_Pop', MIN(population_growth) AS 'Min_Pop_growth', MAX(population_growth) AS 'Max_Pop_growth', FROM facts;] (Background on this error at: https://sqlalche.me/e/14/e3q8)
%%sql
SELECT
name
FROM facts
WHERE population = (SELECT MIN(population) FROM facts);
* sqlite:///factbook.db Done.
name |
---|
Antarctica |
A row representative of the entire population of the world (at the time this database was prepared) was wrongly entered
%%sql
SELECT
name,
population
FROM facts
WHERE population = (SELECT MAX(population) FROM facts);
* sqlite:///factbook.db Done.
name | population |
---|---|
World | 7256490011 |
Recomputing the summary statistics without the blunder row
%%sql
SELECT
MIN(population) AS 'Min_Pop',
MAX(population) AS 'Max_Pop',
MIN(population_growth) AS 'Min_Pop_growth',
MAX(population_growth) AS 'Max_Pop_growth'
FROM facts
WHERE population != (SELECT MAX(population) FROM facts);
* sqlite:///factbook.db Done.
Min_Pop | Max_Pop | Min_Pop_growth | Max_Pop_growth |
---|---|---|---|
0 | 1367485388 | 0.0 | 4.02 |
Results: These appear to be more realistic
Calculating the average value for population & area
%%sql
SELECT
AVG(population) AS 'Average_pop',
AVG(area) AS 'Average_area'
FROM facts
* sqlite:///factbook.db Done.
Average_pop | Average_area |
---|---|
62094928.32231405 | 555093.546184739 |
I.E - Countries having above average values for population & below average values for area
%%sql
SELECT
name,
population,
area
FROM facts
WHERE (population > (SELECT AVG(population) FROM facts))
AND (area < (SELECT AVG(area) FROM facts));
* sqlite:///factbook.db Done.
name | population | area |
---|---|---|
Bangladesh | 168957745 | 148460 |
Germany | 80854408 | 357022 |
Japan | 126919659 | 377915 |
Philippines | 100998376 | 300000 |
Thailand | 67976405 | 513120 |
United Kingdom | 64088222 | 243610 |
Vietnam | 94348835 | 331210 |
Observation Comment: Most of these countries are part of the developed world
%%sql
SELECT
name,
population
FROM facts
WHERE population < (SELECT MAX(Population) FROM facts) -- To remove the blunder row of the world population
ORDER BY population DESC
LIMIT 1;
* sqlite:///factbook.db Done.
name | population |
---|---|
China | 1367485388 |
%%sql
SELECT
name,
population_growth
FROM facts
ORDER BY population_growth DESC
LIMIT 1;
* sqlite:///factbook.db Done.
name | population_growth |
---|---|
South Sudan | 4.02 |
Answer:
China currently has the most people and the coutry with the highest growth rate is South Sudan.
%%sql
SELECT
name,
CAST(area_water as Float) / CAST(area_land as Float) AS ratio_water_land
FROM facts
WHERE ratio_water_land > 0.5;
* sqlite:///factbook.db Done.
name | ratio_water_land |
---|---|
British Indian Ocean Territory | 905.6666666666666 |
Puerto Rico | 0.5547914317925592 |
Virgin Islands | 4.520231213872832 |
%%sql
SELECT
name,
CAST(area_water as Float) / CAST(area as Float) AS ratio_water
FROM facts
WHERE ratio_water > 0.5;
* sqlite:///factbook.db Done.
name | ratio_water |
---|---|
British Indian Ocean Territory | 0.9988970588235294 |
Virgin Islands | 0.818848167539267 |
Answer:
The BIOT, Puerto Rico and the Virgin Islands have the highest ratio of land to water with the BIOT and the Virgin Islands actually having more water than land in proportion to their total area.
I.E Calculating the effectual population growth for one year based on the countries current population
%%sql
SELECT
name,
ROUND(population_growth * population, 2) AS added_pop
FROM facts
WHERE population != (SELECT MAX(population) FROM facts) --Accounting for the blunder row which has the world population
ORDER BY added_pop DESC
LIMIT 5;
* sqlite:///factbook.db Done.
name | added_pop |
---|---|
India | 1527068612.48 |
China | 615368424.6 |
Nigeria | 444827037.2 |
Pakistan | 290665336.62 |
Ethiopia | 287456216.91 |
Answer:
India will be the country to add the most people to their population in one year
%%sql
SELECT
name,
birth_rate,
death_rate
FROM facts
WHERE death_rate > birth_rate;
* sqlite:///factbook.db Done.
name | birth_rate | death_rate |
---|---|---|
Austria | 9.41 | 9.42 |
Belarus | 10.7 | 13.36 |
Bosnia and Herzegovina | 8.87 | 9.75 |
Bulgaria | 8.92 | 14.44 |
Croatia | 9.45 | 12.18 |
Czech Republic | 9.63 | 10.34 |
Estonia | 10.51 | 12.4 |
Germany | 8.47 | 11.42 |
Greece | 8.66 | 11.09 |
Hungary | 9.16 | 12.73 |
Italy | 8.74 | 10.19 |
Japan | 7.93 | 9.51 |
Latvia | 10.0 | 14.31 |
Lithuania | 10.1 | 14.27 |
Moldova | 12.0 | 12.59 |
Monaco | 6.65 | 9.24 |
Poland | 9.74 | 10.19 |
Portugal | 9.27 | 11.02 |
Romania | 9.14 | 11.9 |
Russia | 11.6 | 13.69 |
Serbia | 9.08 | 13.66 |
Slovenia | 8.42 | 11.37 |
Ukraine | 10.72 | 14.46 |
Saint Pierre and Miquelon | 7.42 | 9.72 |
%%sql
SELECT
name,
population,
area,
CAST(population as Float) / CAST(area as Float) AS pop_area_ratio
FROM facts
ORDER BY pop_area_ratio DESC
LIMIT 5;
* sqlite:///factbook.db Done.
name | population | area | pop_area_ratio |
---|---|---|---|
Macau | 592731 | 28 | 21168.964285714286 |
Monaco | 30535 | 2 | 15267.5 |
Singapore | 5674472 | 697 | 8141.279770444763 |
Hong Kong | 7141106 | 1108 | 6445.041516245487 |
Gaza Strip | 1869055 | 360 | 5191.819444444444 |
Observation Comment
Most of these countries are either part of the developing world or are situated in areas of geopolitical conflict. These results are very different from the list of countries returned for the earlier query of finding above average population densities and this is so as where these utilize absolute values, those past results were relative to the average population and average area of the entire dataset.