본 장은 "따라 하며 배우는 데이터 과학" 3장의 "데이터 취득과 데이터 가공: SQL과 dplyr" 내용의 파이썬 버전입니다.
%matplotlib inline
import pandas as pd
import numpy as np
# gapminder 자료를 다운로드하고 판다스 데이터프레임으로 읽어들이자
gapminder = pd.read_csv("data/gapminder.tsv", sep="\t")
gapminder.head()
country | continent | year | lifeExp | pop | gdpPercap | |
---|---|---|---|---|---|---|
0 | Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.445314 |
1 | Afghanistan | Asia | 1957 | 30.332 | 9240934 | 820.853030 |
2 | Afghanistan | Asia | 1962 | 31.997 | 10267083 | 853.100710 |
3 | Afghanistan | Asia | 1967 | 34.020 | 11537966 | 836.197138 |
4 | Afghanistan | Asia | 1972 | 36.088 | 13079460 | 739.981106 |
gapminder.tail()
country | continent | year | lifeExp | pop | gdpPercap | |
---|---|---|---|---|---|---|
1699 | Zimbabwe | Africa | 1987 | 62.351 | 9216418 | 706.157306 |
1700 | Zimbabwe | Africa | 1992 | 60.377 | 10704340 | 693.420786 |
1701 | Zimbabwe | Africa | 1997 | 46.809 | 11404948 | 792.449960 |
1702 | Zimbabwe | Africa | 2002 | 39.989 | 11926563 | 672.038623 |
1703 | Zimbabwe | Africa | 2007 | 43.487 | 12311143 | 469.709298 |
gapminder.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1704 entries, 0 to 1703 Data columns (total 6 columns): country 1704 non-null object continent 1704 non-null object year 1704 non-null int64 lifeExp 1704 non-null float64 pop 1704 non-null int64 gdpPercap 1704 non-null float64 dtypes: float64(2), int64(2), object(2) memory usage: 79.9+ KB
gapminder.describe()
year | lifeExp | pop | gdpPercap | |
---|---|---|---|---|
count | 1704.00000 | 1704.000000 | 1.704000e+03 | 1704.000000 |
mean | 1979.50000 | 59.474439 | 2.960121e+07 | 7215.327081 |
std | 17.26533 | 12.917107 | 1.061579e+08 | 9857.454543 |
min | 1952.00000 | 23.599000 | 6.001100e+04 | 241.165877 |
25% | 1965.75000 | 48.198000 | 2.793664e+06 | 1202.060309 |
50% | 1979.50000 | 60.712500 | 7.023596e+06 | 3531.846989 |
75% | 1993.25000 | 70.845500 | 1.958522e+07 | 9325.462346 |
max | 2007.00000 | 82.603000 | 1.318683e+09 | 113523.132900 |
# gapminder 데이터에서 한국 데이터, 2007년 데이터, 한국 2007년 데이터를 추출하는 명령은 다음과 같다.
# R dplyr 에서는 filter(gapminder, country=='Korea, Rep.' & year==2007)
gapminder.query("country=='Korea, Rep.' & year==2007")
country | continent | year | lifeExp | pop | gdpPercap | |
---|---|---|---|---|---|---|
851 | Korea, Rep. | Asia | 2007 | 78.623 | 49044790 | 23348.13973 |
# gapminder 데이터를 year, country 변수순으로 정렬하려면,
# R dplyr 에서는 gapminder %>% arrange(year, country)
gapminder.sort_values(by=["year", "country"]).head()
country | continent | year | lifeExp | pop | gdpPercap | |
---|---|---|---|---|---|---|
0 | Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.445314 |
12 | Albania | Europe | 1952 | 55.230 | 1282697 | 1601.056136 |
24 | Algeria | Africa | 1952 | 43.077 | 9279525 | 2449.008185 |
36 | Angola | Africa | 1952 | 30.015 | 4232095 | 3520.610273 |
48 | Argentina | Americas | 1952 | 62.485 | 17876956 | 5911.315053 |
# gapminder 데이터에서 pop, gdpPercap 변수만 선택.
# R dplyr 에서는 gapminder %>% select(pop, gdpPercap)
gapminder[["pop", "gdpPercap"]].head()
pop | gdpPercap | |
---|---|---|
0 | 8425333 | 779.445314 |
1 | 9240934 | 820.853030 |
2 | 10267083 | 853.100710 |
3 | 11537966 | 836.197138 |
4 | 13079460 | 739.981106 |
# gapminder 데이터에서 기존의 변수들을 변환한 결과를 기존 변수나 새 변수에 할당한다.
# R dplyr 에서는
# gapminder %>%
# mutate(total_gdp = pop * gdpPercap,
# le_gdp_ratio = lifeExp / gdpPercap, lgrk = le_gdp_ratio * 100)
# 1. 파이썬에서는 각 변수 할당에 새로운 assign() 함수를 사용해야 한다.
# 2. x.pop 은 내부의 pop() 함수와 충돌을 일으키므로 x['pop']으로 표현했다.
gapminder.\
assign(total_gdp = lambda x: (x['pop'] * x['gdpPercap'])).\
assign(le_gdp_ratio = lambda x: (x['lifeExp'] / x['gdpPercap'])).\
assign(lgrk = lambda x: x['le_gdp_ratio'] * 100).\
head()
country | continent | year | lifeExp | pop | gdpPercap | total_gdp | le_gdp_ratio | lgrk | |
---|---|---|---|---|---|---|---|---|---|
0 | Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.445314 | 6.567086e+09 | 0.036951 | 3.695064 |
1 | Afghanistan | Asia | 1957 | 30.332 | 9240934 | 820.853030 | 7.585449e+09 | 0.036952 | 3.695180 |
2 | Afghanistan | Asia | 1962 | 31.997 | 10267083 | 853.100710 | 8.758856e+09 | 0.037507 | 3.750671 |
3 | Afghanistan | Asia | 1967 | 34.020 | 11537966 | 836.197138 | 9.648014e+09 | 0.040684 | 4.068419 |
4 | Afghanistan | Asia | 1972 | 36.088 | 13079460 | 739.981106 | 9.678553e+09 | 0.048769 | 4.876881 |
# gapminder 데이터에서 기존의 변수들을 변환한 결과를 기존 변수나 새 변수에 할당한다.
# R dplyr 에서는
# gapminder %>% summarize(n_obs = n( ),
# n_countries = n_distinct(country),
# n_years = n_distinct(year),
# med_gdpc = median(gdpPercap),
# max_gdppc = max(gdpPercap))
gapminder.aggregate(['mean', 'median'])
year | lifeExp | pop | gdpPercap | |
---|---|---|---|---|
mean | 1979.5 | 59.474439 | 2.960121e+07 | 7215.327081 |
median | 1979.5 | 60.712500 | 7.023596e+06 | 3531.846989 |
np.random.seed(12345)
gapminder.sample(n=10)
country | continent | year | lifeExp | pop | gdpPercap | |
---|---|---|---|---|---|---|
1522 | Tanzania | Africa | 2002 | 49.651 | 34593779 | 899.074211 |
1553 | Trinidad and Tobago | Americas | 1977 | 68.300 | 1039009 | 7899.554209 |
238 | Cameroon | Africa | 2002 | 49.856 | 15929988 | 1934.011449 |
1085 | Netherlands | Europe | 1977 | 75.240 | 13852989 | 21209.059200 |
634 | Guinea-Bissau | Africa | 2002 | 45.504 | 1332459 | 575.704718 |
1569 | Tunisia | Africa | 1997 | 71.973 | 9231669 | 4876.798614 |
648 | Honduras | Americas | 1952 | 41.912 | 1517453 | 2194.926204 |
1072 | Nepal | Asia | 1972 | 43.971 | 12412593 | 674.788130 |
1617 | United States | Americas | 1997 | 76.810 | 272911760 | 35767.433030 |
953 | Mali | Africa | 1977 | 41.714 | 6491649 | 686.395269 |
gapminder.sample(frac=.01).shape
(17, 6)
# R dplyr 는
# gapminder %>% select(country) %>% distinct()
# gapminder %>% select(year) %>% distinct()
gapminder.country.unique(), gapminder.year.unique()
(array(['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina', 'Australia', 'Austria', 'Bahrain', 'Bangladesh', 'Belgium', 'Benin', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Ethiopia', 'Finland', 'France', 'Gabon', 'Gambia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Haiti', 'Honduras', 'Hong Kong, China', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kenya', 'Korea, Dem. Rep.', 'Korea, Rep.', 'Kuwait', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Madagascar', 'Malawi', 'Malaysia', 'Mali', 'Mauritania', 'Mauritius', 'Mexico', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Norway', 'Oman', 'Pakistan', 'Panama', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto Rico', 'Reunion', 'Romania', 'Rwanda', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Sierra Leone', 'Singapore', 'Slovak Republic', 'Slovenia', 'Somalia', 'South Africa', 'Spain', 'Sri Lanka', 'Sudan', 'Swaziland', 'Sweden', 'Switzerland', 'Syria', 'Taiwan', 'Tanzania', 'Thailand', 'Togo', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Uganda', 'United Kingdom', 'United States', 'Uruguay', 'Venezuela', 'Vietnam', 'West Bank and Gaza', 'Yemen, Rep.', 'Zambia', 'Zimbabwe'], dtype=object), array([1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002, 2007]))
gapminder.drop_duplicates(['country', 'year']).head()
country | continent | year | lifeExp | pop | gdpPercap | |
---|---|---|---|---|---|---|
0 | Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.445314 |
1 | Afghanistan | Asia | 1957 | 30.332 | 9240934 | 820.853030 |
2 | Afghanistan | Asia | 1962 | 31.997 | 10267083 | 853.100710 |
3 | Afghanistan | Asia | 1967 | 34.020 | 11537966 | 836.197138 |
4 | Afghanistan | Asia | 1972 | 36.088 | 13079460 | 739.981106 |
# R dplyr 는
# gapminder %>%
# filter(year == 2007) %>%
# group_by(continent) %>%
# summarize(median(lifeExp))
gapminder.\
query('year == 2007').\
groupby('continent').\
agg({'lifeExp':'median'})
lifeExp | |
---|---|
continent | |
Africa | 52.9265 |
Americas | 72.8990 |
Asia | 72.3960 |
Europe | 78.6085 |
Oceania | 80.7195 |
R dplyr 예는 다음과 같다.
(df1 <- data_frame(x = c(1, 2), y = 2:1))
(df2 <- data_frame(x = c(1, 3), a = 10, b = "a"))
df1 %>% inner_join(df2)
df1 %>% left_join(df2)
df1 %>% right_join(df2)
df1 %>% full_join(df2)
파이썬 판다스에서는 DataFrame.merge
함수로 처리하면 된다.
df1 = pd.DataFrame(data={'x':range(2), 'y':range(2, 0, -1)})
df1
x | y | |
---|---|---|
0 | 0 | 2 |
1 | 1 | 1 |
df2 = pd.DataFrame(data={'x':[1,3], 'a':10, 'b':"a"})
df2
a | b | x | |
---|---|---|---|
0 | 10 | a | 1 |
1 | 10 | a | 3 |
df1.merge(df2, how="inner")
x | y | a | b | |
---|---|---|---|---|
0 | 1 | 1 | 10 | a |
df1.merge(df2, how="left")
x | y | a | b | |
---|---|---|---|---|
0 | 0 | 2 | NaN | NaN |
1 | 1 | 1 | 10.0 | a |
df1.merge(df2, how="inner")
x | y | a | b | |
---|---|---|---|---|
0 | 1 | 1 | 10 | a |
df1.merge(df2, how="right")
x | y | a | b | |
---|---|---|---|---|
0 | 1 | 1.0 | 10 | a |
1 | 3 | NaN | 10 | a |
df1.merge(df2, how="outer")
x | y | a | b | |
---|---|---|---|---|
0 | 0 | 2.0 | NaN | NaN |
1 | 1 | 1.0 | 10.0 | a |
2 | 3 | NaN | 10.0 | a |