Suodatusta ja poimintaa

In [1]:
# Data-analytiikan peruskirjaston tuonti
import pandas as pd

# Datan avaaminen dataframeen
df = pd.read_excel('http://taanila.fi/data1.xlsx')

# Varmistetaan, että koko data näytetään tulosteissa
pd.options.display.max_rows = df.shape[0]
In [2]:
# Ne, joiden palkka on suurempi kuin 4000
df[df['palkka']>4000]
Out[2]:
nro sukup ikä perhe koulutus palveluv palkka johto työtov työymp palkkat työteht työterv lomaosa kuntosa hieroja
16 17 1 26 1 4.0 2.0 5225 5 5.0 5 4 5 NaN NaN 1.0 NaN
21 22 1 47 2 3.0 17.0 4874 2 4.0 3 2 4 NaN 1.0 NaN NaN
23 24 1 36 1 3.0 7.0 4446 3 4.0 3 4 5 NaN NaN NaN NaN
32 33 1 59 2 3.0 15.0 6278 4 4.0 5 4 4 NaN 1.0 NaN NaN
66 67 1 37 2 4.0 8.0 5069 3 4.0 3 2 2 1.0 1.0 NaN 1.0
In [3]:
# Naiset, joiden tyytyväisyys palkaan on 1 (erittäin tyytymätön) tai 2 (tyytymätön)
df[(df['sukup']==2) & (df['palkkat']<3)]
Out[3]:
nro sukup ikä perhe koulutus palveluv palkka johto työtov työymp palkkat työteht työterv lomaosa kuntosa hieroja
5 6 2 31 2 2.0 14.0 1910 4 4.0 5 2 4 1.0 1.0 NaN NaN
17 18 2 38 2 3.0 17.0 2729 4 3.0 4 2 1 NaN NaN NaN NaN
27 28 2 56 1 1.0 15.0 2223 3 4.0 3 2 4 1.0 NaN NaN 1.0
45 46 2 51 2 1.0 28.0 1989 3 3.0 2 2 3 1.0 NaN NaN 1.0
46 47 2 22 1 3.0 21.0 1872 3 3.0 4 1 3 NaN NaN 1.0 NaN
62 63 2 51 2 2.0 10.0 1872 4 3.0 2 2 3 1.0 NaN NaN NaN
63 64 2 44 1 2.0 1.0 1715 4 4.0 3 2 3 1.0 NaN NaN 1.0
64 65 2 35 2 2.0 17.0 2691 4 4.0 5 2 4 1.0 NaN NaN 1.0
In [4]:
# Ne, jotka ovat käyttäneet kuntosalia tai hierojaa
df[(df['kuntosa']==1) | (df['hieroja']==1)]
Out[4]:
nro sukup ikä perhe koulutus palveluv palkka johto työtov työymp palkkat työteht työterv lomaosa kuntosa hieroja
6 7 1 49 1 2.0 16.0 2066 3 5.0 4 2 2 NaN NaN 1.0 NaN
11 12 1 40 2 3.0 21.0 2846 3 5.0 3 1 2 NaN 1.0 NaN 1.0
15 16 2 42 2 3.0 23.0 2691 3 3.0 3 3 3 1.0 NaN NaN 1.0
16 17 1 26 1 4.0 2.0 5225 5 5.0 5 4 5 NaN NaN 1.0 NaN
18 19 1 42 1 3.0 20.0 2925 2 3.0 4 1 4 1.0 NaN NaN 1.0
19 20 2 40 2 2.0 13.0 2457 3 3.0 4 3 2 1.0 NaN NaN 1.0
25 26 1 26 1 2.0 3.0 1521 2 4.0 2 1 3 1.0 NaN 1.0 1.0
26 27 1 26 1 2.0 2.0 1989 2 4.0 2 2 3 1.0 NaN NaN 1.0
27 28 2 56 1 1.0 15.0 2223 3 4.0 3 2 4 1.0 NaN NaN 1.0
30 31 1 21 1 3.0 1.0 2340 4 5.0 3 4 2 1.0 NaN 1.0 1.0
33 34 1 37 2 1.0 14.0 2183 1 5.0 1 1 2 1.0 NaN NaN 1.0
34 35 1 28 2 2.0 5.0 1989 3 4.0 3 3 3 1.0 NaN 1.0 1.0
36 37 2 56 2 2.0 17.0 2729 5 5.0 5 5 5 NaN NaN NaN 1.0
43 44 1 30 1 2.0 7.0 2223 2 3.0 4 1 3 1.0 NaN NaN 1.0
44 45 1 55 2 1.0 35.0 2651 4 5.0 4 2 4 1.0 NaN NaN 1.0
45 46 2 51 2 1.0 28.0 1989 3 3.0 2 2 3 1.0 NaN NaN 1.0
46 47 2 22 1 3.0 21.0 1872 3 3.0 4 1 3 NaN NaN 1.0 NaN
48 49 1 27 2 2.0 7.0 2729 4 4.0 3 3 5 NaN NaN 1.0 NaN
54 55 2 45 2 1.0 17.0 2417 3 5.0 4 3 3 NaN NaN NaN 1.0
55 56 2 31 2 1.0 6.0 1949 4 4.0 4 3 3 1.0 NaN NaN 1.0
56 57 1 61 2 2.0 36.0 3119 2 NaN 2 1 5 1.0 NaN NaN 1.0
57 58 1 38 2 2.0 NaN 2574 2 3.0 1 1 2 1.0 NaN NaN 1.0
63 64 2 44 1 2.0 1.0 1715 4 4.0 3 2 3 1.0 NaN NaN 1.0
64 65 2 35 2 2.0 17.0 2691 4 4.0 5 2 4 1.0 NaN NaN 1.0
66 67 1 37 2 4.0 8.0 5069 3 4.0 3 2 2 1.0 1.0 NaN 1.0
69 70 1 52 2 2.0 22.0 3119 3 4.0 3 2 2 1.0 1.0 NaN 1.0
77 78 1 22 1 3.0 0.0 1598 4 4.0 4 3 4 NaN 1.0 1.0 NaN
79 80 1 27 1 2.0 7.0 2612 3 4.0 3 3 3 1.0 NaN 1.0 NaN
In [5]:
# Lasketaan uusi muuttuja, joka kertoo käytettyjen etuisuuksien lukumäärän
df['käyttö'] = df[['työterv', 'lomaosa', 'kuntosa', 'hieroja']].count(axis=1)

# Ne, jotka eivät ole käyttäneet mitään etuisuutta
df[df['käyttö']==0]
Out[5]:
nro sukup ikä perhe koulutus palveluv palkka johto työtov työymp palkkat työteht työterv lomaosa kuntosa hieroja käyttö
0 1 1 38 1 1.0 22.0 3587 3 3.0 3 3 3 NaN NaN NaN NaN 0
1 2 1 29 2 2.0 10.0 2963 1 5.0 2 1 3 NaN NaN NaN NaN 0
10 11 1 39 2 1.0 22.0 2651 3 5.0 3 1 3 NaN NaN NaN NaN 0
13 14 1 58 2 3.0 21.0 3587 4 5.0 4 1 3 NaN NaN NaN NaN 0
17 18 2 38 2 3.0 17.0 2729 4 3.0 4 2 1 NaN NaN NaN NaN 0
20 21 2 40 2 3.0 20.0 2691 2 4.0 5 3 4 NaN NaN NaN NaN 0
23 24 1 36 1 3.0 7.0 4446 3 4.0 3 4 5 NaN NaN NaN NaN 0
38 39 1 30 1 2.0 10.0 2300 3 5.0 3 3 4 NaN NaN NaN NaN 0
47 48 1 34 2 1.0 18.0 2183 4 5.0 4 1 3 NaN NaN NaN NaN 0
58 59 1 20 1 2.0 1.0 2261 3 4.0 3 2 3 NaN NaN NaN NaN 0
67 68 1 33 2 3.0 7.0 2417 2 4.0 3 1 4 NaN NaN NaN NaN 0
68 69 1 28 2 2.0 1.0 3510 4 5.0 3 1 4 NaN NaN NaN NaN 0
76 77 1 39 1 2.0 22.0 2183 4 5.0 3 1 2 NaN NaN NaN NaN 0
80 81 1 35 2 2.0 16.0 2808 3 4.0 3 3 3 NaN NaN NaN NaN 0
In [6]:
# Palkka ja palkkatyytyväisyys niille, joiden palkka on korkeintaan 2000
df[['palkka','palkkat']][df['palkka']<=2000]
Out[6]:
palkka palkkat
2 1989 1
5 1910 2
25 1521 1
26 1989 2
29 1949 3
34 1989 3
35 1559 1
45 1989 2
46 1872 1
52 1989 1
53 1559 1
55 1949 3
61 1872 2
62 1872 2
63 1715 2
74 1949 1
75 1598 1
77 1598 3
78 1638 1
In [7]:
# Edellinen palkan mukaan järjestettynä
df[['palkka','palkkat']][df['palkka']<=2000].sort_values(by='palkka')
Out[7]:
palkka palkkat
25 1521 1
35 1559 1
53 1559 1
75 1598 1
77 1598 3
78 1638 1
63 1715 2
46 1872 1
61 1872 2
62 1872 2
5 1910 2
29 1949 3
74 1949 1
55 1949 3
2 1989 1
34 1989 3
26 1989 2
45 1989 2
52 1989 1
In [8]:
# Edellinen palkan mukaan laskevassa järjestyksessä
df[['palkka','palkkat']][df['palkka']<=2000].sort_values(by='palkka', ascending=False)
Out[8]:
palkka palkkat
2 1989 1
34 1989 3
45 1989 2
52 1989 1
26 1989 2
29 1949 3
55 1949 3
74 1949 1
5 1910 2
46 1872 1
61 1872 2
62 1872 2
63 1715 2
78 1638 1
75 1598 1
77 1598 3
35 1559 1
53 1559 1
25 1521 1
In [9]:
# Avataan data, jossa päästään suodattamaan merkkijonon perusteella
df1 = pd.read_excel('http://taanila.fi/titanic.xlsx')
df1.head()
Out[9]:
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
0 1 1 Allen, Miss. Elisabeth Walton female 29.0000 0 0 24160 211.3375 B5 S 2 NaN St Louis, MO
1 1 1 Allison, Master. Hudson Trevor male 0.9167 1 2 113781 151.5500 C22 C26 S 11 NaN Montreal, PQ / Chesterville, ON
2 1 0 Allison, Miss. Helen Loraine female 2.0000 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON
3 1 0 Allison, Mr. Hudson Joshua Creighton male 30.0000 1 2 113781 151.5500 C22 C26 S NaN 135.0 Montreal, PQ / Chesterville, ON
4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 25.0000 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON
In [10]:
# Ne Titanicin matkustajat, joiden määränpäässä esiintyy merkkijono 'Finland'
df1[df1['home.dest'].astype(str).str.contains('Finland')]
Out[10]:
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
375 2 0 Collander, Mr. Erik Gustaf male 28.0 0 0 248740 13.000 NaN S NaN NaN Helsinki, Finland Ashtabula, Ohio
445 2 0 Hiltunen, Miss. Marta female 18.0 1 1 250650 13.000 NaN S NaN NaN Kontiolahti, Finland / Detroit, MI
558 2 1 Silven, Miss. Lyyli Karoliina female 18.0 0 2 250652 13.000 NaN S 16 NaN Finland / Minneapolis, MN
560 2 1 Sinkkonen, Miss. Anna female 30.0 0 0 250648 13.000 NaN S 10 NaN Finland / Washington, DC
606 3 1 Abrahamsson, Mr. Abraham August Johannes male 20.0 0 0 SOTON/O2 3101284 7.925 NaN S 15 NaN Taalintehdas, Finland Hoboken, NJ
615 3 0 Alhomaki, Mr. Ilmari Rudolf male 20.0 0 0 SOTON/O2 3101287 7.925 NaN S NaN NaN Salo, Finland Astoria, OR
625 3 1 Andersson, Miss. Erna Alexandra female 17.0 4 2 3101281 7.925 NaN S D NaN Ruotsinphyhtaa, Finland New York, NY
655 3 0 Backstrom, Mr. Karl Alfred male 32.0 1 0 3101278 15.850 NaN S D NaN Ruotsinphytaa, Finland New York, NY
656 3 1 Backstrom, Mrs. Karl Alfred (Maria Mathilda Gu... female 33.0 3 0 3101278 15.850 NaN S NaN NaN Ruotsinphytaa, Finland New York, NY
671 3 0 Berglund, Mr. Karl Ivar Sven male 22.0 0 0 PP 4348 9.350 NaN S NaN NaN Tranvik, Finland New York
837 3 0 Gustafsson, Mr. Anders Vilhelm male 37.0 2 0 3101276 7.925 NaN S NaN 98.0 Ruotsinphytaa, Finland New York, NY
838 3 0 Gustafsson, Mr. Johan Birger male 28.0 2 0 3101277 7.925 NaN S NaN NaN Ruotsinphytaa, Finland New York, NY
1063 3 0 Nirva, Mr. Iisakki Antino Aijo male 41.0 0 0 SOTON/O2 3101272 7.125 NaN S NaN NaN Finland Sudbury, ON
In [11]:
# Korvaan 'Finland' suomenkielisellä vastineella 'Suomi'
df1['home.dest'] = df1['home.dest'].astype(str).str.replace('Finland', 'Suomi')

# Tarkistan korvauksen onnistumisen
df1[df1['home.dest'].astype(str).str.contains('Suomi')]
Out[11]:
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
375 2 0 Collander, Mr. Erik Gustaf male 28.0 0 0 248740 13.000 NaN S NaN NaN Helsinki, Suomi Ashtabula, Ohio
445 2 0 Hiltunen, Miss. Marta female 18.0 1 1 250650 13.000 NaN S NaN NaN Kontiolahti, Suomi / Detroit, MI
558 2 1 Silven, Miss. Lyyli Karoliina female 18.0 0 2 250652 13.000 NaN S 16 NaN Suomi / Minneapolis, MN
560 2 1 Sinkkonen, Miss. Anna female 30.0 0 0 250648 13.000 NaN S 10 NaN Suomi / Washington, DC
606 3 1 Abrahamsson, Mr. Abraham August Johannes male 20.0 0 0 SOTON/O2 3101284 7.925 NaN S 15 NaN Taalintehdas, Suomi Hoboken, NJ
615 3 0 Alhomaki, Mr. Ilmari Rudolf male 20.0 0 0 SOTON/O2 3101287 7.925 NaN S NaN NaN Salo, Suomi Astoria, OR
625 3 1 Andersson, Miss. Erna Alexandra female 17.0 4 2 3101281 7.925 NaN S D NaN Ruotsinphyhtaa, Suomi New York, NY
655 3 0 Backstrom, Mr. Karl Alfred male 32.0 1 0 3101278 15.850 NaN S D NaN Ruotsinphytaa, Suomi New York, NY
656 3 1 Backstrom, Mrs. Karl Alfred (Maria Mathilda Gu... female 33.0 3 0 3101278 15.850 NaN S NaN NaN Ruotsinphytaa, Suomi New York, NY
671 3 0 Berglund, Mr. Karl Ivar Sven male 22.0 0 0 PP 4348 9.350 NaN S NaN NaN Tranvik, Suomi New York
837 3 0 Gustafsson, Mr. Anders Vilhelm male 37.0 2 0 3101276 7.925 NaN S NaN 98.0 Ruotsinphytaa, Suomi New York, NY
838 3 0 Gustafsson, Mr. Johan Birger male 28.0 2 0 3101277 7.925 NaN S NaN NaN Ruotsinphytaa, Suomi New York, NY
1063 3 0 Nirva, Mr. Iisakki Antino Aijo male 41.0 0 0 SOTON/O2 3101272 7.125 NaN S NaN NaN Suomi Sudbury, ON