pathUrl2 = 'https://raw.githubusercontent.com/pydata/pydata-book/master/ch02/usagov_bitly_data2012-03-16-1331923249.txt'
import urllib
response = urllib.urlopen(pathUrl2)
responseLines = response.readlines()
responseLines[0]
'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'
import json
records = [json.loads(line) for line in responseLines]
records[0]
{u'a': u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11', u'al': u'en-US,en;q=0.8', u'c': u'US', u'cy': u'Danvers', u'g': u'A6qOVH', u'gr': u'MA', u'h': u'wfLQtf', u'hc': 1331822918, u'hh': u'1.usa.gov', u'l': u'orofrog', u'll': [42.576698, -70.954903], u'nk': 1, u'r': u'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf', u't': 1331923247, u'tz': u'America/New_York', u'u': u'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}
records[0]['tz']
u'America/New_York'
print records[0]['tz']
America/New_York
time_zones=[rec['tz'] for rec in records]
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-15-8e4b2ebabb9c> in <module>() ----> 1 time_zones=[rec['tz'] for rec in records] KeyError: 'tz'
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
time_zones[:10]
[u'America/New_York', u'America/Denver', u'America/New_York', u'America/Sao_Paulo', u'America/New_York', u'America/New_York', u'Europe/Warsaw', u'', u'', u'']
from collections import Counter
counts = Counter(time_zones)
counts.most_common(10)
[(u'America/New_York', 1251), (u'', 521), (u'America/Chicago', 400), (u'America/Los_Angeles', 382), (u'America/Denver', 191), (u'Europe/London', 74), (u'Asia/Tokyo', 37), (u'Pacific/Honolulu', 36), (u'Europe/Madrid', 35), (u'America/Sao_Paulo', 33)]
from pandas import DataFrame, Series
import pandas as pd;
import numpy as np
pd.__version__
'0.16.1'
frame = DataFrame(records)
frame
_heartbeat_ | a | al | c | cy | g | gr | h | hc | hh | kw | l | ll | nk | r | t | tz | u | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | en-US,en;q=0.8 | US | Danvers | A6qOVH | MA | wfLQtf | 1331822918 | 1.usa.gov | NaN | orofrog | [42.576698, -70.954903] | 1 | http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/... | 1331923247 | America/New_York | http://www.ncbi.nlm.nih.gov/pubmed/22415991 |
1 | NaN | GoogleMaps/RochesterNY | NaN | US | Provo | mwszkS | UT | mwszkS | 1308262393 | j.mp | NaN | bitly | [40.218102, -111.613297] | 0 | http://www.AwareMap.com/ | 1331923249 | America/Denver | http://www.monroecounty.gov/etc/911/rss.php |
2 | NaN | Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... | en-US | US | Washington | xxr3Qb | DC | xxr3Qb | 1331919941 | 1.usa.gov | NaN | bitly | [38.9007, -77.043098] | 1 | http://t.co/03elZC4Q | 1331923250 | America/New_York | http://boxer.senate.gov/en/press/releases/0316... |
3 | NaN | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... | pt-br | BR | Braz | zCaLwp | 27 | zUtuOu | 1331923068 | 1.usa.gov | NaN | alelex88 | [-23.549999, -46.616699] | 0 | direct | 1331923249 | America/Sao_Paulo | http://apod.nasa.gov/apod/ap120312.html |
4 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | en-US,en;q=0.8 | US | Shrewsbury | 9b6kNl | MA | 9b6kNl | 1273672411 | bit.ly | NaN | bitly | [42.286499, -71.714699] | 0 | http://www.shrewsbury-ma.gov/selco/ | 1331923251 | America/New_York | http://www.shrewsbury-ma.gov/egov/gallery/1341... |
5 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | en-US,en;q=0.8 | US | Shrewsbury | axNK8c | MA | axNK8c | 1273672506 | bit.ly | NaN | bitly | [42.286499, -71.714699] | 0 | http://www.shrewsbury-ma.gov/selco/ | 1331923252 | America/New_York | http://www.shrewsbury-ma.gov/egov/gallery/1341... |
6 | NaN | Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1... | pl-PL,pl;q=0.8,en-US;q=0.6,en;q=0.4 | PL | Luban | wcndER | 77 | zkpJBR | 1331922854 | 1.usa.gov | NaN | bnjacobs | [51.116699, 15.2833] | 0 | http://plus.url.google.com/url?sa=z&n=13319232... | 1331923255 | Europe/Warsaw | http://www.nasa.gov/mission_pages/nustar/main/... |
7 | NaN | Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/2... | bg,en-us;q=0.7,en;q=0.3 | None | NaN | wcndER | NaN | zkpJBR | 1331922854 | 1.usa.gov | NaN | bnjacobs | NaN | 0 | http://www.facebook.com/ | 1331923255 | http://www.nasa.gov/mission_pages/nustar/main/... | |
8 | NaN | Opera/9.80 (X11; Linux zbov; U; en) Presto/2.1... | en-US, en | None | NaN | wcndER | NaN | zkpJBR | 1331922854 | 1.usa.gov | NaN | bnjacobs | NaN | 0 | http://www.facebook.com/l.php?u=http%3A%2F%2F1... | 1331923254 | http://www.nasa.gov/mission_pages/nustar/main/... | |
9 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | pt-BR,pt;q=0.8,en-US;q=0.6,en;q=0.4 | None | NaN | zCaLwp | NaN | zUtuOu | 1331923068 | 1.usa.gov | NaN | alelex88 | NaN | 0 | http://t.co/o1Pd0WeV | 1331923255 | http://apod.nasa.gov/apod/ap120312.html | |
10 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)... | en-us,en;q=0.5 | US | Seattle | vNJS4H | WA | u0uD9q | 1319563556 | 1.usa.gov | NaN | o_4us71ccioa | [47.5951, -122.332603] | 1 | direct | 1331923258 | America/Los_Angeles | https://www.nysdot.gov/rexdesign/design/commun... |
11 | NaN | Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4... | en-us,en;q=0.5 | US | Washington | wG7OIH | DC | A0nRz4 | 1331815838 | 1.usa.gov | NaN | darrellissa | [38.937599, -77.092796] | 0 | http://t.co/ND7SoPyo | 1331923259 | America/New_York | http://oversight.house.gov/wp-content/uploads/... |
12 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)... | en-us,en;q=0.5 | US | Alexandria | vNJS4H | VA | u0uD9q | 1319563556 | 1.usa.gov | NaN | o_4us71ccioa | [38.790901, -77.094704] | 1 | direct | 1331923259 | America/New_York | https://www.nysdot.gov/rexdesign/design/commun... |
13 | 1331923261 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
14 | NaN | Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US... | en-us,en;q=0.5 | US | Marietta | 2rOUYc | GA | 2rOUYc | 1255769846 | 1.usa.gov | NaN | bitly | [33.953201, -84.5177] | 1 | direct | 1331923262 | America/New_York | http://toxtown.nlm.nih.gov/index.php |
15 | NaN | Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1... | zh-TW,zh;q=0.8,en-US;q=0.6,en;q=0.4 | HK | Central District | nQvgJp | 00 | rtrrth | 1317318030 | j.mp | NaN | walkeryuen | [22.2833, 114.150002] | 1 | http://forum2.hkgolden.com/view.aspx?type=BW&m... | 1331923263 | Asia/Hong_Kong | http://www.ssd.noaa.gov/PS/TROP/TCFP/data/curr... |
16 | NaN | Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1... | zh-TW,zh;q=0.8,en-US;q=0.6,en;q=0.4 | HK | Central District | XdUNr | 00 | qWkgbq | 1317318039 | j.mp | NaN | walkeryuen | [22.2833, 114.150002] | 1 | http://forum2.hkgolden.com/view.aspx?type=BW&m... | 1331923263 | Asia/Hong_Kong | http://www.usno.navy.mil/NOOC/nmfc-ph/RSS/jtwc... |
17 | NaN | Mozilla/5.0 (Macintosh; Intel Mac OS X 10.5; r... | en-us,en;q=0.5 | US | Buckfield | zH1BFf | ME | x3jOIv | 1331839576 | 1.usa.gov | NaN | andyzieminski | [44.299702, -70.369797] | 0 | http://t.co/6Cx4ROLs | 1331923264 | America/New_York | http://www.usda.gov/wps/portal/usda/usdahome?c... |
18 | NaN | GoogleMaps/RochesterNY | NaN | US | Provo | mwszkS | UT | mwszkS | 1308262393 | 1.usa.gov | NaN | bitly | [40.218102, -111.613297] | 0 | http://www.AwareMap.com/ | 1331923262 | America/Denver | http://www.monroecounty.gov/etc/911/rss.php |
19 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | it-IT,it;q=0.8,en-US;q=0.6,en;q=0.4 | IT | Venice | wcndER | 20 | zkpJBR | 1331922854 | 1.usa.gov | NaN | bnjacobs | [45.438599, 12.3267] | 0 | http://www.facebook.com/ | 1331923264 | Europe/Rome | http://www.nasa.gov/mission_pages/nustar/main/... |
20 | NaN | Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ... | es-ES | ES | Alcal | zQ95Hi | 51 | ytZYWR | 1331670549 | bitly.com | NaN | jplnews | [37.516701, -5.9833] | 0 | http://www.facebook.com/ | 1331923265 | Africa/Ceuta | http://voyager.jpl.nasa.gov/imagesvideo/uranus... |
21 | NaN | Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6... | en-us,en;q=0.5 | US | Davidsonville | wcndER | MD | zkpJBR | 1331922854 | 1.usa.gov | NaN | bnjacobs | [38.939201, -76.635002] | 0 | http://www.facebook.com/ | 1331923267 | America/New_York | http://www.nasa.gov/mission_pages/nustar/main/... |
22 | NaN | Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... | en-us | US | Hockessin | y3ZImz | DE | y3ZImz | 1331064158 | 1.usa.gov | NaN | bitly | [39.785, -75.682297] | 0 | direct | 1331923267 | America/New_York | http://portal.hud.gov/hudportal/documents/hudd... |
23 | NaN | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3)... | en-us | US | Lititz | wWiOiD | PA | wWiOiD | 1330217829 | 1.usa.gov | NaN | bitly | [40.174999, -76.3078] | 0 | http://www.facebook.com/l.php?u=http%3A%2F%2F1... | 1331923267 | America/New_York | http://www.tricare.mil/mybenefit/ProfileFilter... |
24 | NaN | Mozilla/5.0 (Windows; U; Windows NT 5.1; es-ES... | es-es,es;q=0.8,en-us;q=0.5,en;q=0.3 | ES | Bilbao | wcndER | 59 | zkpJBR | 1331922854 | 1.usa.gov | NaN | bnjacobs | [43.25, -2.9667] | 0 | http://www.facebook.com/ | 1331923268 | Europe/Madrid | http://www.nasa.gov/mission_pages/nustar/main/... |
25 | NaN | Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1... | en-GB,en;q=0.8,en-US;q=0.6,en-AU;q=0.4 | MY | Kuala Lumpur | wcndER | 14 | zkpJBR | 1331922854 | 1.usa.gov | NaN | bnjacobs | [3.1667, 101.699997] | 0 | http://www.facebook.com/ | 1331923269 | Asia/Kuala_Lumpur | http://www.nasa.gov/mission_pages/nustar/main/... |
26 | NaN | Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1... | ro-RO,ro;q=0.8,en-US;q=0.6,en;q=0.4 | CY | Nicosia | wcndER | 04 | zkpJBR | 1331922854 | 1.usa.gov | NaN | bnjacobs | [35.166698, 33.366699] | 0 | http://www.facebook.com/?ref=tn_tnmn | 1331923268 | Asia/Nicosia | http://www.nasa.gov/mission_pages/nustar/main/... |
27 | NaN | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... | en-US,en;q=0.8 | BR | SPaulo | zCaLwp | 27 | zUtuOu | 1331923068 | 1.usa.gov | NaN | alelex88 | [-23.5333, -46.616699] | 0 | direct | 1331923269 | America/Sao_Paulo | http://apod.nasa.gov/apod/ap120312.html |
28 | NaN | Mozilla/5.0 (iPad; CPU OS 5_0_1 like Mac OS X)... | en-us | None | NaN | vNJS4H | NaN | u0uD9q | 1319563556 | 1.usa.gov | NaN | o_4us71ccioa | NaN | 0 | direct | 1331923270 | https://www.nysdot.gov/rexdesign/design/commun... | |
29 | NaN | Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X... | en-us | None | NaN | FPX0IM | NaN | FPX0IL | 1331922978 | 1.usa.gov | NaN | twittershare | NaN | 1 | http://t.co/5xlp0B34 | 1331923270 | http://www.ed.gov/news/media-advisories/us-dep... | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3530 | NaN | Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1... | en-US,en;q=0.8 | US | San Francisco | xVZg4P | CA | wqUkTo | 1331908247 | go.nasa.gov | NaN | nasatwitter | [37.7645, -122.429398] | 0 | http://www.facebook.com/l.php?u=http%3A%2F%2Fg... | 1331926815 | America/Los_Angeles | http://www.nasa.gov/multimedia/imagegallery/im... |
3531 | NaN | Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6... | en-US | None | NaN | wcndER | NaN | zkpJBR | 1331922854 | 1.usa.gov | NaN | bnjacobs | NaN | 0 | direct | 1331926816 | http://www.nasa.gov/mission_pages/nustar/main/... | |
3532 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)... | en-us,en;q=0.5 | US | Washington | Au3aUS | DC | A9ct6C | 1331926420 | 1.usa.gov | NaN | ncsha | [38.904202, -77.031998] | 1 | http://www.ncsha.org/ | 1331926817 | America/New_York | http://portal.hud.gov/hudportal/HUD?src=/press... |
3533 | NaN | Mozilla/5.0 (iPad; CPU OS 5_1 like Mac OS X) A... | en-us | US | Jacksonville | b2UtUJ | FL | ieCdgH | 1301393171 | go.nasa.gov | NaN | nasatwitter | [30.279301, -81.585098] | 1 | direct | 1331926818 | America/New_York | http://apod.nasa.gov/apod/ |
3534 | NaN | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... | en-us | US | Frisco | vNJS4H | TX | u0uD9q | 1319563556 | 1.usa.gov | NaN | o_4us71ccioa | [33.149899, -96.855499] | 1 | direct | 1331926820 | America/Chicago | https://www.nysdot.gov/rexdesign/design/commun... |
3535 | NaN | Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/... | en-us | US | Houston | zIgLx8 | TX | yrPaLt | 1331903484 | aash.to | NaN | aashto | [29.775499, -95.415199] | 1 | direct | 1331926823 | America/Chicago | http://ntl.bts.gov/lib/44000/44300/44374/FHWA-... |
3536 | NaN | Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; e... | en-US,en;q=0.5 | None | NaN | xIcyim | NaN | yG1TTf | 1331728309 | go.nasa.gov | NaN | nasatwitter | NaN | 0 | http://t.co/g1VKE8zS | 1331926824 | http://www.nasa.gov/mission_pages/hurricanes/a... | |
3537 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)... | es-es,es;q=0.8,en-us;q=0.5,en;q=0.3 | HN | Tegucigalpa | zCaLwp | 08 | w63FZW | 1331546756 | 1.usa.gov | NaN | bufferapp | [14.1, -87.216698] | 0 | http://t.co/A8TJyibE | 1331926825 | America/Tegucigalpa | http://apod.nasa.gov/apod/ap120312.html |
3538 | NaN | Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Ma... | en-us | US | Los Angeles | qMac9k | CA | qds1Ge | 1310473559 | 1.usa.gov | NaN | healthypeople | [34.041599, -118.298798] | 0 | direct | 1331926825 | America/Los_Angeles | http://healthypeople.gov/2020/connect/webinars... |
3539 | NaN | Mozilla/5.0 (compatible; Fedora Core 3) FC3 KDE | NaN | US | Bellevue | zu2M5o | WA | zDhdro | 1331586192 | bit.ly | NaN | glimtwin | [47.615398, -122.210297] | 0 | direct | 1331926827 | America/Los_Angeles | http://www.federalreserve.gov/newsevents/press... |
3540 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | en-US,en;q=0.8 | US | Payson | wcndER | UT | zkpJBR | 1331922854 | 1.usa.gov | NaN | bnjacobs | [40.014198, -111.738899] | 0 | http://www.facebook.com/l.php?u=http%3A%2F%2F1... | 1331926828 | America/Denver | http://www.nasa.gov/mission_pages/nustar/main/... |
3541 | NaN | Mozilla/5.0 (X11; U; OpenVMS AlphaServer_ES40;... | NaN | US | Bellevue | zu2M5o | WA | zDhdro | 1331586192 | 1.usa.gov | NaN | glimtwin | [47.615398, -122.210297] | 0 | direct | 1331926828 | America/Los_Angeles | http://www.federalreserve.gov/newsevents/press... |
3542 | NaN | Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ... | en-us | US | Pittsburg | y3reI1 | CA | y3reI1 | 1331926120 | 1.usa.gov | NaN | bitly | [38.0051, -121.838699] | 0 | http://www.facebook.com/l.php?u=http%3A%2F%2F1... | 1331926829 | America/Los_Angeles | http://www.sba.gov/community/blogs/community-b... |
3543 | 1331926831 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3544 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0.1) ... | en-us,en;q=0.5 | US | Wentzville | vNJS4H | MO | u0uD9q | 1319563556 | 1.usa.gov | NaN | o_4us71ccioa | [38.790001, -90.854897] | 1 | direct | 1331926831 | America/Chicago | https://www.nysdot.gov/rexdesign/design/commun... |
3545 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)... | en-us,en;q=0.5 | US | Saint Charles | vNJS4H | IL | u0uD9q | 1319563556 | 1.usa.gov | NaN | o_4us71ccioa | [41.9352, -88.290901] | 1 | direct | 1331926832 | America/Chicago | https://www.nysdot.gov/rexdesign/design/commun... |
3546 | NaN | Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Ma... | en-us | US | Los Angeles | qMac9k | CA | qds1Ge | 1310473559 | 1.usa.gov | NaN | healthypeople | [34.041599, -118.298798] | 1 | direct | 1331926833 | America/Los_Angeles | http://healthypeople.gov/2020/connect/webinars... |
3547 | NaN | Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... | en-us | US | Silver Spring | y0jYkg | MD | y0jYkg | 1331851811 | 1.usa.gov | NaN | bitly | [39.052101, -77.014999] | 1 | direct | 1331926836 | America/New_York | http://www.epa.gov/otaq/regs/fuels/additive/e1... |
3548 | NaN | Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Ma... | en-us | US | Mcgehee | y5rMac | AR | xANY6O | 1331916302 | 1.usa.gov | NaN | twitterfeed | [33.628399, -91.356903] | 1 | https://twitter.com/fdarecalls/status/18069759... | 1331926836 | America/Chicago | http://www.fda.gov/Safety/Recalls/ucm296326.htm |
3549 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | sv-SE,sv;q=0.8,en-US;q=0.6,en;q=0.4 | SE | Sollefte | eH8wu | 24 | 7dtjei | 1260316355 | 1.usa.gov | NaN | tweetdeckapi | [63.166698, 17.266701] | 1 | direct | 1331926834 | Europe/Stockholm | http://www.nasa.gov/mission_pages/WISE/main/in... |
3550 | NaN | Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... | en-us | US | Conshohocken | A00b72 | PA | yGSwzn | 1331917632 | 1.usa.gov | NaN | addthis | [40.0798, -75.2855] | 0 | http://www.linkedin.com/home?trk=hb_tab_home_top | 1331926837 | America/New_York | http://www.nlm.nih.gov/medlineplus/news/fullst... |
3551 | NaN | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... | en-US,en;q=0.8 | None | NaN | wcndER | NaN | zkpJBR | 1331922854 | 1.usa.gov | NaN | bnjacobs | NaN | 0 | http://plus.url.google.com/url?sa=z&n=13319268... | 1331926837 | http://www.nasa.gov/mission_pages/nustar/main/... | |
3552 | NaN | Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US... | NaN | US | Decatur | rqgJuE | AL | xcz8vt | 1331227417 | 1.usa.gov | NaN | bootsnall | [34.572701, -86.940598] | 0 | direct | 1331926839 | America/Chicago | http://travel.state.gov/passport/passport_5535... |
3553 | NaN | Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ... | en-us | US | Shrewsbury | 9b6kNl | MA | 9b6kNl | 1273672411 | bit.ly | NaN | bitly | [42.286499, -71.714699] | 0 | http://www.shrewsbury-ma.gov/selco/ | 1331926840 | America/New_York | http://www.shrewsbury-ma.gov/egov/gallery/1341... |
3554 | NaN | Mozilla/4.0 (compatible; MSIE 7.0; Windows NT ... | en-us | US | Shrewsbury | axNK8c | MA | axNK8c | 1273672506 | bit.ly | NaN | bitly | [42.286499, -71.714699] | 0 | http://www.shrewsbury-ma.gov/selco/ | 1331926840 | America/New_York | http://www.shrewsbury-ma.gov/egov/gallery/1341... |
3555 | NaN | Mozilla/4.0 (compatible; MSIE 9.0; Windows NT ... | en | US | Paramus | e5SvKE | NJ | fqPSr9 | 1301298479 | 1.usa.gov | NaN | tweetdeckapi | [40.9445, -74.07] | 1 | direct | 1331926841 | America/New_York | http://www.fda.gov/AdvisoryCommittees/Committe... |
3556 | NaN | Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1... | en-US,en;q=0.8 | US | Oklahoma City | jQLtP4 | OK | jQLtP4 | 1307530247 | 1.usa.gov | NaN | bitly | [35.4715, -97.518997] | 0 | http://www.facebook.com/l.php?u=http%3A%2F%2F1... | 1331926844 | America/Chicago | http://www.okc.gov/PublicNotificationSystem/Fo... |
3557 | NaN | GoogleMaps/RochesterNY | NaN | US | Provo | mwszkS | UT | mwszkS | 1308262393 | j.mp | NaN | bitly | [40.218102, -111.613297] | 0 | http://www.AwareMap.com/ | 1331926846 | America/Denver | http://www.monroecounty.gov/etc/911/rss.php |
3558 | NaN | GoogleProducer | NaN | US | Mountain View | zjtI4X | CA | zjtI4X | 1327528527 | 1.usa.gov | NaN | bitly | [37.419201, -122.057404] | 0 | direct | 1331926847 | America/Los_Angeles | http://www.ahrq.gov/qual/qitoolkit/ |
3559 | NaN | Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... | en-US | US | Mc Lean | qxKrTK | VA | qxKrTK | 1312897670 | 1.usa.gov | NaN | bitly | [38.935799, -77.162102] | 0 | http://t.co/OEEEvwjU | 1331926849 | America/New_York | http://herndon-va.gov/Content/public_safety/Pu... |
3560 rows × 18 columns
frame['tz'][:10]
0 America/New_York 1 America/Denver 2 America/New_York 3 America/Sao_Paulo 4 America/New_York 5 America/New_York 6 Europe/Warsaw 7 8 9 Name: tz, dtype: object
tz_counts = frame['tz'].value_counts()
tz_counts[:10]
America/New_York 1251 521 America/Chicago 400 America/Los_Angeles 382 America/Denver 191 Europe/London 74 Asia/Tokyo 37 Pacific/Honolulu 36 Europe/Madrid 35 America/Sao_Paulo 33 dtype: int64
%matplotlib inline
tz_counts[:10].plot(kind='barh', rot=10)
<matplotlib.axes._subplots.AxesSubplot at 0xaf99978>
frame['a'][1]
u'GoogleMaps/RochesterNY'
frame['a'][50]
u'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
frame['a'][51]
u'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
result = Series([x.split()[0] for x in frame.a.dropna()])
result[:5]
0 Mozilla/5.0 1 GoogleMaps/RochesterNY 2 Mozilla/4.0 3 Mozilla/5.0 4 Mozilla/5.0 dtype: object
result.value_counts()[:8]
Mozilla/5.0 2594 Mozilla/4.0 601 GoogleMaps/RochesterNY 121 Opera/9.80 34 TEST_INTERNET_AGENT 24 GoogleProducer 21 Mozilla/6.0 5 BlackBerry8520/5.0.0.681 4 dtype: int64
cframe = frame[frame.a.notnull()]
operating_system = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')
operating_system[:5]
#각 행이 윈도우인지 아닌지 검사
array(['Windows', 'Not Windows', 'Windows', 'Not Windows', 'Windows'], dtype='|S11')
by_tz_os = cframe.groupby(['tz', operating_system])
#표준시간대와 운영체제 데이터를 그룹으로 묶기
by_tz_os.size()
tz Not Windows 245 Windows 276 Africa/Cairo Windows 3 Africa/Casablanca Windows 1 Africa/Ceuta Windows 2 Africa/Johannesburg Windows 1 Africa/Lusaka Windows 1 America/Anchorage Not Windows 4 Windows 1 America/Argentina/Buenos_Aires Not Windows 1 America/Argentina/Cordoba Windows 1 America/Argentina/Mendoza Windows 1 America/Bogota Not Windows 1 Windows 2 America/Caracas Windows 1 America/Chicago Not Windows 115 Windows 285 America/Chihuahua Not Windows 1 Windows 1 America/Costa_Rica Windows 1 America/Denver Not Windows 132 Windows 59 America/Edmonton Not Windows 2 Windows 4 America/Guayaquil Not Windows 2 America/Halifax Not Windows 1 Windows 3 America/Indianapolis Not Windows 8 Windows 12 America/La_Paz Windows 1 ... Europe/Madrid Not Windows 16 Windows 19 Europe/Malta Windows 2 Europe/Moscow Not Windows 1 Windows 9 Europe/Oslo Not Windows 2 Windows 8 Europe/Paris Not Windows 4 Windows 10 Europe/Prague Not Windows 3 Windows 7 Europe/Riga Not Windows 1 Windows 1 Europe/Rome Not Windows 8 Windows 19 Europe/Skopje Windows 1 Europe/Sofia Windows 1 Europe/Stockholm Not Windows 2 Windows 12 Europe/Uzhgorod Windows 1 Europe/Vienna Not Windows 3 Windows 3 Europe/Vilnius Windows 2 Europe/Volgograd Windows 1 Europe/Warsaw Not Windows 1 Windows 15 Europe/Zurich Not Windows 4 Pacific/Auckland Not Windows 3 Windows 8 Pacific/Honolulu Windows 36 dtype: int64
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]
Not Windows | Windows | |
---|---|---|
tz | ||
245 | 276 | |
Africa/Cairo | 0 | 3 |
Africa/Casablanca | 0 | 1 |
Africa/Ceuta | 0 | 2 |
Africa/Johannesburg | 0 | 1 |
Africa/Lusaka | 0 | 1 |
America/Anchorage | 4 | 1 |
America/Argentina/Buenos_Aires | 1 | 0 |
America/Argentina/Cordoba | 0 | 1 |
America/Argentina/Mendoza | 0 | 1 |
indexer = agg_counts.sum(1).argsort()
indexer[:10]
tz 24 Africa/Cairo 20 Africa/Casablanca 21 Africa/Ceuta 92 Africa/Johannesburg 87 Africa/Lusaka 53 America/Anchorage 54 America/Argentina/Buenos_Aires 57 America/Argentina/Cordoba 26 America/Argentina/Mendoza 55 dtype: int64
count_subset = agg_counts.take(indexer)[-10:]
#take를 사용해 정렬된 순서그대로 선택, 마지막 10행 잘라내기
count_subset
Not Windows | Windows | |
---|---|---|
tz | ||
America/Sao_Paulo | 13 | 20 |
Europe/Madrid | 16 | 19 |
Pacific/Honolulu | 0 | 36 |
Asia/Tokyo | 2 | 35 |
Europe/London | 43 | 31 |
America/Denver | 132 | 59 |
America/Los_Angeles | 130 | 252 |
America/Chicago | 115 | 285 |
245 | 276 | |
America/New_York | 339 | 912 |
count_subset.plot(kind='barh', stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0xcad0518>
nored_subset = count_subset.div(count_subset.sum(1), axis=0)
nored_subset.plot(kind='barh', stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0xcc235c0>
pathUrl3 = 'https://raw.githubusercontent.com/pydata/pydata-book/master/ch02/movielens/users.dat'
import urllib
response = urllib.urlopen(pathUrl3)
responseLines = response.readlines()
#한줄씩읽음
responseLines
['1::F::1::10::48067\n', '2::M::56::16::70072\n', '3::M::25::15::55117\n', '4::M::45::7::02460\n', '5::M::25::20::55455\n', '6::F::50::9::55117\n', '7::M::35::1::06810\n', '8::M::25::12::11413\n', '9::M::25::17::61614\n', '10::F::35::1::95370\n', '11::F::25::1::04093\n', '12::M::25::12::32793\n', '13::M::45::1::93304\n', '14::M::35::0::60126\n', '15::M::25::7::22903\n', '16::F::35::0::20670\n', '17::M::50::1::95350\n', '18::F::18::3::95825\n', '19::M::1::10::48073\n', '20::M::25::14::55113\n', '21::M::18::16::99353\n', '22::M::18::15::53706\n', '23::M::35::0::90049\n', '24::F::25::7::10023\n', '25::M::18::4::01609\n', '26::M::25::7::23112\n', '27::M::25::11::19130\n', '28::F::25::1::14607\n', '29::M::35::7::33407\n', '30::F::35::7::19143\n', '31::M::56::7::06840\n', '32::F::25::0::19355\n', '33::M::45::3::55421\n', '34::F::18::0::02135\n', '35::M::45::1::02482\n', '36::M::25::3::94123\n', '37::F::25::9::66212\n', '38::F::18::4::02215\n', '39::M::18::4::61820\n', '40::M::45::0::10543\n', '41::F::18::4::15116\n', '42::M::25::8::24502\n', '43::M::25::12::60614\n', '44::M::45::17::98052\n', '45::F::45::16::94110\n', '46::M::18::19::75602\n', '47::M::18::4::94305\n', '48::M::25::4::92107\n', '49::M::18::12::77084\n', '50::F::25::2::98133\n', '51::F::1::10::10562\n', '52::M::18::4::72212\n', '53::M::25::0::96931\n', '54::M::50::1::56723\n', '55::F::35::12::55303\n', '56::M::35::20::60440\n', '57::M::18::19::30350\n', '58::M::25::2::30303\n', '59::F::50::1::55413\n', '60::M::50::1::72118\n', '61::M::25::17::95122\n', '62::F::35::3::98105\n', '63::M::18::4::54902\n', '64::M::18::1::53706\n', '65::M::35::12::55803\n', '66::M::25::18::57706\n', '67::F::50::5::60181\n', '68::M::18::4::53706\n', '69::F::25::1::02143\n', '70::M::18::4::53703\n', '71::M::25::14::95008\n', '72::F::45::0::55122\n', '73::M::18::4::53706\n', '74::M::35::14::94530\n', '75::F::1::10::01748\n', '76::M::35::7::55413\n', '77::M::18::4::15321\n', '78::F::45::1::98029\n', '79::F::45::0::98103\n', '80::M::56::1::49327\n', '81::F::25::0::60640\n', '82::M::25::17::48380\n', '83::F::25::2::94609\n', '84::M::18::4::53140\n', '85::M::18::4::94945\n', '86::F::1::10::54467\n', '87::M::25::14::48360\n', '88::F::45::1::02476\n', '89::F::56::9::85749\n', '90::M::56::13::85749\n', '91::M::35::7::07650\n', '92::F::18::4::44243\n', '93::M::25::17::95825\n', '94::M::25::17::28601\n', '95::M::45::0::98201\n', '96::F::25::16::78028\n', '97::F::35::3::66210\n', '98::F::35::7::33547\n', '99::F::1::10::19390\n', '100::M::35::17::95401\n', '101::F::18::3::33314\n', '102::M::35::19::20871\n', '103::M::45::7::92104\n', '104::M::25::12::00926\n', '105::M::45::12::90277\n', '106::F::35::11::79101\n', '107::M::45::18::63129\n', '108::M::25::12::30316\n', '109::M::45::15::92028\n', '110::M::25::2::90803\n', '111::M::35::15::55416\n', '112::M::25::16::97209\n', '113::M::18::12::37032\n', '114::F::25::2::83712\n', '115::M::25::17::28083\n', '116::M::25::17::55744\n', '117::M::25::17::33314\n', '118::M::35::17::22315\n', '119::F::1::10::77515\n', '120::M::25::11::27106\n', '121::M::35::7::75229\n', '122::F::18::4::94305\n', '123::M::35::9::67208\n', '124::M::56::7::91356\n', '125::M::45::14::01701\n', '126::M::18::9::98117\n', '127::F::45::3::01770\n', '128::M::56::6::37922\n', '129::M::25::11::20164\n', '130::M::35::17::50021\n', '131::M::18::4::06520\n', '132::M::25::17::99709\n', '133::F::25::0::55071\n', '134::M::25::0::66212\n', '135::M::18::4::20006\n', '136::M::18::2::21202\n', '137::F::45::6::78758\n', '138::M::18::20::22203\n', '139::F::25::20::45409\n', '140::F::35::1::55107\n', '141::M::35::13::62035\n', '142::M::25::7::10011\n', '143::M::18::3::64043\n', '144::M::25::17::29401\n', '145::M::18::4::19081\n', '146::F::35::20::10954\n', '147::M::18::4::91360\n', '148::M::50::17::57747\n', '149::M::25::1::29205\n', '150::M::35::7::98144\n', '151::F::25::20::85013\n', '152::M::18::4::48104\n', '153::M::1::10::51537\n', '154::M::50::20::94530\n', '155::M::35::12::07470\n', '156::F::45::7::14519\n', '157::M::35::16::12866\n', '158::M::56::7::28754\n', '159::F::45::0::37922\n', '160::M::35::7::13021\n', '161::M::45::16::98107-2117\n', '162::F::18::4::93117\n', '163::M::18::4::85013\n', '164::F::56::13::94566\n', '165::M::18::16::98502\n', '166::M::18::4::92802\n', '167::F::25::11::10022\n', '168::F::50::0::46970\n', '169::M::25::7::55439\n', '170::M::25::11::07002\n', '171::F::50::17::55441\n', '172::M::25::3::07661\n', '173::M::25::0::45237\n', '174::M::25::16::21203\n', '175::F::25::2::95123\n', '176::F::18::3::55016\n', '177::M::50::1::54016\n', '178::M::56::17::53705\n', '179::M::25::0::02135\n', '180::M::45::12::01603\n', '181::M::18::17::33186\n', '182::M::18::4::03052\n', '183::F::45::1::55407\n', '184::F::25::0::19001\n', '185::M::45::0::14468\n', '186::M::18::5::91767\n', '187::F::45::1::94061\n', '188::M::56::16::79930\n', '189::M::18::0::60076\n', '190::M::25::17::55125\n', '191::M::18::4::04915\n', '192::M::18::1::10977\n', '193::F::45::15::44106\n', '194::F::1::10::29146\n', '195::M::25::12::10458\n', '196::F::35::9::94587\n', '197::M::18::14::10023\n', '198::M::25::12::55108\n', '199::M::18::4::83706\n', '200::F::18::4::84321\n', '201::F::35::2::55117\n', '202::M::18::4::53706\n', '203::F::18::4::53715\n', '204::M::25::7::92123\n', '205::M::35::12::97333\n', '206::M::25::17::20194\n', '207::M::25::12::94115\n', '208::M::35::17::55432\n', '209::M::35::1::90048\n', '210::F::1::10::25801\n', '211::M::45::17::90620\n', '212::M::25::16::53714\n', '213::F::18::4::01609\n', '214::M::18::20::80218\n', '215::M::35::14::92075\n', '216::M::45::13::52761\n', '217::M::18::4::22903\n', '218::M::35::14::95822\n', '219::F::25::4::55113\n', '220::M::25::12::22903\n', '221::F::25::0::94063\n', '222::M::25::1::55116\n', '223::M::25::17::28262\n', '224::F::18::4::14850\n', '225::M::25::7::11215\n', '226::M::35::1::94518\n', '227::M::35::20::90291\n', '228::M::25::15::55455\n', '229::M::18::10::04576\n', '230::M::45::1::43210\n', '231::M::25::3::55455\n', '232::M::25::20::55408\n', '233::F::45::20::37919-4204\n', '234::M::35::7::39652\n', '235::M::25::0::98153\n', '236::M::25::5::55126\n', '237::M::25::6::46835\n', '238::F::50::7::90291\n', '239::M::18::19::94618\n', '240::M::50::17::55113\n', '241::M::35::20::55121\n', '242::F::18::4::53706\n', '243::M::25::16::11576\n', '244::M::50::7::19072\n', '245::M::35::16::66046\n', '246::F::18::4::60625\n', '247::M::25::17::94404\n', '248::M::18::17::72703\n', '249::F::18::14::48126\n', '250::M::35::16::11229\n', '251::M::56::17::55105\n', '252::M::25::12::94112\n', '253::F::25::11::97370\n', '254::M::35::17::49015\n', '255::M::35::0::85310\n', '256::M::45::16::55076\n', '257::M::18::18::55113\n', '258::M::25::7::55436\n', '259::M::25::1::60615\n', '260::M::18::19::98126\n', '261::M::25::20::75801\n', '262::F::25::1::68503\n', '263::F::25::7::22304\n', '264::M::35::0::20755\n', '265::F::35::7::55116\n', '266::M::35::11::75229\n', '267::M::45::12::55001\n', '268::F::18::12::29708\n', '269::M::25::2::55408\n', '270::M::50::14::55414\n', '271::M::18::4::48202\n', '272::M::18::0::80302\n', '273::M::18::4::55427\n', '274::M::18::4::55455\n', '275::M::25::0::48162\n', '276::M::45::16::01982\n', '277::F::35::1::98126\n', '278::M::45::18::60482\n', '279::M::25::14::91214\n', '280::M::50::7::19118\n', '281::F::35::0::94117\n', '282::M::25::17::94401\n', '283::M::25::0::10003\n', '284::M::25::12::91910\n', '285::F::35::0::94109\n', '286::M::25::1::54601\n', '287::M::50::13::94706\n', '288::F::25::12::97119\n', '289::F::25::0::11801\n', '290::F::25::20::94591\n', '291::M::35::12::55110\n', '292::M::35::7::19406\n', '293::M::56::1::55337-4056\n', '294::F::45::9::43147\n', '295::M::18::0::80203\n', '296::M::50::5::89432\n', '297::M::18::2::97211\n', '298::F::18::4::19010\n', '299::M::25::12::97370\n', '300::M::25::7::78664\n', '301::M::18::4::61820\n', '302::M::18::4::04901\n', '303::M::25::7::20006\n', '304::M::25::0::55414\n', '305::F::18::0::55414\n', '306::M::18::0::53051\n', '307::M::50::16::90027\n', '308::M::25::2::10025\n', '309::M::25::4::16801\n', '310::F::18::4::31207\n', '311::M::18::4::31201\n', '312::F::50::2::22207\n', '313::F::18::4::02138\n', '314::F::56::9::46911\n', '315::F::56::1::55105\n', '316::M::56::13::90740\n', '317::M::35::7::38555\n', '318::F::56::13::55104\n', '319::F::50::6::33436\n', '320::M::35::6::99516\n', '321::M::18::4::55128\n', '322::M::56::17::55117\n', '323::M::45::12::53716\n', '324::M::35::17::55106\n', '325::F::50::3::55112\n', '326::M::50::11::25302\n', '327::M::35::18::55448\n', '328::M::35::12::80401\n', '329::M::35::7::02115\n', '330::M::56::7::92065\n', '331::M::25::7::55902\n', '332::M::50::1::55109\n', '333::M::35::2::55410\n', '334::F::56::2::55113\n', '335::M::35::18::55434\n', '336::M::18::0::98765\n', '337::M::18::19::80205\n', '338::M::35::7::55116\n', '339::M::50::7::80207\n', '340::F::25::3::28001\n', '341::F::56::13::92119\n', '342::M::18::12::55076\n', '343::F::35::3::55127\n', '344::M::35::14::75034\n', '345::M::25::12::94114\n', '346::F::25::0::55110\n', '347::F::56::1::55305\n', '348::M::50::7::55110\n', '349::M::1::10::08035\n', '350::M::45::20::08035\n', '351::M::18::4::55105\n', '352::M::18::4::60115\n', '353::F::35::7::92625\n', '354::M::35::1::55117\n', '355::M::18::3::55107\n', '356::M::56::20::55101\n', '357::M::18::0::98103\n', '358::F::56::1::20815\n', '359::M::25::17::55128\n', '360::M::35::17::28208\n', '361::F::25::14::94115\n', '362::M::35::20::55407\n', '363::M::18::10::55419\n', '364::F::35::1::46815\n', '365::F::18::4::02138\n', '366::M::50::15::55126\n', '367::M::50::12::55421\n', '368::M::25::0::90293\n', '369::M::35::1::55110\n', '370::M::18::17::22304\n', '371::M::18::4::02141\n', '372::F::18::4::72227\n', '373::F::25::2::55347\n', '374::F::35::14::55346\n', '375::M::25::2::55106\n', '376::M::35::1::80026\n', '377::M::25::17::55418\n', '378::F::18::0::55105\n', '379::F::35::1::54822\n', '380::M::25::2::92024\n', '381::M::35::17::89015\n', '382::F::35::20::66205\n', '383::F::25::7::78757\n', '384::M::25::11::55075\n', '385::M::25::6::68131\n', '386::M::25::0::55408\n', '387::F::35::7::55111\n', '388::F::25::0::10021\n', '389::M::25::6::68128\n', '390::M::25::4::55405\n', '391::M::45::11::22122\n', '392::M::18::7::20037\n', '393::M::35::17::55402\n', '394::M::18::0::55013\n', '395::M::18::5::55104\n', '396::M::25::1::56187\n', '397::M::35::17::22124\n', '398::M::25::17::55454\n', '399::F::35::6::55128\n', '400::F::18::3::55422\n', '401::M::18::0::55129\n', '402::M::25::11::55427\n', '403::M::18::4::02138\n', '404::M::18::4::10128\n', '405::M::56::1::13077\n', '406::M::25::20::55105\n', '407::M::18::17::89503\n', '408::M::25::11::02143\n', '409::M::18::12::55122\n', '410::F::25::1::55417\n', '411::F::45::1::43214\n', '412::M::35::15::55117\n', '413::M::25::11::55409\n', '414::M::25::0::55317\n', '415::F::35::0::55406\n', '416::M::45::14::55076\n', '417::F::25::0::50613\n', '418::F::25::3::54016\n', '419::M::18::3::55422\n', '420::M::35::1::55406\n', '421::F::45::3::55125\n', '422::M::56::17::55104\n', '423::M::18::4::55455\n', '424::M::25::17::55112\n', '425::M::25::12::55303\n', '426::M::18::4::55455\n', '427::M::35::12::55104\n', '428::F::18::4::55455\n', '429::M::18::0::54901\n', '430::F::18::10::55306\n', '431::M::18::10::55303\n', '432::M::45::16::55306\n', '433::M::50::6::55115\n', '434::F::45::3::98155\n', '435::M::25::7::55125\n', '436::M::18::4::43023\n', '437::M::35::17::55030\n', '438::M::18::11::53705\n', '439::M::35::14::55129\n', '440::M::56::1::32940\n', '441::M::35::1::55127\n', '442::M::25::1::55105\n', '443::M::25::3::55421\n', '444::M::56::0::55108\n', '445::M::45::12::55117\n', '446::F::50::0::55042\n', '447::F::45::11::55105\n', '448::M::25::17::80123\n', '449::M::25::7::85037\n', '450::M::45::1::24523\n', '451::M::56::13::54720\n', '452::M::50::17::55117\n', '453::M::18::4::55102\n', '454::M::25::20::55092\n', '455::F::35::2::55113\n', '456::M::35::0::55105\n', '457::M::18::4::54703\n', '458::M::50::16::55405-2546\n', '459::F::18::4::55105\n', '460::M::45::18::55313\n', '461::M::50::7::55075\n', '462::M::18::16::55416\n', '463::M::25::7::55105\n', '464::F::18::4::55455\n', '465::M::18::19::94523\n', '466::M::25::5::55405\n', '467::F::35::3::55075\n', '468::F::1::10::55082\n', '469::M::35::6::55122\n', '470::F::1::10::55068\n', '471::M::35::7::08904\n', '472::M::35::0::55418\n', '473::F::18::4::55112\n', '474::M::25::17::92126\n', '475::F::25::2::55421\n', '476::M::35::0::55127\n', '477::M::35::14::55410\n', '478::M::50::16::55113\n', '479::M::25::12::55042\n', '480::F::18::4::55422\n', '481::M::45::7::55115\n', '482::M::25::14::55305\n', '483::M::18::12::55105\n', '484::F::1::10::55104\n', '485::M::56::7::55042\n', '486::M::56::0::91367\n', '487::F::35::17::55082\n', '488::M::25::12::55107\n', '489::M::18::4::55455\n', '490::M::1::10::55345\n', '491::M::18::4::56043\n', '492::M::25::12::55112\n', '493::M::50::7::55016\n', '494::F::35::0::17870\n', '495::M::18::10::55421\n', '496::M::18::4::55455\n', '497::F::25::17::55412\n', '498::M::35::17::55113\n', '499::F::25::1::55108\n', '500::F::18::2::55105\n', '501::M::25::2::55372\n', '502::M::35::6::55126\n', '503::M::35::11::73120\n', '504::M::25::2::17003\n', '505::M::35::17::37815\n', '506::M::25::16::55103-1006\n', '507::F::25::0::55405\n', '508::M::25::12::55418\n', '509::M::25::2::55125\n', '510::M::18::12::55109\n', '511::F::45::4::15232\n', '512::M::35::0::55379\n', '513::M::25::0::55119\n', '514::M::25::2::55113\n', '515::M::25::1::55406\n', '516::F::56::14::55033\n', '517::F::25::14::55408\n', '518::F::35::12::75240\n', '519::F::35::14::55038\n', '520::F::35::20::55104\n', '521::M::56::7::55105\n', '522::M::25::12::55124\n', '523::M::50::7::55105\n', '524::M::18::0::91320\n', '525::M::35::6::19027\n', '526::M::45::17::94806\n', '527::F::25::2::11201\n', '528::F::18::17::83843\n', '529::M::35::12::92009\n', '530::M::25::2::10019\n', '531::F::18::14::22206\n', '532::M::25::7::94301\n', '533::M::25::12::27514\n', '534::M::25::15::55902\n', '535::M::35::6::95370\n', '536::M::25::20::01267\n', '537::M::45::14::07704\n', '538::M::56::16::95407\n', '539::M::25::2::55103\n', '540::M::18::1::15213\n', '541::F::18::4::5849574\n', '542::M::18::4::78705\n', '543::M::25::5::55057\n', '544::M::35::12::94538\n', '545::M::35::17::01890\n', '546::F::25::0::37211\n', '547::M::35::12::76109\n', '548::F::35::16::96860\n', '549::M::25::6::53217\n', '550::M::45::8::21559\n', '551::M::35::20::55116\n', '552::M::50::18::23456\n', '553::M::25::2::94131\n', '554::M::25::12::94086\n', '555::M::18::4::53213\n', '556::F::25::9::37221\n', '557::M::56::1::30030\n', '558::M::35::20::55108\n', '559::F::25::7::60422\n', '560::M::45::15::81335\n', '561::F::18::14::64060\n', '562::M::35::20::48083\n', '563::M::25::4::53703\n', '564::M::45::1::49419\n', '565::M::25::16::45242\n', '566::M::25::17::92122\n', '567::M::35::20::52570-9634\n', '568::F::50::17::19716\n', '569::F::18::4::97339\n', '570::M::25::4::33314\n', '571::M::50::5::95401\n', '572::M::18::4::61801\n', '573::F::35::2::98119\n', '574::M::25::19::90214\n', '575::M::25::12::92130\n', '576::F::45::15::20910\n', '577::M::35::0::02115\n', '578::M::18::17::90064\n', '579::M::25::5::32839\n', '580::M::1::10::08534\n', '581::M::50::14::73543\n', '582::M::18::4::67042\n', '583::F::25::0::48067\n', '584::F::25::0::94403\n', '585::M::18::1::53703\n', '586::M::35::14::53092\n', '587::M::25::20::92649\n', '588::F::25::11::23220\n', '589::M::18::2::90210\n', '590::F::35::6::98032\n', '591::M::25::0::76201\n', '592::M::18::0::92103\n', '593::F::50::1::91711\n', '594::F::56::13::60076\n', '595::M::25::7::10019\n', '596::F::25::1::01950\n', '597::M::35::12::80206\n', '598::M::35::7::95476\n', '599::M::50::6::53711\n', '600::M::35::17::66209\n', '601::F::18::20::06320\n', '602::F::56::6::14612\n', '603::F::25::6::32256\n', '604::M::45::17::32256\n', '605::F::18::4::44425\n', '606::F::1::10::49507\n', '607::M::25::0::43614\n', '608::M::18::4::18011\n', '609::M::25::7::10012\n', '610::M::25::4::77025\n', '611::M::35::0::20715\n', '612::M::50::7::95020\n', '613::M::35::20::10562\n', '614::M::35::1::90024\n', '615::M::50::17::32951\n', '616::M::25::7::94115\n', '617::F::25::7::92117\n', '618::M::25::0::74105\n', '619::F::18::1::94706\n', '620::M::18::0::13210\n', '621::M::18::4::93560\n', '622::M::25::0::92612\n', '623::M::25::17::60555\n', '624::M::25::1::75207\n', '625::F::35::12::44106\n', '626::M::18::4::77005\n', '627::M::50::2::97210\n', '628::F::35::0::20715\n', '629::F::1::10::48154\n', '630::F::35::14::80525\n', '631::M::35::17::28601\n', '632::M::45::17::07649\n', '633::M::25::17::94131\n', '634::F::1::10::49512\n', '635::M::56::17::33785\n', '636::M::18::20::92055\n', '637::M::35::12::97132\n', '638::M::25::6::77584\n', '639::M::45::17::85202\n', '640::M::18::4::47406\n', '641::F::35::0::55116\n', '642::F::35::1::78155\n', '643::M::25::2::23188\n', '644::M::25::20::93704\n', '645::M::35::20::11215\n', '646::M::35::7::08505\n', '647::M::35::12::85224\n', '648::F::25::12::94131\n', '649::M::25::4::80526\n', '650::M::25::17::49445\n', '651::M::35::7::20895\n', '652::M::18::20::12222\n', '653::M::56::13::92660\n', '654::M::50::7::75023\n', '655::F::25::2::92629\n', '656::F::45::16::92106\n', '657::F::50::13::34691\n', '658::F::25::0::10021\n', '659::F::25::0::28134\n', '660::M::45::16::70507\n', '661::M::45::1::13114\n', '662::M::18::4::99163\n', '663::F::25::7::12345\n', '664::M::35::18::74601\n', '665::M::18::2::13317\n', '666::F::18::0::08540\n', '667::M::25::18::98438\n', '668::F::25::3::22042\n', '669::M::25::17::30307\n', '670::M::25::12::30303\n', '671::M::18::4::61761\n', '672::M::35::7::46845\n', '673::M::25::20::10023\n', '674::M::25::17::22042\n', '675::M::25::20::02048\n', '676::M::18::16::55129\n', '677::M::50::2::94122\n', '678::M::25::0::34952\n', '679::M::35::16::29681\n', '680::M::25::12::37013\n', '681::M::35::7::30501\n', '682::M::25::4::27510\n', '683::M::25::4::27514\n', '684::M::25::4::27510\n', '685::M::25::4::27514\n', '686::M::18::4::27514\n', '687::F::25::7::14136\n', '688::M::56::13::13407\n', '689::F::50::0::48103\n', '690::M::25::12::90717\n', '691::M::35::11::96813\n', '692::F::25::0::85207\n', '693::M::35::12::98103\n', '694::M::18::4::76401\n', '695::F::18::1::04240\n', '696::M::25::12::94114\n', '697::F::50::6::97330\n', '698::M::18::4::76401\n', '699::M::18::0::73505\n', '700::M::50::17::14650\n', '701::F::18::4::01002\n', '702::M::35::7::90808\n', '703::F::56::1::44074\n', '704::F::35::8::85712\n', '705::M::25::4::48065\n', '706::M::35::20::78664\n', '707::M::35::7::17740\n', '708::M::25::0::37042\n', '709::M::18::0::92647\n', '710::M::25::20::85207\n', '711::M::25::18::80521\n', '712::M::25::0::95136\n', '713::M::35::7::79912\n', '714::M::18::4::76013\n', '715::M::18::0::53715\n', '716::M::18::4::98188\n', '717::M::25::11::90211\n', '718::M::18::2::90211\n', '719::M::1::0::75070\n', '720::M::18::0::55129\n', '721::F::35::9::97520\n', '722::M::25::12::55106\n', '723::M::25::12::98027\n', '724::M::50::13::90266\n', '725::M::56::11::08904\n', '726::F::25::9::77449\n', '727::M::35::11::94061\n', '728::M::25::6::11598\n', '729::M::35::12::85284\n', '730::M::25::0::10580\n', '731::M::45::3::55410\n', '732::M::25::7::07030\n', '733::M::35::16::30034\n', '734::M::25::7::90078\n', '735::M::25::14::04240\n', '736::M::18::12::07070\n', '737::M::1::19::53711\n', '738::M::18::17::99205\n', '739::M::35::12::55803\n', '740::M::25::14::24551\n', '741::M::25::5::44095\n', '742::M::18::0::54758\n', '743::F::1::2::60660\n', '744::M::25::17::77007\n', '745::M::1::10::90004\n', '746::F::18::19::95019\n', '747::M::18::4::98606\n', '748::M::25::0::60098\n', '749::M::35::18::56303\n', '750::F::35::9::13066\n', '751::F::25::17::97201\n', '752::F::25::3::97401\n', '753::M::1::10::42754\n', '754::M::35::7::38024\n', '755::F::35::0::94002\n', '756::M::35::0::90064\n', '757::M::25::12::97401\n', '758::M::35::7::19518\n', '759::F::56::3::89156\n', '760::M::56::15::94114\n', '761::M::18::7::99945\n', '762::M::1::10::63041\n', '763::M::18::10::02882\n', '764::M::18::20::85296\n', '765::M::25::17::74467\n', '766::F::25::7::95128\n', '767::M::25::12::45241\n', '768::M::25::12::17067\n', '769::M::45::17::02421\n', '770::M::18::4::98499\n', '771::F::50::1::75087\n', '772::M::18::12::89118\n', '773::M::25::6::48108\n', '774::M::18::4::22903\n', '775::M::1::17::56258\n', '776::F::25::3::92627\n', '777::M::18::19::87543\n', '778::M::18::17::32694\n', '779::M::25::0::60156\n', '780::M::25::17::91776\n', '781::M::18::4::95076\n', '782::M::56::1::28223\n', '783::M::25::4::53703\n', '784::M::18::0::11040\n', '785::M::18::19::29307\n', '786::M::25::0::55987\n', '787::M::18::4::77802\n', '788::M::25::0::37601\n', '789::M::18::1::67212\n', '790::M::25::17::45694\n', '791::M::35::7::27511\n', '792::M::25::17::01002\n', '793::F::35::14::37043\n', '794::M::50::20::10543\n', '795::F::35::14::19147\n', '796::M::50::6::98237\n', '797::F::50::7::20175\n', '798::F::25::20::48464\n', '799::F::25::5::98498\n', '800::F::35::12::72032\n', '801::F::25::20::95776\n', '802::M::25::16::22801\n', '803::M::18::4::67217\n', '804::M::25::20::90032\n', '805::M::35::12::37211\n', '806::M::35::0::34990\n', '807::M::18::4::89015\n', '808::M::25::7::85226\n', '809::M::25::17::70508\n', '810::M::25::12::98006\n', '811::M::50::0::94043\n', '812::M::25::6::37930\n', '813::M::18::4::70002\n', '814::F::18::4::92612\n', '815::M::25::4::37130\n', '816::M::45::0::92108\n', '817::M::18::4::41076\n', '818::M::18::4::60107\n', '819::M::35::7::60611\n', '820::M::35::7::78759\n', '821::M::18::2::14456\n', '822::F::25::1::07666\n', '823::M::25::0::92078\n', '824::M::35::12::94530\n', '825::M::56::1::39532\n', '826::F::35::1::80210\n', '827::M::45::2::55409\n', '828::M::18::4::70817\n', '829::M::1::19::53711\n', '830::M::35::7::60657\n', '831::F::45::0::20874\n', '832::M::18::2::19464\n', '833::M::35::7::46825\n', '834::F::35::12::78640\n', '835::M::56::0::94952\n', '836::M::25::11::10023\n', '837::M::56::7::08540\n', '838::M::50::17::92661\n', '839::M::25::0::98103\n', '840::F::25::3::02828\n', '841::F::50::14::92119\n', '842::F::18::0::55102\n', '843::M::25::7::49321\n', '844::M::25::0::92121\n', '845::F::18::0::20009\n', '846::M::25::5::80209\n', '847::F::25::7::80228\n', '848::M::25::15::94121\n', '849::M::25::3::75006\n', '850::M::35::0::60640\n', '851::M::18::4::75287\n', '852::M::25::5::92105\n', '853::M::25::17::55447\n', '854::F::25::16::44092\n', '855::F::18::2::72701\n', '856::F::45::6::02453\n', '857::M::35::17::11701\n', '858::M::35::14::10013\n', '859::M::25::12::10128\n', '860::M::18::0::78237\n', '861::M::25::7::37027\n', '862::M::35::7::10019\n', '863::M::25::7::62522\n', '864::F::25::0::97205\n', '865::M::35::1::91711\n', '866::M::1::10::08820\n', '867::M::45::12::95814\n', '868::M::50::17::01702-7224\n', '869::M::18::20::92026\n', '870::F::25::1::02144\n', '871::M::1::10::76013\n', '872::M::50::1::20815\n', '873::M::56::6::19027\n', '874::M::45::15::26105\n', '875::M::1::10::94707\n', '876::M::25::4::43202\n', '877::M::25::0::90631\n', '878::M::18::2::01720\n', '879::M::45::16::92018\n', '880::M::35::17::94114\n', '881::M::18::14::76401\n', '882::M::18::2::01720\n', '883::F::35::14::92673\n', '884::M::1::10::49454\n', '885::M::35::0::48105\n', '886::F::56::0::14830\n', '887::F::56::16::55345\n', '888::F::1::10::08820\n', '889::M::45::20::10024\n', '890::M::18::4::92153\n', '891::M::25::1::80202\n', '892::M::35::0::76031\n', '893::F::50::6::85016\n', '894::M::18::4::31602\n', '895::M::50::1::34683\n', '896::M::18::15::94015\n', '897::M::25::6::92130\n', '898::M::25::7::77005\n', '899::F::25::1::30605\n', '900::F::56::13::90066\n', '901::F::56::0::92027\n', '902::M::56::0::45432\n', '903::M::56::13::75165\n', '904::M::35::20::92308\n', '905::F::45::12::28655\n', '906::M::1::10::71106\n', '907::F::45::6::92117\n', '908::F::50::1::92103\n', '909::M::45::7::92831\n', '910::F::50::0::98226\n', '911::M::50::0::98226\n', '912::F::35::2::91324\n', '913::M::25::0::20744-6223\n', '914::F::35::9::30338\n', '915::M::25::7::72762\n', '916::F::25::1::92084\n', '917::M::50::6::92117\n', '918::F::45::1::91901\n', '919::F::35::1::92056\n', '920::M::18::4::92173\n', '921::M::25::17::92084\n', '922::M::56::16::48009\n', '923::M::25::7::37212\n', '924::M::50::17::98642\n', '925::F::35::3::48073\n', '926::M::1::10::07869\n', '927::F::25::20::91105\n', '928::F::56::16::98199\n', '929::M::25::15::53706\n', '930::M::25::5::98380\n', '931::F::56::1::06032\n', '932::F::35::6::97838\n', '933::M::45::1::49419\n', '934::F::35::7::97401\n', '935::M::35::14::60538\n', '936::M::18::4::34567\n', '937::M::25::15::60513\n', '938::F::25::1::01060\n', '939::F::25::20::20110-5616\n', '940::M::1::10::76240\n', '941::M::18::20::45387\n', '942::F::50::6::85716\n', '943::F::45::12::08033\n', '944::M::35::0::91505\n', '945::M::45::7::49345\n', '946::M::35::7::48103-8929\n', '947::M::18::0::21015\n', '948::M::56::12::43056\n', '949::M::50::17::96753\n', '950::M::35::7::21044\n', '951::M::45::2::10009\n', '952::M::56::16::97478\n', '953::M::50::20::97030\n', '954::F::25::0::21030\n', '955::F::50::7::94941\n', '956::M::35::1::55104\n', '957::M::35::1::29706\n', '958::M::35::2::48116\n', '959::M::50::11::44622\n', '960::M::1::10::45244\n', '961::M::18::10::10016\n', '962::F::25::1::80020\n', '963::M::25::0::48197\n', '964::M::25::7::94123\n', '965::M::56::7::10954\n', '966::M::56::2::92262\n', '967::F::18::0::43615\n', '968::M::50::12::94530\n', '969::F::25::11::19103\n', '970::M::25::0::27615\n', '971::M::25::12::43615\n', '972::M::45::1::02138\n', '973::F::25::1::80026\n', '974::M::35::19::94930\n', '975::M::35::0::98136\n', '976::M::35::14::89113\n', '977::M::25::2::80110\n', '978::M::18::0::19116\n', '979::M::1::10::48073\n', '980::M::25::6::92014\n', '981::M::25::20::02141\n', '982::F::25::9::92064\n', '983::F::25::16::99224\n', '984::M::50::16::92129\n', '985::M::25::4::32608\n', '986::F::56::0::19004\n', '987::F::35::17::48098\n', '988::M::50::11::48823\n', '989::M::50::0::20706\n', '990::M::18::6::10004\n', '991::F::25::9::48103\n', '992::F::35::3::02780\n', '993::M::25::0::45678\n', '994::M::18::2::92109\n', '995::F::18::4::96803\n', '996::M::25::17::98102\n', '997::M::1::19::15748\n', '998::M::45::20::10019\n', '999::M::25::15::62558\n', '1000::F::25::6::90027\n', ...]
import pandas as pd
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('https://raw.githubusercontent.com/pydata/pydata-book/master/ch02/movielens/users.dat',
sep='::', header=None, names=unames)
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('https://raw.githubusercontent.com/pydata/pydata-book/master/ch02/movielens/ratings.dat',
sep='::', header=None, names=rnames)
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('https://raw.githubusercontent.com/pydata/pydata-book/master/ch02/movielens/movies.dat',
sep='::', header=None, names=mnames)
users[:]
user_id | gender | age | occupation | zip | |
---|---|---|---|---|---|
0 | 1 | F | 1 | 10 | 48067 |
1 | 2 | M | 56 | 16 | 70072 |
2 | 3 | M | 25 | 15 | 55117 |
3 | 4 | M | 45 | 7 | 02460 |
4 | 5 | M | 25 | 20 | 55455 |
5 | 6 | F | 50 | 9 | 55117 |
6 | 7 | M | 35 | 1 | 06810 |
7 | 8 | M | 25 | 12 | 11413 |
8 | 9 | M | 25 | 17 | 61614 |
9 | 10 | F | 35 | 1 | 95370 |
10 | 11 | F | 25 | 1 | 04093 |
11 | 12 | M | 25 | 12 | 32793 |
12 | 13 | M | 45 | 1 | 93304 |
13 | 14 | M | 35 | 0 | 60126 |
14 | 15 | M | 25 | 7 | 22903 |
15 | 16 | F | 35 | 0 | 20670 |
16 | 17 | M | 50 | 1 | 95350 |
17 | 18 | F | 18 | 3 | 95825 |
18 | 19 | M | 1 | 10 | 48073 |
19 | 20 | M | 25 | 14 | 55113 |
20 | 21 | M | 18 | 16 | 99353 |
21 | 22 | M | 18 | 15 | 53706 |
22 | 23 | M | 35 | 0 | 90049 |
23 | 24 | F | 25 | 7 | 10023 |
24 | 25 | M | 18 | 4 | 01609 |
25 | 26 | M | 25 | 7 | 23112 |
26 | 27 | M | 25 | 11 | 19130 |
27 | 28 | F | 25 | 1 | 14607 |
28 | 29 | M | 35 | 7 | 33407 |
29 | 30 | F | 35 | 7 | 19143 |
... | ... | ... | ... | ... | ... |
6010 | 6011 | M | 35 | 15 | 80538 |
6011 | 6012 | M | 35 | 15 | 02871 |
6012 | 6013 | F | 25 | 20 | 32301 |
6013 | 6014 | M | 45 | 1 | 80634 |
6014 | 6015 | F | 25 | 9 | 80013 |
6015 | 6016 | M | 45 | 1 | 37209 |
6016 | 6017 | F | 35 | 7 | 21117 |
6017 | 6018 | M | 35 | 1 | 48906 |
6018 | 6019 | M | 25 | 0 | 10024 |
6019 | 6020 | M | 50 | 16 | 10023 |
6020 | 6021 | M | 25 | 12 | 08876 |
6021 | 6022 | M | 25 | 17 | 57006 |
6022 | 6023 | M | 25 | 0 | 43213 |
6023 | 6024 | M | 25 | 12 | 53705 |
6024 | 6025 | F | 25 | 1 | 32607 |
6025 | 6026 | M | 35 | 6 | 11210 |
6026 | 6027 | M | 18 | 4 | 20742 |
6027 | 6028 | M | 18 | 4 | 94133 |
6028 | 6029 | F | 25 | 1 | 23185 |
6029 | 6030 | M | 25 | 17 | 32618 |
6030 | 6031 | F | 18 | 0 | 45123 |
6031 | 6032 | M | 45 | 7 | 55108 |
6032 | 6033 | M | 50 | 13 | 78232 |
6033 | 6034 | M | 25 | 14 | 94117 |
6034 | 6035 | F | 25 | 1 | 78734 |
6035 | 6036 | F | 25 | 15 | 32603 |
6036 | 6037 | F | 45 | 1 | 76006 |
6037 | 6038 | F | 56 | 1 | 14706 |
6038 | 6039 | F | 45 | 0 | 01060 |
6039 | 6040 | M | 25 | 6 | 11106 |
6040 rows × 5 columns
ratings[:]
user_id | movie_id | rating | timestamp | |
---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 |
1 | 1 | 661 | 3 | 978302109 |
2 | 1 | 914 | 3 | 978301968 |
3 | 1 | 3408 | 4 | 978300275 |
4 | 1 | 2355 | 5 | 978824291 |
5 | 1 | 1197 | 3 | 978302268 |
6 | 1 | 1287 | 5 | 978302039 |
7 | 1 | 2804 | 5 | 978300719 |
8 | 1 | 594 | 4 | 978302268 |
9 | 1 | 919 | 4 | 978301368 |
10 | 1 | 595 | 5 | 978824268 |
11 | 1 | 938 | 4 | 978301752 |
12 | 1 | 2398 | 4 | 978302281 |
13 | 1 | 2918 | 4 | 978302124 |
14 | 1 | 1035 | 5 | 978301753 |
15 | 1 | 2791 | 4 | 978302188 |
16 | 1 | 2687 | 3 | 978824268 |
17 | 1 | 2018 | 4 | 978301777 |
18 | 1 | 3105 | 5 | 978301713 |
19 | 1 | 2797 | 4 | 978302039 |
20 | 1 | 2321 | 3 | 978302205 |
21 | 1 | 720 | 3 | 978300760 |
22 | 1 | 1270 | 5 | 978300055 |
23 | 1 | 527 | 5 | 978824195 |
24 | 1 | 2340 | 3 | 978300103 |
25 | 1 | 48 | 5 | 978824351 |
26 | 1 | 1097 | 4 | 978301953 |
27 | 1 | 1721 | 4 | 978300055 |
28 | 1 | 1545 | 4 | 978824139 |
29 | 1 | 745 | 3 | 978824268 |
... | ... | ... | ... | ... |
1000179 | 6040 | 2762 | 4 | 956704584 |
1000180 | 6040 | 1036 | 3 | 956715455 |
1000181 | 6040 | 508 | 4 | 956704972 |
1000182 | 6040 | 1041 | 4 | 957717678 |
1000183 | 6040 | 3735 | 4 | 960971654 |
1000184 | 6040 | 2791 | 4 | 956715569 |
1000185 | 6040 | 2794 | 1 | 956716438 |
1000186 | 6040 | 527 | 5 | 956704219 |
1000187 | 6040 | 2003 | 1 | 956716294 |
1000188 | 6040 | 535 | 4 | 964828734 |
1000189 | 6040 | 2010 | 5 | 957716795 |
1000190 | 6040 | 2011 | 4 | 956716113 |
1000191 | 6040 | 3751 | 4 | 964828782 |
1000192 | 6040 | 2019 | 5 | 956703977 |
1000193 | 6040 | 541 | 4 | 956715288 |
1000194 | 6040 | 1077 | 5 | 964828799 |
1000195 | 6040 | 1079 | 2 | 956715648 |
1000196 | 6040 | 549 | 4 | 956704746 |
1000197 | 6040 | 2020 | 3 | 956715288 |
1000198 | 6040 | 2021 | 3 | 956716374 |
1000199 | 6040 | 2022 | 5 | 956716207 |
1000200 | 6040 | 2028 | 5 | 956704519 |
1000201 | 6040 | 1080 | 4 | 957717322 |
1000202 | 6040 | 1089 | 4 | 956704996 |
1000203 | 6040 | 1090 | 3 | 956715518 |
1000204 | 6040 | 1091 | 1 | 956716541 |
1000205 | 6040 | 1094 | 5 | 956704887 |
1000206 | 6040 | 562 | 5 | 956704746 |
1000207 | 6040 | 1096 | 4 | 956715648 |
1000208 | 6040 | 1097 | 4 | 956715569 |
1000209 rows × 4 columns
movies[:]
movie_id | title | genres | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Animation|Children's|Comedy |
1 | 2 | Jumanji (1995) | Adventure|Children's|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
5 | 6 | Heat (1995) | Action|Crime|Thriller |
6 | 7 | Sabrina (1995) | Comedy|Romance |
7 | 8 | Tom and Huck (1995) | Adventure|Children's |
8 | 9 | Sudden Death (1995) | Action |
9 | 10 | GoldenEye (1995) | Action|Adventure|Thriller |
10 | 11 | American President, The (1995) | Comedy|Drama|Romance |
11 | 12 | Dracula: Dead and Loving It (1995) | Comedy|Horror |
12 | 13 | Balto (1995) | Animation|Children's |
13 | 14 | Nixon (1995) | Drama |
14 | 15 | Cutthroat Island (1995) | Action|Adventure|Romance |
15 | 16 | Casino (1995) | Drama|Thriller |
16 | 17 | Sense and Sensibility (1995) | Drama|Romance |
17 | 18 | Four Rooms (1995) | Thriller |
18 | 19 | Ace Ventura: When Nature Calls (1995) | Comedy |
19 | 20 | Money Train (1995) | Action |
20 | 21 | Get Shorty (1995) | Action|Comedy|Drama |
21 | 22 | Copycat (1995) | Crime|Drama|Thriller |
22 | 23 | Assassins (1995) | Thriller |
23 | 24 | Powder (1995) | Drama|Sci-Fi |
24 | 25 | Leaving Las Vegas (1995) | Drama|Romance |
25 | 26 | Othello (1995) | Drama |
26 | 27 | Now and Then (1995) | Drama |
27 | 28 | Persuasion (1995) | Romance |
28 | 29 | City of Lost Children, The (1995) | Adventure|Sci-Fi |
29 | 30 | Shanghai Triad (Yao a yao yao dao waipo qiao) ... | Drama |
... | ... | ... | ... |
3853 | 3923 | Return of the Fly (1959) | Horror|Sci-Fi |
3854 | 3924 | Pajama Party (1964) | Comedy |
3855 | 3925 | Stranger Than Paradise (1984) | Comedy |
3856 | 3926 | Voyage to the Bottom of the Sea (1961) | Adventure|Sci-Fi |
3857 | 3927 | Fantastic Voyage (1966) | Adventure|Sci-Fi |
3858 | 3928 | Abbott and Costello Meet Frankenstein (1948) | Comedy|Horror |
3859 | 3929 | Bank Dick, The (1940) | Comedy |
3860 | 3930 | Creature From the Black Lagoon, The (1954) | Horror |
3861 | 3931 | Giant Gila Monster, The (1959) | Horror|Sci-Fi |
3862 | 3932 | Invisible Man, The (1933) | Horror|Sci-Fi |
3863 | 3933 | Killer Shrews, The (1959) | Horror|Sci-Fi |
3864 | 3934 | Kronos (1957) | Sci-Fi |
3865 | 3935 | Kronos (1973) | Horror |
3866 | 3936 | Phantom of the Opera, The (1943) | Drama|Thriller |
3867 | 3937 | Runaway (1984) | Sci-Fi|Thriller |
3868 | 3938 | Slumber Party Massacre, The (1982) | Horror |
3869 | 3939 | Slumber Party Massacre II, The (1987) | Horror |
3870 | 3940 | Slumber Party Massacre III, The (1990) | Horror |
3871 | 3941 | Sorority House Massacre (1986) | Horror |
3872 | 3942 | Sorority House Massacre II (1990) | Horror |
3873 | 3943 | Bamboozled (2000) | Comedy |
3874 | 3944 | Bootmen (2000) | Comedy|Drama |
3875 | 3945 | Digimon: The Movie (2000) | Adventure|Animation|Children's |
3876 | 3946 | Get Carter (2000) | Action|Drama|Thriller |
3877 | 3947 | Get Carter (1971) | Thriller |
3878 | 3948 | Meet the Parents (2000) | Comedy |
3879 | 3949 | Requiem for a Dream (2000) | Drama |
3880 | 3950 | Tigerland (2000) | Drama |
3881 | 3951 | Two Family House (2000) | Drama |
3882 | 3952 | Contender, The (2000) | Drama|Thriller |
3883 rows × 3 columns
type(movies)
pandas.core.frame.DataFrame
data = pd.merge(pd.merge(ratings, users), movies)
data[:10]
user_id | movie_id | rating | timestamp | gender | age | occupation | zip | title | genres | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 | F | 1 | 10 | 48067 | One Flew Over the Cuckoo's Nest (1975) | Drama |
1 | 2 | 1193 | 5 | 978298413 | M | 56 | 16 | 70072 | One Flew Over the Cuckoo's Nest (1975) | Drama |
2 | 12 | 1193 | 4 | 978220179 | M | 25 | 12 | 32793 | One Flew Over the Cuckoo's Nest (1975) | Drama |
3 | 15 | 1193 | 4 | 978199279 | M | 25 | 7 | 22903 | One Flew Over the Cuckoo's Nest (1975) | Drama |
4 | 17 | 1193 | 5 | 978158471 | M | 50 | 1 | 95350 | One Flew Over the Cuckoo's Nest (1975) | Drama |
5 | 18 | 1193 | 4 | 978156168 | F | 18 | 3 | 95825 | One Flew Over the Cuckoo's Nest (1975) | Drama |
6 | 19 | 1193 | 5 | 982730936 | M | 1 | 10 | 48073 | One Flew Over the Cuckoo's Nest (1975) | Drama |
7 | 24 | 1193 | 5 | 978136709 | F | 25 | 7 | 10023 | One Flew Over the Cuckoo's Nest (1975) | Drama |
8 | 28 | 1193 | 3 | 978125194 | F | 25 | 1 | 14607 | One Flew Over the Cuckoo's Nest (1975) | Drama |
9 | 33 | 1193 | 5 | 978557765 | M | 45 | 3 | 55421 | One Flew Over the Cuckoo's Nest (1975) | Drama |
data.ix[0]
#0번째 행의 정보
user_id 1 movie_id 1193 rating 5 timestamp 978300760 gender F age 1 occupation 10 zip 48067 title One Flew Over the Cuckoo's Nest (1975) genres Drama Name: 0, dtype: object
mean_ratings = data.pivot_table('rating', index='title', columns='gender', aggfunc='mean')
#책의 rows, cols가 index, colums로 바뀜
#aggfunc은 default값이 mean 이며, 값을 어떻게 나타낼지 결정하는 함수
mean_ratings[:5]
#성별에 따른 영화 평점 DataFrame으로 객체 생성
gender | F | M |
---|---|---|
title | ||
$1,000,000 Duck (1971) | 3.375000 | 2.761905 |
'Night Mother (1986) | 3.388889 | 3.352941 |
'Til There Was You (1997) | 2.675676 | 2.733333 |
'burbs, The (1989) | 2.793478 | 2.962085 |
...And Justice for All (1979) | 3.828571 | 3.689024 |
ratings_by_title = data.groupby('title').size()
ratings_by_title[:10]
title $1,000,000 Duck (1971) 37 'Night Mother (1986) 70 'Til There Was You (1997) 52 'burbs, The (1989) 303 ...And Justice for All (1979) 199 1-900 (1994) 2 10 Things I Hate About You (1999) 700 101 Dalmatians (1961) 565 101 Dalmatians (1996) 364 12 Angry Men (1957) 616 dtype: int64
active_titles = ratings_by_title.index[ratings_by_title >= 250]
active_titles
Index([u''burbs, The (1989)', u'10 Things I Hate About You (1999)', u'101 Dalmatians (1961)', u'101 Dalmatians (1996)', u'12 Angry Men (1957)', u'13th Warrior, The (1999)', u'2 Days in the Valley (1996)', u'20,000 Leagues Under the Sea (1954)', u'2001: A Space Odyssey (1968)', u'2010 (1984)', ... u'X-Men (2000)', u'Year of Living Dangerously (1982)', u'Yellow Submarine (1968)', u'You've Got Mail (1998)', u'Young Frankenstein (1974)', u'Young Guns (1988)', u'Young Guns II (1990)', u'Young Sherlock Holmes (1985)', u'Zero Effect (1998)', u'eXistenZ (1999)'], dtype='object', name=u'title', length=1216)
mean_ratings = mean_ratings.ix[active_titles]
mean_ratings[:10]
gender | F | M |
---|---|---|
title | ||
'burbs, The (1989) | 2.793478 | 2.962085 |
10 Things I Hate About You (1999) | 3.646552 | 3.311966 |
101 Dalmatians (1961) | 3.791444 | 3.500000 |
101 Dalmatians (1996) | 3.240000 | 2.911215 |
12 Angry Men (1957) | 4.184397 | 4.328421 |
13th Warrior, The (1999) | 3.112000 | 3.168000 |
2 Days in the Valley (1996) | 3.488889 | 3.244813 |
20,000 Leagues Under the Sea (1954) | 3.670103 | 3.709205 |
2001: A Space Odyssey (1968) | 3.825581 | 4.129738 |
2010 (1984) | 3.446809 | 3.413712 |
top_female_ratings = mean_ratings.sort_index(by='F', ascending = False)
top_female_ratings[:]
gender | F | M |
---|---|---|
title | ||
Close Shave, A (1995) | 4.644444 | 4.473795 |
Wrong Trousers, The (1993) | 4.588235 | 4.478261 |
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) | 4.572650 | 4.464589 |
Wallace & Gromit: The Best of Aardman Animation (1996) | 4.563107 | 4.385075 |
Schindler's List (1993) | 4.562602 | 4.491415 |
Shawshank Redemption, The (1994) | 4.539075 | 4.560625 |
Grand Day Out, A (1992) | 4.537879 | 4.293255 |
To Kill a Mockingbird (1962) | 4.536667 | 4.372611 |
Creature Comforts (1990) | 4.513889 | 4.272277 |
Usual Suspects, The (1995) | 4.513317 | 4.518248 |
It Happened One Night (1934) | 4.500000 | 4.163934 |
Rear Window (1954) | 4.484536 | 4.472991 |
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954) | 4.481132 | 4.576628 |
Sixth Sense, The (1999) | 4.477410 | 4.379944 |
Third Man, The (1949) | 4.466019 | 4.448276 |
Some Like It Hot (1959) | 4.462745 | 4.228769 |
City Lights (1931) | 4.452055 | 4.363636 |
Notorious (1946) | 4.448718 | 4.211073 |
Philadelphia Story, The (1940) | 4.446809 | 4.201729 |
Life Is Beautiful (La Vita � bella) (1997) | 4.422343 | 4.286624 |
Strangers on a Train (1951) | 4.414815 | 4.262248 |
Bicycle Thief, The (Ladri di biciclette) (1948) | 4.407407 | 4.343434 |
Mr. Smith Goes to Washington (1939) | 4.400000 | 4.183746 |
Thin Blue Line, The (1988) | 4.396552 | 4.213270 |
Killing Fields, The (1984) | 4.378049 | 4.211268 |
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963) | 4.376623 | 4.464789 |
Ran (1985) | 4.365079 | 4.248299 |
North by Northwest (1959) | 4.364458 | 4.390641 |
All About Eve (1950) | 4.363057 | 4.186992 |
Manchurian Candidate, The (1962) | 4.358824 | 4.326050 |
... | ... | ... |
Rocky III (1982) | 2.361702 | 2.943503 |
Congo (1995) | 2.347368 | 2.217021 |
House on Haunted Hill, The (1999) | 2.333333 | 2.299213 |
Spawn (1997) | 2.333333 | 2.660287 |
Blue Lagoon, The (1980) | 2.321739 | 2.337662 |
Nightmare on Elm Street Part 2: Freddy's Revenge, A (1985) | 2.319149 | 2.321586 |
Inspector Gadget (1999) | 2.316327 | 2.112727 |
Porky's (1981) | 2.296875 | 2.836364 |
Honey, I Blew Up the Kid (1992) | 2.295918 | 2.243816 |
Wild Wild West (1999) | 2.275449 | 2.131973 |
Ace Ventura: When Nature Calls (1995) | 2.269663 | 2.543333 |
Fly II, The (1989) | 2.257143 | 2.321285 |
Cable Guy, The (1996) | 2.250000 | 2.863787 |
Island of Dr. Moreau, The (1996) | 2.245283 | 2.191882 |
Grease 2 (1982) | 2.243478 | 1.792553 |
Superman IV: The Quest for Peace (1987) | 2.216216 | 1.847458 |
I Still Know What You Did Last Summer (1998) | 2.200000 | 2.204444 |
Eye of the Beholder (1999) | 2.181818 | 2.115789 |
Godzilla (1998) | 2.175439 | 2.343891 |
Super Mario Bros. (1993) | 2.163636 | 1.820339 |
Karate Kid III, The (1989) | 2.155556 | 2.116183 |
Teenage Mutant Ninja Turtles II: The Secret of the Ooze (1991) | 2.152174 | 2.121951 |
Striptease (1996) | 2.150000 | 2.193277 |
Howard the Duck (1986) | 2.074627 | 2.103542 |
Anaconda (1997) | 2.000000 | 2.248447 |
Avengers, The (1998) | 1.915254 | 2.017467 |
Speed 2: Cruise Control (1997) | 1.906667 | 1.863014 |
Rocky V (1990) | 1.878788 | 2.132780 |
Barb Wire (1996) | 1.585366 | 2.100386 |
Battlefield Earth (2000) | 1.574468 | 1.616949 |
1216 rows × 2 columns
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
#남성 평점과 여성평점 차이를 diff에 넣기
sorted_by_diff = mean_ratings.sort_index(by='diff')
sorted_by_diff[:10]
gender | F | M | diff |
---|---|---|---|
title | |||
Dirty Dancing (1987) | 3.790378 | 2.959596 | -0.830782 |
Jumpin' Jack Flash (1986) | 3.254717 | 2.578358 | -0.676359 |
Grease (1978) | 3.975265 | 3.367041 | -0.608224 |
Little Women (1994) | 3.870588 | 3.321739 | -0.548849 |
Steel Magnolias (1989) | 3.901734 | 3.365957 | -0.535777 |
Anastasia (1997) | 3.800000 | 3.281609 | -0.518391 |
Rocky Horror Picture Show, The (1975) | 3.673016 | 3.160131 | -0.512885 |
Color Purple, The (1985) | 4.158192 | 3.659341 | -0.498851 |
Age of Innocence, The (1993) | 3.827068 | 3.339506 | -0.487561 |
Free Willy (1993) | 2.921348 | 2.438776 | -0.482573 |
sorted_by_diff[::-1][:10]
#역순으로 상위 10개만 (남성들이 선호하는 순으로)
gender | F | M | diff |
---|---|---|---|
title | |||
Good, The Bad and The Ugly, The (1966) | 3.494949 | 4.221300 | 0.726351 |
Kentucky Fried Movie, The (1977) | 2.878788 | 3.555147 | 0.676359 |
Dumb & Dumber (1994) | 2.697987 | 3.336595 | 0.638608 |
Longest Day, The (1962) | 3.411765 | 4.031447 | 0.619682 |
Cable Guy, The (1996) | 2.250000 | 2.863787 | 0.613787 |
Evil Dead II (Dead By Dawn) (1987) | 3.297297 | 3.909283 | 0.611985 |
Hidden, The (1987) | 3.137931 | 3.745098 | 0.607167 |
Rocky III (1982) | 2.361702 | 2.943503 | 0.581801 |
Caddyshack (1980) | 3.396135 | 3.969737 | 0.573602 |
For a Few Dollars More (1965) | 3.409091 | 3.953795 | 0.544704 |
#평점의 표준편차
rating_std_by_title = data.groupby('title')['rating'].std()
#active_titles만 선택
rating_std_by_title = rating_std_by_title.ix[active_titles]
#내림차순으로 정렬
rating_std_by_title.order(ascending=False)[:10]
title Dumb & Dumber (1994) 1.321333 Blair Witch Project, The (1999) 1.316368 Natural Born Killers (1994) 1.307198 Tank Girl (1995) 1.277695 Rocky Horror Picture Show, The (1975) 1.260177 Eyes Wide Shut (1999) 1.259624 Evita (1996) 1.253631 Billy Madison (1995) 1.249970 Fear and Loathing in Las Vegas (1998) 1.246408 Bicentennial Man (1999) 1.245533 Name: rating, dtype: float64
rating_std_by_title.order(ascending=True)[:10]
title Close Shave, A (1995) 0.667143 Rear Window (1954) 0.688946 Great Escape, The (1963) 0.692585 Shawshank Redemption, The (1994) 0.700443 Wrong Trousers, The (1993) 0.708666 Raiders of the Lost Ark (1981) 0.725647 North by Northwest (1959) 0.732515 Hustler, The (1961) 0.737298 Double Indemnity (1944) 0.740793 Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 0.740924 Name: rating, dtype: float64
import pandas as pd
names1880 = pd.read_csv('https://raw.githubusercontent.com/pydata/pydata-book/master/ch02/names/yob1880.txt',
names=['name', 'sex', 'births'])
names1880[:]
name | sex | births | |
---|---|---|---|
0 | Mary | F | 7065 |
1 | Anna | F | 2604 |
2 | Emma | F | 2003 |
3 | Elizabeth | F | 1939 |
4 | Minnie | F | 1746 |
5 | Margaret | F | 1578 |
6 | Ida | F | 1472 |
7 | Alice | F | 1414 |
8 | Bertha | F | 1320 |
9 | Sarah | F | 1288 |
10 | Annie | F | 1258 |
11 | Clara | F | 1226 |
12 | Ella | F | 1156 |
13 | Florence | F | 1063 |
14 | Cora | F | 1045 |
15 | Martha | F | 1040 |
16 | Laura | F | 1012 |
17 | Nellie | F | 995 |
18 | Grace | F | 982 |
19 | Carrie | F | 949 |
20 | Maude | F | 858 |
21 | Mabel | F | 808 |
22 | Bessie | F | 794 |
23 | Jennie | F | 793 |
24 | Gertrude | F | 787 |
25 | Julia | F | 783 |
26 | Hattie | F | 769 |
27 | Edith | F | 768 |
28 | Mattie | F | 704 |
29 | Rose | F | 700 |
... | ... | ... | ... |
1970 | Philo | M | 5 |
1971 | Phineas | M | 5 |
1972 | Presley | M | 5 |
1973 | Ransom | M | 5 |
1974 | Reece | M | 5 |
1975 | Rene | M | 5 |
1976 | Roswell | M | 5 |
1977 | Rowland | M | 5 |
1978 | Sampson | M | 5 |
1979 | Samual | M | 5 |
1980 | Santos | M | 5 |
1981 | Schuyler | M | 5 |
1982 | Sheppard | M | 5 |
1983 | Spurgeon | M | 5 |
1984 | Starling | M | 5 |
1985 | Sylvanus | M | 5 |
1986 | Theadore | M | 5 |
1987 | Theophile | M | 5 |
1988 | Tilmon | M | 5 |
1989 | Tommy | M | 5 |
1990 | Unknown | M | 5 |
1991 | Vann | M | 5 |
1992 | Wes | M | 5 |
1993 | Winston | M | 5 |
1994 | Wood | M | 5 |
1995 | Woodie | M | 5 |
1996 | Worthy | M | 5 |
1997 | Wright | M | 5 |
1998 | York | M | 5 |
1999 | Zachariah | M | 5 |
2000 rows × 3 columns
names1880.groupby('sex').births.sum()
#성별에 따라 그룹화하여 birth값을 구함 (해당연도의 전체 출생수)
sex F 90993 M 110493 Name: births, dtype: int64
years = range(1880, 2011) #2010년 데이터가 가장 마지막
pieces=[]
columns = ['name', 'sex', 'births']
for year in years:
path = 'https://raw.githubusercontent.com/pydata/pydata-book/master/ch02/names/yob%d.txt' % year
frame = pd.read_csv(path, names=columns)
frame['year'] = year
pieces.append(frame)
#하나의 DataFrame으로 모든 데이터를 연결시키면
names = pd.concat(pieces, ignore_index=True)
#read_csv를 통해 읽어온 원래 행 순서는 몰라도 되므로 index무시
names
name | sex | births | year | |
---|---|---|---|---|
0 | Mary | F | 7065 | 1880 |
1 | Anna | F | 2604 | 1880 |
2 | Emma | F | 2003 | 1880 |
3 | Elizabeth | F | 1939 | 1880 |
4 | Minnie | F | 1746 | 1880 |
5 | Margaret | F | 1578 | 1880 |
6 | Ida | F | 1472 | 1880 |
7 | Alice | F | 1414 | 1880 |
8 | Bertha | F | 1320 | 1880 |
9 | Sarah | F | 1288 | 1880 |
10 | Annie | F | 1258 | 1880 |
11 | Clara | F | 1226 | 1880 |
12 | Ella | F | 1156 | 1880 |
13 | Florence | F | 1063 | 1880 |
14 | Cora | F | 1045 | 1880 |
15 | Martha | F | 1040 | 1880 |
16 | Laura | F | 1012 | 1880 |
17 | Nellie | F | 995 | 1880 |
18 | Grace | F | 982 | 1880 |
19 | Carrie | F | 949 | 1880 |
20 | Maude | F | 858 | 1880 |
21 | Mabel | F | 808 | 1880 |
22 | Bessie | F | 794 | 1880 |
23 | Jennie | F | 793 | 1880 |
24 | Gertrude | F | 787 | 1880 |
25 | Julia | F | 783 | 1880 |
26 | Hattie | F | 769 | 1880 |
27 | Edith | F | 768 | 1880 |
28 | Mattie | F | 704 | 1880 |
29 | Rose | F | 700 | 1880 |
... | ... | ... | ... | ... |
1690754 | Zaviyon | M | 5 | 2010 |
1690755 | Zaybrien | M | 5 | 2010 |
1690756 | Zayshawn | M | 5 | 2010 |
1690757 | Zayyan | M | 5 | 2010 |
1690758 | Zeal | M | 5 | 2010 |
1690759 | Zealan | M | 5 | 2010 |
1690760 | Zecharia | M | 5 | 2010 |
1690761 | Zeferino | M | 5 | 2010 |
1690762 | Zekariah | M | 5 | 2010 |
1690763 | Zeki | M | 5 | 2010 |
1690764 | Zeriah | M | 5 | 2010 |
1690765 | Zeshan | M | 5 | 2010 |
1690766 | Zhyier | M | 5 | 2010 |
1690767 | Zildjian | M | 5 | 2010 |
1690768 | Zinn | M | 5 | 2010 |
1690769 | Zishan | M | 5 | 2010 |
1690770 | Ziven | M | 5 | 2010 |
1690771 | Zmari | M | 5 | 2010 |
1690772 | Zoren | M | 5 | 2010 |
1690773 | Zuhaib | M | 5 | 2010 |
1690774 | Zyeire | M | 5 | 2010 |
1690775 | Zygmunt | M | 5 | 2010 |
1690776 | Zykerion | M | 5 | 2010 |
1690777 | Zylar | M | 5 | 2010 |
1690778 | Zylin | M | 5 | 2010 |
1690779 | Zymaire | M | 5 | 2010 |
1690780 | Zyonne | M | 5 | 2010 |
1690781 | Zyquarius | M | 5 | 2010 |
1690782 | Zyran | M | 5 | 2010 |
1690783 | Zzyzx | M | 5 | 2010 |
1690784 rows × 4 columns
total_births = names.pivot_table('births', index='year',
columns='sex', aggfunc=sum)
total_births.tail()
sex | F | M |
---|---|---|
year | ||
2006 | 1896468 | 2050234 |
2007 | 1916888 | 2069242 |
2008 | 1883645 | 2032310 |
2009 | 1827643 | 1973359 |
2010 | 1759010 | 1898382 |
total_births.plot(title='Total births by sex and year')
<matplotlib.axes._subplots.AxesSubplot at 0x1051b4e0>
#함수 정의
def add_prop(group):
#Integer division floors
births = group.births.astype(float) #birth를 float타입으로
group['prop'] = births/ births.sum()
#births를 births전체의 합으로 나누면 prop라는 출생률 계산
return group
names = names.groupby(['year', 'sex']).apply(add_prop)
#새로운 열을 추가
names
name | sex | births | year | prop | |
---|---|---|---|---|---|
0 | Mary | F | 7065 | 1880 | 0.077643 |
1 | Anna | F | 2604 | 1880 | 0.028618 |
2 | Emma | F | 2003 | 1880 | 0.022013 |
3 | Elizabeth | F | 1939 | 1880 | 0.021309 |
4 | Minnie | F | 1746 | 1880 | 0.019188 |
5 | Margaret | F | 1578 | 1880 | 0.017342 |
6 | Ida | F | 1472 | 1880 | 0.016177 |
7 | Alice | F | 1414 | 1880 | 0.015540 |
8 | Bertha | F | 1320 | 1880 | 0.014507 |
9 | Sarah | F | 1288 | 1880 | 0.014155 |
10 | Annie | F | 1258 | 1880 | 0.013825 |
11 | Clara | F | 1226 | 1880 | 0.013474 |
12 | Ella | F | 1156 | 1880 | 0.012704 |
13 | Florence | F | 1063 | 1880 | 0.011682 |
14 | Cora | F | 1045 | 1880 | 0.011484 |
15 | Martha | F | 1040 | 1880 | 0.011429 |
16 | Laura | F | 1012 | 1880 | 0.011122 |
17 | Nellie | F | 995 | 1880 | 0.010935 |
18 | Grace | F | 982 | 1880 | 0.010792 |
19 | Carrie | F | 949 | 1880 | 0.010429 |
20 | Maude | F | 858 | 1880 | 0.009429 |
21 | Mabel | F | 808 | 1880 | 0.008880 |
22 | Bessie | F | 794 | 1880 | 0.008726 |
23 | Jennie | F | 793 | 1880 | 0.008715 |
24 | Gertrude | F | 787 | 1880 | 0.008649 |
25 | Julia | F | 783 | 1880 | 0.008605 |
26 | Hattie | F | 769 | 1880 | 0.008451 |
27 | Edith | F | 768 | 1880 | 0.008440 |
28 | Mattie | F | 704 | 1880 | 0.007737 |
29 | Rose | F | 700 | 1880 | 0.007693 |
... | ... | ... | ... | ... | ... |
1690754 | Zaviyon | M | 5 | 2010 | 0.000003 |
1690755 | Zaybrien | M | 5 | 2010 | 0.000003 |
1690756 | Zayshawn | M | 5 | 2010 | 0.000003 |
1690757 | Zayyan | M | 5 | 2010 | 0.000003 |
1690758 | Zeal | M | 5 | 2010 | 0.000003 |
1690759 | Zealan | M | 5 | 2010 | 0.000003 |
1690760 | Zecharia | M | 5 | 2010 | 0.000003 |
1690761 | Zeferino | M | 5 | 2010 | 0.000003 |
1690762 | Zekariah | M | 5 | 2010 | 0.000003 |
1690763 | Zeki | M | 5 | 2010 | 0.000003 |
1690764 | Zeriah | M | 5 | 2010 | 0.000003 |
1690765 | Zeshan | M | 5 | 2010 | 0.000003 |
1690766 | Zhyier | M | 5 | 2010 | 0.000003 |
1690767 | Zildjian | M | 5 | 2010 | 0.000003 |
1690768 | Zinn | M | 5 | 2010 | 0.000003 |
1690769 | Zishan | M | 5 | 2010 | 0.000003 |
1690770 | Ziven | M | 5 | 2010 | 0.000003 |
1690771 | Zmari | M | 5 | 2010 | 0.000003 |
1690772 | Zoren | M | 5 | 2010 | 0.000003 |
1690773 | Zuhaib | M | 5 | 2010 | 0.000003 |
1690774 | Zyeire | M | 5 | 2010 | 0.000003 |
1690775 | Zygmunt | M | 5 | 2010 | 0.000003 |
1690776 | Zykerion | M | 5 | 2010 | 0.000003 |
1690777 | Zylar | M | 5 | 2010 | 0.000003 |
1690778 | Zylin | M | 5 | 2010 | 0.000003 |
1690779 | Zymaire | M | 5 | 2010 | 0.000003 |
1690780 | Zyonne | M | 5 | 2010 | 0.000003 |
1690781 | Zyquarius | M | 5 | 2010 | 0.000003 |
1690782 | Zyran | M | 5 | 2010 | 0.000003 |
1690783 | Zzyzx | M | 5 | 2010 | 0.000003 |
1690784 rows × 5 columns
import numpy as np
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)
True
#함수정의
def get_top100(group):
return group.sort_index(by='births', ascending=False)[:100]
#birth로 내림차순 정렬
grouped = names.groupby(['year', 'sex']) #year,sex로 그룹화해서 top100 출력
top100 = grouped.apply(get_top100)
top100
name | sex | births | year | prop | |||
---|---|---|---|---|---|---|---|
year | sex | ||||||
1880 | F | 0 | Mary | F | 7065 | 1880 | 0.077643 |
1 | Anna | F | 2604 | 1880 | 0.028618 | ||
2 | Emma | F | 2003 | 1880 | 0.022013 | ||
3 | Elizabeth | F | 1939 | 1880 | 0.021309 | ||
4 | Minnie | F | 1746 | 1880 | 0.019188 | ||
5 | Margaret | F | 1578 | 1880 | 0.017342 | ||
6 | Ida | F | 1472 | 1880 | 0.016177 | ||
7 | Alice | F | 1414 | 1880 | 0.015540 | ||
8 | Bertha | F | 1320 | 1880 | 0.014507 | ||
9 | Sarah | F | 1288 | 1880 | 0.014155 | ||
10 | Annie | F | 1258 | 1880 | 0.013825 | ||
11 | Clara | F | 1226 | 1880 | 0.013474 | ||
12 | Ella | F | 1156 | 1880 | 0.012704 | ||
13 | Florence | F | 1063 | 1880 | 0.011682 | ||
14 | Cora | F | 1045 | 1880 | 0.011484 | ||
15 | Martha | F | 1040 | 1880 | 0.011429 | ||
16 | Laura | F | 1012 | 1880 | 0.011122 | ||
17 | Nellie | F | 995 | 1880 | 0.010935 | ||
18 | Grace | F | 982 | 1880 | 0.010792 | ||
19 | Carrie | F | 949 | 1880 | 0.010429 | ||
20 | Maude | F | 858 | 1880 | 0.009429 | ||
21 | Mabel | F | 808 | 1880 | 0.008880 | ||
22 | Bessie | F | 794 | 1880 | 0.008726 | ||
23 | Jennie | F | 793 | 1880 | 0.008715 | ||
24 | Gertrude | F | 787 | 1880 | 0.008649 | ||
25 | Julia | F | 783 | 1880 | 0.008605 | ||
26 | Hattie | F | 769 | 1880 | 0.008451 | ||
27 | Edith | F | 768 | 1880 | 0.008440 | ||
28 | Mattie | F | 704 | 1880 | 0.007737 | ||
29 | Rose | F | 700 | 1880 | 0.007693 | ||
... | ... | ... | ... | ... | ... | ... | ... |
2010 | M | 1676714 | Xavier | M | 5701 | 2010 | 0.003003 |
1676715 | Ian | M | 5510 | 2010 | 0.002902 | ||
1676716 | Colton | M | 5270 | 2010 | 0.002776 | ||
1676717 | Dominic | M | 5260 | 2010 | 0.002771 | ||
1676718 | Juan | M | 5217 | 2010 | 0.002748 | ||
1676719 | Cooper | M | 5206 | 2010 | 0.002742 | ||
1676720 | Josiah | M | 5138 | 2010 | 0.002707 | ||
1676721 | Luis | M | 5104 | 2010 | 0.002689 | ||
1676722 | Ayden | M | 5096 | 2010 | 0.002684 | ||
1676723 | Carson | M | 5064 | 2010 | 0.002668 | ||
1676724 | Adam | M | 5062 | 2010 | 0.002666 | ||
1676725 | Nathaniel | M | 5039 | 2010 | 0.002654 | ||
1676726 | Brody | M | 5015 | 2010 | 0.002642 | ||
1676727 | Tristan | M | 4854 | 2010 | 0.002557 | ||
1676728 | Diego | M | 4693 | 2010 | 0.002472 | ||
1676729 | Parker | M | 4687 | 2010 | 0.002469 | ||
1676730 | Blake | M | 4666 | 2010 | 0.002458 | ||
1676731 | Oliver | M | 4632 | 2010 | 0.002440 | ||
1676732 | Cole | M | 4562 | 2010 | 0.002403 | ||
1676733 | Carlos | M | 4559 | 2010 | 0.002402 | ||
1676734 | Jaden | M | 4468 | 2010 | 0.002354 | ||
1676735 | Jesus | M | 4425 | 2010 | 0.002331 | ||
1676736 | Alex | M | 4409 | 2010 | 0.002323 | ||
1676737 | Aidan | M | 4263 | 2010 | 0.002246 | ||
1676738 | Eric | M | 4163 | 2010 | 0.002193 | ||
1676739 | Hayden | M | 4151 | 2010 | 0.002187 | ||
1676740 | Bryan | M | 3914 | 2010 | 0.002062 | ||
1676741 | Max | M | 3819 | 2010 | 0.002012 | ||
1676742 | Jaxon | M | 3802 | 2010 | 0.002003 | ||
1676743 | Brian | M | 3744 | 2010 | 0.001972 |
26200 rows × 5 columns
boys = top100[top100.sex == 'M']
girls = top100[top100.sex == 'F']
total_births = top100.pivot_table('births', index='year', columns='name', aggfunc=sum)
total_births[:10]
name | Aaliyah | Aaron | Abigail | Ada | Adam | Addie | Addison | Adrian | Agnes | Aidan | ... | Willis | Wilma | Woodrow | Wyatt | Xavier | Yolanda | Yvonne | Zachary | Zoe | Zoey |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
year | |||||||||||||||||||||
1880 | NaN | NaN | NaN | 652 | NaN | 274 | NaN | NaN | 473 | NaN | ... | 166 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1881 | NaN | NaN | NaN | 628 | NaN | 287 | NaN | NaN | 424 | NaN | ... | 142 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1882 | NaN | NaN | NaN | 689 | NaN | 341 | NaN | NaN | 565 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1883 | NaN | NaN | NaN | 778 | NaN | 362 | NaN | NaN | 623 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1884 | NaN | NaN | NaN | 854 | NaN | 356 | NaN | NaN | 703 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1885 | NaN | NaN | NaN | 876 | NaN | 406 | NaN | NaN | 695 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1886 | NaN | NaN | NaN | 915 | NaN | 417 | NaN | NaN | 779 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1887 | NaN | NaN | NaN | 910 | NaN | 393 | NaN | NaN | 896 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1888 | NaN | NaN | NaN | 1116 | NaN | 455 | NaN | NaN | 1046 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1889 | NaN | NaN | NaN | 1005 | NaN | 448 | NaN | NaN | 1033 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
10 rows × 783 columns
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
%matplotlib inline
subset.plot(subplots=True, figsize=(12,10), grid=False, title='Nuber of births per year')
array([<matplotlib.axes._subplots.AxesSubplot object at 0x0000000017920208>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000010744B00>, <matplotlib.axes._subplots.AxesSubplot object at 0x00000000176310F0>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000000014D53518>], dtype=object)
위의 표를 통해서 부모가 아이 이름을 지을 때 흔한 이름은 기피하는것으로 해석할 수 있다.
table = top100.pivot_table('prop', index='year', columns='sex', aggfunc=sum)
table[:10]
sex | F | M |
---|---|---|
year | ||
1880 | 0.765454 | 0.801381 |
1881 | 0.765766 | 0.802537 |
1882 | 0.760688 | 0.796098 |
1883 | 0.755177 | 0.799650 |
1884 | 0.747281 | 0.792232 |
1885 | 0.741334 | 0.789633 |
1886 | 0.736166 | 0.788870 |
1887 | 0.731332 | 0.790252 |
1888 | 0.725104 | 0.783794 |
1889 | 0.718897 | 0.786346 |
table.plot(title='Sum of table1000.prop by year and sex', yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
<matplotlib.axes._subplots.AxesSubplot at 0x13a129e8>
df = boys[boys.year == 2010] #2010년에 인기있는 이름순으로 정렬
df
name | sex | births | year | prop | |||
---|---|---|---|---|---|---|---|
year | sex | ||||||
2010 | M | 1676644 | Jacob | M | 21875 | 2010 | 0.011523 |
1676645 | Ethan | M | 17866 | 2010 | 0.009411 | ||
1676646 | Michael | M | 17133 | 2010 | 0.009025 | ||
1676647 | Jayden | M | 17030 | 2010 | 0.008971 | ||
1676648 | William | M | 16870 | 2010 | 0.008887 | ||
1676649 | Alexander | M | 16634 | 2010 | 0.008762 | ||
1676650 | Noah | M | 16281 | 2010 | 0.008576 | ||
1676651 | Daniel | M | 15679 | 2010 | 0.008259 | ||
1676652 | Aiden | M | 15403 | 2010 | 0.008114 | ||
1676653 | Anthony | M | 15364 | 2010 | 0.008093 | ||
1676654 | Joshua | M | 15238 | 2010 | 0.008027 | ||
1676655 | Mason | M | 14728 | 2010 | 0.007758 | ||
1676656 | Christopher | M | 14135 | 2010 | 0.007446 | ||
1676657 | Andrew | M | 14093 | 2010 | 0.007424 | ||
1676658 | David | M | 14042 | 2010 | 0.007397 | ||
1676659 | Matthew | M | 13954 | 2010 | 0.007350 | ||
1676660 | Logan | M | 13943 | 2010 | 0.007345 | ||
1676661 | Elijah | M | 13735 | 2010 | 0.007235 | ||
1676662 | James | M | 13714 | 2010 | 0.007224 | ||
1676663 | Joseph | M | 13657 | 2010 | 0.007194 | ||
1676664 | Gabriel | M | 12722 | 2010 | 0.006701 | ||
1676665 | Benjamin | M | 12280 | 2010 | 0.006469 | ||
1676666 | Ryan | M | 11886 | 2010 | 0.006261 | ||
1676667 | Samuel | M | 11776 | 2010 | 0.006203 | ||
1676668 | Jackson | M | 11693 | 2010 | 0.006159 | ||
1676669 | John | M | 11424 | 2010 | 0.006018 | ||
1676670 | Nathan | M | 11269 | 2010 | 0.005936 | ||
1676671 | Jonathan | M | 11028 | 2010 | 0.005809 | ||
1676672 | Christian | M | 10965 | 2010 | 0.005776 | ||
1676673 | Liam | M | 10852 | 2010 | 0.005716 | ||
... | ... | ... | ... | ... | ... | ||
... | Xavier | M | 5701 | 2010 | 0.003003 | ||
1676714 | Ian | M | 5510 | 2010 | 0.002902 | ||
1676715 | Colton | M | 5270 | 2010 | 0.002776 | ||
1676716 | Dominic | M | 5260 | 2010 | 0.002771 | ||
1676717 | Juan | M | 5217 | 2010 | 0.002748 | ||
1676718 | Cooper | M | 5206 | 2010 | 0.002742 | ||
1676719 | Josiah | M | 5138 | 2010 | 0.002707 | ||
1676720 | Luis | M | 5104 | 2010 | 0.002689 | ||
1676721 | Ayden | M | 5096 | 2010 | 0.002684 | ||
1676722 | Carson | M | 5064 | 2010 | 0.002668 | ||
1676723 | Adam | M | 5062 | 2010 | 0.002666 | ||
1676724 | Nathaniel | M | 5039 | 2010 | 0.002654 | ||
1676725 | Brody | M | 5015 | 2010 | 0.002642 | ||
1676726 | Tristan | M | 4854 | 2010 | 0.002557 | ||
1676727 | Diego | M | 4693 | 2010 | 0.002472 | ||
1676728 | Parker | M | 4687 | 2010 | 0.002469 | ||
1676729 | Blake | M | 4666 | 2010 | 0.002458 | ||
1676730 | Oliver | M | 4632 | 2010 | 0.002440 | ||
1676731 | Cole | M | 4562 | 2010 | 0.002403 | ||
1676732 | Carlos | M | 4559 | 2010 | 0.002402 | ||
1676733 | Jaden | M | 4468 | 2010 | 0.002354 | ||
1676734 | Jesus | M | 4425 | 2010 | 0.002331 | ||
1676735 | Alex | M | 4409 | 2010 | 0.002323 | ||
1676736 | Aidan | M | 4263 | 2010 | 0.002246 | ||
1676737 | Eric | M | 4163 | 2010 | 0.002193 | ||
1676738 | Hayden | M | 4151 | 2010 | 0.002187 | ||
1676739 | Bryan | M | 3914 | 2010 | 0.002062 | ||
1676740 | Max | M | 3819 | 2010 | 0.002012 | ||
1676741 | Jaxon | M | 3802 | 2010 | 0.002003 | ||
1676742 | Brian | M | 3744 | 2010 | 0.001972 |
100 rows × 5 columns
prop_cumsum = df.sort_index(by='prop', ascending=False).prop.cumsum()
#prop의 누계가 0.5가 되는 위치를 구한다.
prop_cumsum
year sex 2010 M 1676644 0.011523 1676645 0.020934 1676646 0.029959 1676647 0.038930 1676648 0.047817 1676649 0.056579 1676650 0.065155 1676651 0.073414 1676652 0.081528 1676653 0.089621 1676654 0.097648 1676655 0.105406 1676656 0.112852 1676657 0.120276 1676658 0.127672 1676659 0.135023 1676660 0.142368 1676661 0.149603 1676662 0.156827 1676663 0.164021 1676664 0.170722 1676665 0.177191 1676666 0.183452 1676667 0.189655 1676668 0.195815 1676669 0.201832 1676670 0.207769 1676671 0.213578 1676672 0.219354 1676673 0.225070 ... 1676714 0.397524 1676715 0.400426 1676716 0.403202 1676717 0.405973 1676718 0.408721 1676719 0.411464 1676720 0.414170 1676721 0.416859 1676722 0.419543 1676723 0.422211 1676724 0.424877 1676725 0.427531 1676726 0.430173 1676727 0.432730 1676728 0.435202 1676729 0.437671 1676730 0.440129 1676731 0.442569 1676732 0.444972 1676733 0.447374 1676734 0.449727 1676735 0.452058 1676736 0.454381 1676737 0.456626 1676738 0.458819 1676739 0.461006 1676740 0.463067 1676741 0.465079 1676742 0.467082 1676743 0.469054 Name: prop, dtype: float64
type(prop_cumsum)
pandas.core.series.Series
prop_cumsum.searchsorted(0.03)
array([3], dtype=int64)
prop_cumsum.searchsorted(0.03)[0]
3
prop_cumsum.searchsorted(0.5)[0] + 1
#색인의 경우 시작을 0부터 하기 때문에 +1을 한다.
101
df = boys[boys.year == 1900]
in1900 = df.sort_index(by='prop', ascending=False).prop.cumsum()
in1900.searchsorted(0.5)[0] + 1
25
연도와 성별을 Groupby로 묶고 각 그룹에 apply를 사용하여 연산을 적용한다.
#함수 선언
def get_quantile_count(group, q=0.5):
group = group.sort_index(by='prop', ascending=False)
return group.prop.cumsum().searchsorted(q)[0] + 1
diversity = top100.groupby(['year', 'sex']).apply(get_quantile_count)
diversity = diversity.unstack('sex')
diversity.head() #diversity[:5]와 같은 결과
sex | F | M |
---|---|---|
year | ||
1880 | 38 | 14 |
1881 | 38 | 14 |
1882 | 38 | 15 |
1883 | 39 | 15 |
1884 | 39 | 16 |
diversity.plot(title = "Number of popular names in top 50%")
#numeric searchsorted 는 배열의 위치를 찾는 것이기 때문에 int형이 아님
<matplotlib.axes._subplots.AxesSubplot at 0x15682128>
#name 열에서 마지막 글자를 추출
get_last_letter = lambda x : x[-1]
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'
table = names.pivot_table('births', index=last_letters, columns=['sex', 'year'], aggfunc=sum)
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head() #subtable[:5]와 같은 효과
sex | F | M | ||||
---|---|---|---|---|---|---|
year | 1910 | 1960 | 2010 | 1910 | 1960 | 2010 |
last_letter | ||||||
a | 108376 | 691247 | 670605 | 977 | 5204 | 28438 |
b | NaN | 694 | 450 | 411 | 3912 | 38859 |
c | 5 | 49 | 946 | 482 | 15476 | 23125 |
d | 6750 | 3729 | 2607 | 22111 | 262112 | 44398 |
e | 133569 | 435013 | 313833 | 28655 | 178823 | 129012 |
subtable.sum()
sex year F 1910 396416 1960 2022062 2010 1759010 M 1910 194198 1960 2132588 2010 1898382 dtype: float64
letter_prop = subtable / subtable.sum().astype(float)
import matplotlib.pyplot as plt
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male'), letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female', legend=False)
(<matplotlib.axes._subplots.AxesSubplot at 0x138253c8>, <matplotlib.axes._subplots.AxesSubplot at 0x1f054b38>)
그래프에서 보듯이 남자는 'n'으로 끝나는 이름이 1960이후 증가
letter_prop = table / table.sum().astype(float)
dny_ts = letter_prop.ix[['d', 'n', 'y'], 'M'].T
dny_ts.head()
last_letter | d | n | y |
---|---|---|---|
year | |||
1880 | 0.083055 | 0.153213 | 0.075760 |
1881 | 0.083247 | 0.153214 | 0.077451 |
1882 | 0.085340 | 0.149560 | 0.077537 |
1883 | 0.084066 | 0.151646 | 0.079144 |
1884 | 0.086120 | 0.149915 | 0.080405 |
dny_ts.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x13d76d68>
all_names = top100.name.unique()
mask = np.array(['lesl' in x.lower() for x in all_names])
lesley_like = all_names[mask]
lesley_like
array(['Leslie'], dtype=object)
2)이름들만 추려내어 이름별로 출생수를 구하고 상대 도수 확인
filtered = top100[top100.name.isin(lesley_like)]
filtered.groupby('name').births.sum()
name Leslie 191481 Name: births, dtype: int64
table = filtered.pivot_table('births', index='year', columns='sex', aggfunc=sum)
table = table.div(table.sum(1), axis=0)
table.tail()
sex | F | M |
---|---|---|
year | ||
2000 | 1 | NaN |
2001 | 1 | NaN |
2002 | 1 | NaN |
2003 | 1 | NaN |
2004 | 1 | NaN |
table.plot(style={'M': 'k-', 'F': 'k--'})
#남자는 실선으로, 여자는 점선으로 표시
<matplotlib.axes._subplots.AxesSubplot at 0x15e12e80>