함수 | 설명 |
---|---|
read_csv | 파일, URL 또는 파일과 유사한 객체로부터 구분된 데이터를 읽어온다. 데이터 구분자는 쉼표(,)를 기본으로 한다. |
read_table | 파일, URL 또는 파일과 유사한 객체로부터 구분된 데이터를 읽어온다. 데이터 구분자는 탭('\t')을 기본으로 한다. |
read_fwf | 고정폭 칼럼 형식에서 데이터를 읽어온다(구분자가 없는 데이터) |
read_clipboard | 클립보드에 있는 데이터를 읽어오는 read_table 함수. 웹페이지에서 표를 긁어올 때 유용하다. |
from pandas import DataFrame, Series
import pandas as pd
!cat ch06/ex1.csv
a,b,c,d,message 1,2,3,4,hello 5,6,7,8,world 9,10,11,12,foo
df = pd.read_csv('ch06/ex1.csv')
pd.read_csv('ch06/ex1.csv', header=None)
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | a | b | c | d | message |
1 | 1 | 2 | 3 | 4 | hello |
2 | 5 | 6 | 7 | 8 | world |
3 | 9 | 10 | 11 | 12 | foo |
# 원래 있던 Column명 무시하고 내가 원하는 Column명 설정
pd.read_csv('ch06/ex1.csv', names=[5,6,7,8,9])
5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|
0 | a | b | c | d | message |
1 | 1 | 2 | 3 | 4 | hello |
2 | 5 | 6 | 7 | 8 | world |
3 | 9 | 10 | 11 | 12 | foo |
pd.read_csv('ch06/ex1.csv', names=['a1', 'b1', 'c1', 'd1', 'message1'])
a1 | b1 | c1 | d1 | message1 | |
---|---|---|---|---|---|
0 | a | b | c | d | message |
1 | 1 | 2 | 3 | 4 | hello |
2 | 5 | 6 | 7 | 8 | world |
3 | 9 | 10 | 11 | 12 | foo |
df
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
# csv는 DataFrame으로 읽어온다.
type(df)
pandas.core.frame.DataFrame
pd.read_table('ch06/ex1.csv', sep=',')
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
pd.read_table('ch06/ex1.csv', sep=',', header=None)
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | a | b | c | d | message |
1 | 1 | 2 | 3 | 4 | hello |
2 | 5 | 6 | 7 | 8 | world |
3 | 9 | 10 | 11 | 12 | foo |
!cat ch06/ex2.csv
1,2,3,4,hello 5,6,7,8,world 9,10,11,12,foo
# header 자동 생성
pd.read_csv('ch06/ex2.csv', header=None)
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
# header 옵션이 없을시 header를 첫번째 줄로 이용
pd.read_csv('ch06/ex2.csv')
1 | 2 | 3 | 4 | hello | |
---|---|---|---|---|---|
0 | 5 | 6 | 7 | 8 | world |
1 | 9 | 10 | 11 | 12 | foo |
# Column명 추가
pd.read_csv('ch06/ex2.csv', names=['a', 'b', 'c', 'message'])
a | b | c | message | |
---|---|---|---|---|
1 | 2 | 3 | 4 | hello |
5 | 6 | 7 | 8 | world |
9 | 10 | 11 | 12 | foo |
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('ch06/ex2.csv', names=names)
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
# message -> index
pd.read_csv('ch06/ex2.csv', names=names, index_col='message')
a | b | c | d | |
---|---|---|---|---|
message | ||||
hello | 1 | 2 | 3 | 4 |
world | 5 | 6 | 7 | 8 |
foo | 9 | 10 | 11 | 12 |
pd.read_csv('ch06/ex2.csv', names=names, index_col='a')
b | c | d | message | |
---|---|---|---|---|
a | ||||
1 | 2 | 3 | 4 | hello |
5 | 6 | 7 | 8 | world |
9 | 10 | 11 | 12 | foo |
!cat ch06/csv_mindex.csv
key1,key2,value1,value2 one,a,1,2 one,b,3,4 one,c,5,6 one,d,7,8 two,a,9,10 two,b,11,12 two,c,13,14 two,d,15,16
parsed = pd.read_csv('ch06/csv_mindex.csv', index_col=['key1', 'key2'])
parsed
value1 | value2 | ||
---|---|---|---|
key1 | key2 | ||
one | a | 1 | 2 |
b | 3 | 4 | |
c | 5 | 6 | |
d | 7 | 8 | |
two | a | 9 | 10 |
b | 11 | 12 | |
c | 13 | 14 | |
d | 15 | 16 |
list(open('ch06/ex3.txt'))
[' A B C\n', 'aaa -0.264438 -1.026059 -0.619500\n', 'bbb 0.927272 0.302904 -0.032399\n', 'ccc -0.264273 -0.386314 -0.217601\n', 'ddd -0.871858 -0.348382 1.100491\n']
result = pd.read_table('ch06/ex3.txt', sep='\s+')
result
A | B | C | |
---|---|---|---|
aaa | -0.264438 | -1.026059 | -0.619500 |
bbb | 0.927272 | 0.302904 | -0.032399 |
ccc | -0.264273 | -0.386314 | -0.217601 |
ddd | -0.871858 | -0.348382 | 1.100491 |
pd.read_csv('ch06/ex3.txt', delimiter='\s+')
A | B | C | |
---|---|---|---|
aaa | -0.264438 | -1.026059 | -0.619500 |
bbb | 0.927272 | 0.302904 | -0.032399 |
ccc | -0.264273 | -0.386314 | -0.217601 |
ddd | -0.871858 | -0.348382 | 1.100491 |
# Read CSV(comma-separated) file into DataFrame
pd.read_csv?
!cat ch06/ex4.csv
# hey! a,b,c,d,message # just wanted to make things more difficult for you # who reads CSV files with computers, anyway? 1,2,3,4,hello 5,6,7,8,world 9,10,11,12,foo
pd.read_csv('ch06/ex4.csv', skiprows=[0, 2, 3])
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
!cat ch06/ex5.csv
something,a,b,c,d,message one,1,2,3,4,NA two,5,6,,8,world three,9,10,11,12,foo
result = pd.read_csv('ch06/ex5.csv')
result
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3 | 4 | NaN |
1 | two | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11 | 12 | foo |
pd.isnull(result)
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | False | False | False | False | False | True |
1 | False | False | False | True | False | False |
2 | False | False | False | False | False | False |
result = pd.read_csv('ch06/ex5.csv', na_values=['NULL'])
result
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3 | 4 | NaN |
1 | two | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11 | 12 | foo |
# world를 NA값으로 처리하니 NaN으로 나온다.
# 특정한 값을 NA 처리할 수 있을것 같다.
pd.read_csv('ch06/ex5.csv', na_values=['world'])
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3 | 4 | NaN |
1 | two | 5 | 6 | NaN | 8 | NaN |
2 | three | 9 | 10 | 11 | 12 | foo |
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('ch06/ex5.csv', na_values=sentinels)
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3 | 4 | NaN |
1 | NaN | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11 | 12 | NaN |
인자 | 설명 |
---|---|
path | 파일 시스템에서의 위치, URL, 파일 객체를 나타내는 문자열 |
sep or delimiter | 필드를 구분하기 위해 사용할 연속된 문자나 정규표현식 |
header | 칼럼의 이름으로 사용할 로우의 번호, 기본 값은 0(첫 로우)이며 헤더가 없으면 None으로 지정할 수 있다. |
index_col | 색인으로 사용할 칼럼 번호나 이름, 계층적 색인을 지정할 경우 리스트를 넘길 수 있다. |
names | 컬럼 이름으로 사용할 리스트. header = None과 함께 사용한다. |
skiprows | 파일의 시작부터 무시할 로우의 개수 또는 무시할 로우 번호가 담긴 리스트 |
na_values | NA 값으로 처리할 값들의 나열 |
comment | 주석으로 분류되어 파싱하지 않을 문자 혹은 문자열 |
parse_dates | 날짜를 datetime으로 변환할지의 여부. 기본값은 False이며, True일 경우 모든 칼럼에 다 적용된다. 리스트를 넘기면 변환할 칼럼을 지정할 수 있는데, [1, 2, 3]을 넘기면 각각의 칼럼을 datetime으로 변환하고, [[1, 3]]을 넘기면 1, 3번 칼럼을 조합해서 하나의 datetime으로 변환한다. |
keep_date_col | 여러 칼럼을 datetime으로 변환했을 경우 원래 칼럼을 남겨둘지의 여부. 기본값은 False |
converters | 변환 시 칼럼에 적용할 함수를 지정한다. 예를 들어 {'foo': f}는 'foo'칼럼에 f 함수를 적용한다. 전달하는 사전의 키 값은 칼럼 이름이나 번호가 될 수 있다. |
dayfirst | 모호한 날짜 형식일 경우 국제 형식으로 간주한다(7/6/2012는 2012년 6월 7일로 간주한다). 기본값은 False |
date_parser | 날짜 변환 시 사용할 함수 |
nrows | 파일의 첫 일부만 읽어올 때 처음 몇 줄을 읽을 것인지 지정한다. |
iterator | 파일을 조금씩 읽을 때 사용하도록 TextParser 객체를 반환하도록 한다. 기본값은 False |
chunksize | TextParser 객체에서 사용할, 한 번에 읽을 파일의 크기 |
skip_footer | 무시할 파일의 마지막 줄 수 |
verbose | 파싱 결과에 대한 정보를 출력한다. 숫자가 아닌 값들이 들어있는 칼럼이면서 누락된 값이 있다면 줄 번호를 출력한다. 기본값은 False |
encoding | 유니코드 인코딩 종류를 지정한다. UTF-8로 인코딩된 텍스트일 경우 'utf-8'로 지정한다. |
squeeze | 로우가 하나뿐이라면 Series 객체를 반환한다. 기본값은 False |
thousands | 숫자를 천 단위로 끊을 때 사용할 ', '나 '.' 같은 구분자 |
# 이 명령어로 어떤 함수인지, 어떤 파라미터를 넘겨야 하는지 정확히 알 수 있다.
# 굳이 명령어들을 따라칠 필요는 없는데 어떤 파라미터들을 넘기는지 한 번 공부하는 겸 해서 쳐봤다.
pd.read_csv?
result = pd.read_csv('ch06/ex6.csv')
result
<class 'pandas.core.frame.DataFrame'> Int64Index: 10000 entries, 0 to 9999 Data columns (total 5 columns): one 10000 non-null values two 10000 non-null values three 10000 non-null values four 10000 non-null values key 10000 non-null values dtypes: float64(4), object(1)
pd.read_csv('ch06/ex6.csv', nrows=5)
one | two | three | four | key | |
---|---|---|---|---|---|
0 | 0.467976 | -0.038649 | -0.295344 | -1.824726 | L |
1 | -0.358893 | 1.404453 | 0.704965 | -0.200638 | B |
2 | -0.501840 | 0.659254 | -0.421691 | -0.057688 | G |
3 | 0.204886 | 1.074134 | 1.388361 | -0.982404 | R |
4 | 0.354628 | -0.133116 | 0.283763 | -0.837063 | Q |
chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)
chunker
<pandas.io.parsers.TextFileReader at 0x10c3d2490>
# Aggregate value counts of the 'key' column across 1000-row chunks so the
# whole 10,000-row file never has to sit in memory at once.
chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)
tot = Series(dtype='float64')  # empty float accumulator (explicit dtype avoids a pandas warning)
for piece in chunker:
    # Count each key in this chunk; fill_value=0 treats keys missing from
    # the running total (or the chunk) as zero before adding.
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
# Sort by count, descending. Series.order() was removed in pandas 0.17+;
# sort_values() is the modern equivalent.
tot = tot.sort_values(ascending=False)
tot[:10]
E 368 X 364 L 346 O 343 Q 340 M 338 J 337 F 335 K 334 H 330 dtype: float64
data = pd.read_csv('ch06/ex5.csv')
data
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3 | 4 | NaN |
1 | two | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11 | 12 | foo |
data.to_csv('ch06/out.csv')
!cat ch06/out.csv
,something,a,b,c,d,message 0,one,1,2,3.0,4, 1,two,5,6,,8,world 2,three,9,10,11.0,12,foo
# csv로 지정하는데 output은 표준아웃풋(모니터), separator는 '|'
data.to_csv(sys.stdout, sep='|')
|something|a|b|c|d|message 0|one|1|2|3.0|4| 1|two|5|6||8|world 2|three|9|10|11.0|12|foo
# Write DataFrame to a comma-separated value (csv) file
# na_rep -> Missing data representation. NA REPresentation
data.to_csv?
data.to_csv(sys.stdout, na_rep='NULL')
,something,a,b,c,d,message 0,one,1,2,3.0,4,NULL 1,two,5,6,NULL,8,world 2,three,9,10,11.0,12,foo
data.to_csv(sys.stdout, na_rep='NaN')
,something,a,b,c,d,message 0,one,1,2,3.0,4,NaN 1,two,5,6,NaN,8,world 2,three,9,10,11.0,12,foo
data.to_csv(sys.stdout, index=False, header=False)
one,1,2,3.0,4, two,5,6,,8,world three,9,10,11.0,12,foo
data.to_csv(sys.stdout, index=False, cols=['a', 'b', 'c'])
a,b,c 1,2,3.0 5,6, 9,10,11.0
dates = pd.date_range('1/1/2000', periods=7)
ts = Series(np.arange(7), index=dates)
ts.to_csv('ch06/tseries.csv')
!cat ch06/tseries.csv
2000-01-01,0 2000-01-02,1 2000-01-03,2 2000-01-04,3 2000-01-05,4 2000-01-06,5 2000-01-07,6
pd.DataFrame.to_csv?
Series.from_csv('ch06/tseries.csv', parse_dates=True)
2000-01-01 0 2000-01-02 1 2000-01-03 2 2000-01-04 3 2000-01-05 4 2000-01-06 5 2000-01-07 6 dtype: int64
type( Series.from_csv('ch06/tseries.csv', parse_dates=True) )
pandas.core.series.Series
# parse dates: boolean, default True.
# Parse dates. Different default from read_table
Series.from_csv?
pd.read_csv('ch06/tseries.csv', header=None)
0 | 1 | |
---|---|---|
0 | 2000-01-01 | 0 |
1 | 2000-01-02 | 1 |
2 | 2000-01-03 | 2 |
3 | 2000-01-04 | 3 |
4 | 2000-01-05 | 4 |
5 | 2000-01-06 | 5 |
6 | 2000-01-07 | 6 |
type(pd.read_csv('ch06/tseries.csv', header=None))
pandas.core.frame.DataFrame
pd.read_csv?
!cat ch06/ex7.csv
"a","b","c" "1","2","3" "1","2","3","4"
import csv
f = open('ch06/ex7.csv')
reader = csv.reader(f)
for line in reader:
print line
['a', 'b', 'c'] ['1', '2', '3'] ['1', '2', '3', '4']
lines = list(csv.reader(open('ch06/ex7.csv')))
header, values = lines[0], lines[1:]
header
['a', 'b', 'c']
values
[['1', '2', '3'], ['1', '2', '3', '4']]
# header = a,b,c
# values를 1,1을 같이 묶는다. 2,2 묶고. 3,3 묶고. 4는 header가 a,b,c 3개 밖에 없기 때문에 포함되지 않는다.
data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict
{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}
class my_dialect(csv.Dialect):
    """CSV dialect: semicolon-delimited, double-quote quoting, LF line ends.

    csv.Dialect subclasses must also define ``quoting`` as one of the
    ``csv.QUOTE_*`` integer constants; leaving it unset is what raised
    ``TypeError: "quoting" must be an integer`` in the session below.
    Callers may still override it per-call (e.g. quoting=csv.QUOTE_NONE).
    """
    lineterminator = '\n'
    delimiter = ';'
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL  # quote only fields containing special characters
reader = csv.reader
reader = csv.reader?
reader = csv.reader
reader = csv.reader
# quoting이 꼭 integer여야 한다는 오류가 발생해서 삽질하다가 뒤에 quoting keyword를 붙여줌..
reader = csv.reader(f, dialect=my_dialect)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-257-92557f61d368> in <module>() 1 # quoting이 꼭 integer여야 한다는 오류가 발생해서 삽질하다가 뒤에 quoting keyword를 붙여줌.. ----> 2 reader = csv.reader(f, dialect=my_dialect) TypeError: "quoting" must be an integer
reader = csv.reader(f, dialect=my_dialect, quoting=csv.QUOTE_NONE)
csv.QUOTE_NONE
3
reader = csv.reader(f, delimiter='|')
# 어떤 옵션들 있는지 보려고 했더니 안 보여주네...
csv.reader??
# Write a small semicolon-delimited file with the custom dialect;
# quoting=csv.QUOTE_NONE suppresses all field quoting.
rows_to_write = [('one', 'two', 'three'),
                 ('1', '2', '3'),
                 ('4', '5', '6'),
                 ('7', '8', '9')]
with open('ch06/mydata.csv', 'w') as f:
    out = csv.writer(f, dialect=my_dialect, quoting=csv.QUOTE_NONE)
    out.writerows(rows_to_write)
!cat ch06/mydata.csv
one;two;three 1;2;3 4;5;6 7;8;9
# json은 python에서처럼 '으로 하면 안된다. 현재 """로 감싸 문자열로 저장되어 있기 때문에
# javascript에서는 '를 string 값으로 인식하지 않아서 에러 발생
obj = """
{
'name': 'Wes',
'places_lived': ['United States', 'Spain', 'Germany'],
'pet': null, 'siblings': [{'name': 'Scott', 'age':25, 'pet':'Zuko'},
{'name': 'Katie', 'age':33, 'pet': 'Cisco'}]
}
"""
import json
# ValueError: Expecting property name: line 3 column 5 (char 7)
result = json.loads(obj)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-266-f05e1f9794f2> in <module>() 1 # ValueError: Expecting property name: line 3 column 5 (char 7) ----> 2 result = json.loads(obj) /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/__init__.pyc in loads(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw) 336 parse_int is None and parse_float is None and 337 parse_constant is None and object_pairs_hook is None and not kw): --> 338 return _default_decoder.decode(s) 339 if cls is None: 340 cls = JSONDecoder /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/decoder.pyc in decode(self, s, _w) 363 364 """ --> 365 obj, end = self.raw_decode(s, idx=_w(s, 0).end()) 366 end = _w(s, end).end() 367 if end != len(s): /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/decoder.pyc in raw_decode(self, s, idx) 379 """ 380 try: --> 381 obj, end = self.scan_once(s, idx) 382 except StopIteration: 383 raise ValueError("No JSON object could be decoded") ValueError: Expecting property name: line 3 column 5 (char 7)
obj = """
{
"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null, "siblings": [{"name": "Scott", "age":25, "pet":"Zuko"},
{"name": "Katie", "age":33, "pet": "Cisco"}]
}
"""
obj
'\n{\n "name": "Wes",\n "places_lived": ["United States", "Spain", "Germany"],\n "pet": null, "siblings": [{"name": "Scott", "age":25, "pet":"Zuko"},\n {"name": "Katie", "age":33, "pet": "Cisco"}]\n}\n'
# ValueError: Expecting property name: line 3 column 5 (char 7)
result = json.loads(obj)
result
{u'name': u'Wes', u'pet': None, u'places_lived': [u'United States', u'Spain', u'Germany'], u'siblings': [{u'age': 25, u'name': u'Scott', u'pet': u'Zuko'}, {u'age': 33, u'name': u'Katie', u'pet': u'Cisco'}]}
asjson = json.dumps(result)
# '가 아니라 "인 것을 확인하자
asjson
'{"pet": null, "siblings": [{"pet": "Zuko", "age": 25, "name": "Scott"}, {"pet": "Cisco", "age": 33, "name": "Katie"}], "name": "Wes", "places_lived": ["United States", "Spain", "Germany"]}'
siblings = DataFrame(result['siblings'], columns=['name', 'age'])
siblings
name | age | |
---|---|---|
0 | Scott | 25 |
1 | Katie | 33 |
# 책에 나와있지 않은 내용을 한 번 더 해봐야 쉽게 이해가 되는듯
siblings2 = DataFrame(result['siblings'], columns=['name', 'age', 'pet'])
siblings2
name | age | pet | |
---|---|---|---|
0 | Scott | 25 | Zuko |
1 | Katie | 33 | Cisco |
from lxml.html import parse
from urllib2 import urlopen
# 데이터를 가져 올 url을 넘긴 후
# 데이터를 받아 온 후 parse
parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
doc = parsed.getroot()
links = doc.findall('.//a')
# 이 객체는 HTML 엘리멘트를 표현하는 객체일뿐
# URL과 링크 이름을 가져오려면 각 엘리먼트에 대해 get 메서드를 호출하여 URL을 얻고
# text_content 메서드를 사용해서 링크 이름을 가져와야 한다.
links[15:20]
[<Element a at 0x10c3e2ad0>, <Element a at 0x10c3e2b30>, <Element a at 0x10c3e2b90>, <Element a at 0x10c3e2bf0>, <Element a at 0x10c3e2c50>]
lnk = links[28]
lnk
<Element a at 0x10c3e2fb0>
lnk.get('href')
'https://edit.yahoo.com/mc2.0/eval_profile?.intl=us&.lang=en-US&.done=http://finance.yahoo.com/q/op%3fs=AAPL%2bOptions&.src=quote&.intl=us&.lang=en-US'
lnk.text_content()
'Account Info'
urls = [lnk.get('href') for lnk in doc.findall('.//a')]
len(urls)
1239
urls[-3:-1]
['http://www.capitaliq.com', 'http://www.csidata.com']
urls[-10:]
['/q?s=AAPL140517P00780000', '/q/op?s=AAPL&k=800.000000', '/q?s=AAPL140517P00800000', '/q/op?s=AAPL&k=805.000000', '/q?s=AAPL140517P00805000', '/q/os?s=AAPL&m=2014-05-30', 'http://help.yahoo.com/l/us/yahoo/finance/quotes/fitadelay.html', 'http://www.capitaliq.com', 'http://www.csidata.com', 'http://www.morningstar.com/']
tables = doc.findall('.//table')
tables
[<Element table at 0x100769470>, <Element table at 0x1007694d0>, <Element table at 0x100769530>, <Element table at 0x100769590>, <Element table at 0x1007695f0>, <Element table at 0x100769650>, <Element table at 0x1007696b0>, <Element table at 0x100769710>, <Element table at 0x100769770>, <Element table at 0x1007697d0>, <Element table at 0x100769830>, <Element table at 0x100769890>, <Element table at 0x100769ad0>, <Element table at 0x100769b30>, <Element table at 0x100769b90>, <Element table at 0x100769bf0>, <Element table at 0x100769c50>, <Element table at 0x100769cb0>]
calls = tables[9]
calls
<Element table at 0x1007697d0>
puts = tables[13]
rows = calls.findall('.//tr')
rows
[<Element tr at 0x100769d10>, <Element tr at 0x100769d70>, <Element tr at 0x100769e30>, <Element tr at 0x100769e90>, <Element tr at 0x100769ef0>, <Element tr at 0x100769f50>, <Element tr at 0x100769fb0>, <Element tr at 0x100790b90>, <Element tr at 0x100790ad0>, <Element tr at 0x100790050>, <Element tr at 0x1007900b0>, <Element tr at 0x100790110>, <Element tr at 0x100790170>, <Element tr at 0x1007901d0>, <Element tr at 0x100790230>, <Element tr at 0x100790290>, <Element tr at 0x1007902f0>, <Element tr at 0x100790350>, <Element tr at 0x1007903b0>, <Element tr at 0x100790410>, <Element tr at 0x100790470>, <Element tr at 0x1007904d0>, <Element tr at 0x100790530>, <Element tr at 0x100790590>, <Element tr at 0x1007905f0>, <Element tr at 0x100790650>, <Element tr at 0x1007906b0>, <Element tr at 0x100790710>, <Element tr at 0x100790770>, <Element tr at 0x1007907d0>, <Element tr at 0x100790830>, <Element tr at 0x100790890>, <Element tr at 0x1007908f0>, <Element tr at 0x100790950>, <Element tr at 0x1007909b0>, <Element tr at 0x100790a10>, <Element tr at 0x100790a70>, <Element tr at 0x10c3f3d10>, <Element tr at 0x10c3f3d70>, <Element tr at 0x10c3f3dd0>, <Element tr at 0x10c3f3e30>, <Element tr at 0x10c3f3e90>, <Element tr at 0x10c3f3ef0>, <Element tr at 0x10c3f3f50>, <Element tr at 0x10c3f3fb0>, <Element tr at 0x10078d050>, <Element tr at 0x10078d0b0>, <Element tr at 0x10078d110>, <Element tr at 0x10078d170>, <Element tr at 0x10078d1d0>, <Element tr at 0x10078d230>, <Element tr at 0x10078d290>, <Element tr at 0x10078d2f0>, <Element tr at 0x10078d350>, <Element tr at 0x10078d3b0>, <Element tr at 0x10078d410>, <Element tr at 0x10078d470>, <Element tr at 0x10078d4d0>, <Element tr at 0x10078d530>, <Element tr at 0x10078d590>, <Element tr at 0x10078d5f0>, <Element tr at 0x10078d650>, <Element tr at 0x10078d6b0>, <Element tr at 0x10078d710>, <Element tr at 0x10078d770>, <Element tr at 0x10078d7d0>, <Element tr at 0x10078d830>, <Element tr at 0x10078d890>, <Element tr at 
0x10078d8f0>, <Element tr at 0x10078d950>, <Element tr at 0x10078d9b0>, <Element tr at 0x10078da10>, <Element tr at 0x10078da70>, <Element tr at 0x10078dad0>, <Element tr at 0x10078db30>, <Element tr at 0x10078db90>, <Element tr at 0x10078dbf0>, <Element tr at 0x10078dc50>, <Element tr at 0x10078dcb0>, <Element tr at 0x10078dd10>, <Element tr at 0x10078dd70>, <Element tr at 0x10078ddd0>, <Element tr at 0x10078de30>, <Element tr at 0x10078de90>, <Element tr at 0x10078def0>, <Element tr at 0x10078df50>, <Element tr at 0x10078dfb0>, <Element tr at 0x100796050>, <Element tr at 0x1007960b0>, <Element tr at 0x100796110>, <Element tr at 0x100796170>, <Element tr at 0x1007961d0>, <Element tr at 0x100796230>, <Element tr at 0x100796290>, <Element tr at 0x1007962f0>, <Element tr at 0x100796350>, <Element tr at 0x1007963b0>, <Element tr at 0x100796410>, <Element tr at 0x100796470>, <Element tr at 0x1007964d0>, <Element tr at 0x100796530>, <Element tr at 0x100796590>, <Element tr at 0x1007965f0>, <Element tr at 0x100796650>, <Element tr at 0x1007966b0>, <Element tr at 0x100796710>, <Element tr at 0x100796770>, <Element tr at 0x1007967d0>, <Element tr at 0x100796830>, <Element tr at 0x100796890>, <Element tr at 0x1007968f0>, <Element tr at 0x100796950>, <Element tr at 0x1007969b0>, <Element tr at 0x100796a10>, <Element tr at 0x100796a70>, <Element tr at 0x100796ad0>, <Element tr at 0x100796b30>, <Element tr at 0x100796b90>, <Element tr at 0x100796bf0>, <Element tr at 0x100796c50>, <Element tr at 0x100796cb0>, <Element tr at 0x100796d10>, <Element tr at 0x100796d70>, <Element tr at 0x100796dd0>, <Element tr at 0x100796e30>, <Element tr at 0x100796e90>, <Element tr at 0x100796ef0>, <Element tr at 0x100796f50>, <Element tr at 0x100796fb0>, <Element tr at 0x100791050>, <Element tr at 0x1007910b0>, <Element tr at 0x100791110>, <Element tr at 0x100791170>, <Element tr at 0x1007911d0>, <Element tr at 0x100791230>, <Element tr at 0x100791290>, <Element tr at 0x1007912f0>, <Element tr 
at 0x100791350>, <Element tr at 0x1007913b0>, <Element tr at 0x100791410>, <Element tr at 0x100791470>, <Element tr at 0x1007914d0>, <Element tr at 0x100791530>, <Element tr at 0x100791590>, <Element tr at 0x1007915f0>, <Element tr at 0x100791650>, <Element tr at 0x1007916b0>, <Element tr at 0x100791710>, <Element tr at 0x100791770>, <Element tr at 0x1007917d0>, <Element tr at 0x100791830>, <Element tr at 0x100791890>, <Element tr at 0x1007918f0>, <Element tr at 0x100791950>, <Element tr at 0x1007919b0>, <Element tr at 0x100791a10>, <Element tr at 0x100791a70>, <Element tr at 0x100791ad0>, <Element tr at 0x100791b30>, <Element tr at 0x100791b90>, <Element tr at 0x100791bf0>, <Element tr at 0x100791c50>, <Element tr at 0x100791cb0>, <Element tr at 0x100791d10>, <Element tr at 0x100791d70>, <Element tr at 0x100791dd0>, <Element tr at 0x100791e30>, <Element tr at 0x100791e90>, <Element tr at 0x100791ef0>, <Element tr at 0x100791f50>, <Element tr at 0x100791fb0>, <Element tr at 0x100794050>, <Element tr at 0x1007940b0>, <Element tr at 0x100794110>, <Element tr at 0x100794170>, <Element tr at 0x1007941d0>, <Element tr at 0x100794230>, <Element tr at 0x100794290>, <Element tr at 0x1007942f0>, <Element tr at 0x100794350>, <Element tr at 0x1007943b0>, <Element tr at 0x100794410>, <Element tr at 0x100794470>, <Element tr at 0x1007944d0>, <Element tr at 0x100794530>, <Element tr at 0x100794590>, <Element tr at 0x1007945f0>, <Element tr at 0x100794650>, <Element tr at 0x1007946b0>, <Element tr at 0x100794710>, <Element tr at 0x100794770>, <Element tr at 0x1007947d0>, <Element tr at 0x100794830>, <Element tr at 0x100794890>, <Element tr at 0x1007948f0>, <Element tr at 0x100794950>, <Element tr at 0x1007949b0>, <Element tr at 0x100794a10>, <Element tr at 0x100794a70>, <Element tr at 0x100794ad0>, <Element tr at 0x100794b30>, <Element tr at 0x100794b90>, <Element tr at 0x100794bf0>, <Element tr at 0x100794c50>, <Element tr at 0x100794cb0>, <Element tr at 0x100794d10>, <Element 
tr at 0x100794d70>, <Element tr at 0x100794dd0>, <Element tr at 0x100794e30>, <Element tr at 0x100794e90>, <Element tr at 0x100794ef0>, <Element tr at 0x100794f50>, <Element tr at 0x100794fb0>, <Element tr at 0x100797050>, <Element tr at 0x1007970b0>, <Element tr at 0x100797110>, <Element tr at 0x100797170>, <Element tr at 0x1007971d0>, <Element tr at 0x100797230>, <Element tr at 0x100797290>, <Element tr at 0x1007972f0>, <Element tr at 0x100797350>, <Element tr at 0x1007973b0>, <Element tr at 0x100797410>, <Element tr at 0x100797470>, <Element tr at 0x1007974d0>, <Element tr at 0x100797530>, <Element tr at 0x100797590>, <Element tr at 0x1007975f0>, <Element tr at 0x100797650>, <Element tr at 0x1007976b0>, <Element tr at 0x100797710>, <Element tr at 0x100797770>, <Element tr at 0x1007977d0>, <Element tr at 0x100797830>, <Element tr at 0x100797890>, <Element tr at 0x1007978f0>, <Element tr at 0x100797950>, <Element tr at 0x1007979b0>, <Element tr at 0x100797a10>, <Element tr at 0x100797a70>, <Element tr at 0x100797ad0>]
def _unpack(row, kind='td'):
elts = row.findall('.//%s' % kind)
return [val.text_content() for val in elts]
_unpack(rows[0], kind='th')
['Strike', 'Symbol', 'Last', 'Chg', 'Bid', 'Ask', 'Vol', 'Open Int']
_unpack(rows[1], kind='td')
['330.00', 'AAPL140517C00330000', '263.00', ' 0.00', '255.70', '258.25', '6', '2']
from pandas.io.parsers import TextParser
def parse_options_data(table):
    """Turn an HTML options ``<table>`` element into a DataFrame.

    The first ``<tr>`` supplies the column names (from its <th> cells);
    every remaining row supplies one record (from its <td> cells).
    """
    table_rows = table.findall('.//tr')
    # Header row: <th> cells become the column names.
    col_names = _unpack(table_rows[0], kind='th')
    # Data rows: <td> cells become the records.
    records = [_unpack(tr) for tr in table_rows[1:]]
    # TextParser applies the same type conversion pandas' readers use.
    return TextParser(records, names=col_names).get_chunk()
# call option data
call_data = parse_options_data(calls)
# put option data
put_data = parse_options_data(puts)
call_data[:10]
Strike | Symbol | Last | Chg | Bid | Ask | Vol | Open Int | |
---|---|---|---|---|---|---|---|---|
0 | 330 | AAPL140517C00330000 | 263.00 | 0.00 | 255.70 | 258.25 | 6 | 2 |
1 | 400 | AAPL140517C00400000 | 193.00 | 0.00 | 186.40 | 187.95 | 1 | 9 |
2 | 410 | AAPL140517C00410000 | 181.30 | 0.00 | 175.70 | 178.10 | 68 | 10 |
3 | 420 | AAPL140517C00420000 | 170.80 | 0.00 | 165.85 | 168.65 | 124 | 1 |
4 | 430 | AAPL140517C00430000 | 160.75 | 0.00 | 155.85 | 158.05 | 376 | 2 |
5 | 440 | AAPL140517C00440000 | 152.87 | 0.00 | 146.35 | 147.90 | 1 | 1 |
6 | 445 | AAPL140517C00445000 | 145.55 | 0.00 | 140.95 | 143.25 | 106 | 1 |
7 | 450 | AAPL140517C00450000 | 137.00 | 5.00 | 136.35 | 137.90 | 1 | 48 |
8 | 450 | AAPL7140517C00450000 | 137.86 | 2.39 | 135.05 | 138.80 | 1 | 1 |
9 | 455 | AAPL140517C00455000 | 138.00 | 0.00 | 131.35 | 132.90 | 2 | 2 |
put_data[:10]
Strike | Symbol | Last | Chg | Bid | Ask | Vol | Open Int | |
---|---|---|---|---|---|---|---|---|
0 | 280 | AAPL140517P00280000 | 0.05 | 0 | N/A | 0.04 | 2 | 6 |
1 | 290 | AAPL140517P00290000 | 0.02 | 0 | N/A | 0.01 | 11 | 11 |
2 | 295 | AAPL140517P00295000 | 0.01 | 0 | N/A | 0.04 | 3 | 8 |
3 | 300 | AAPL140517P00300000 | 0.05 | 0 | N/A | 0.01 | 1 | 23 |
4 | 305 | AAPL140517P00305000 | 0.05 | 0 | N/A | 0.01 | 10 | 20 |
5 | 310 | AAPL140517P00310000 | 0.10 | 0 | N/A | 0.04 | 0 | 1 |
6 | 315 | AAPL140517P00315000 | 0.12 | 0 | N/A | 0.04 | 0 | 1 |
7 | 320 | AAPL140517P00320000 | 0.02 | 0 | N/A | 0.04 | 1 | 7 |
8 | 325 | AAPL140517P00325000 | 0.05 | 0 | N/A | 0.01 | 185 | 342 |
9 | 330 | AAPL140517P00330000 | 0.02 | 0 | N/A | 0.04 | 5 | 5 |
%%writefile ch06/Performance_MNR.xml
<INDICATOR>
<INDICATOR_SEQ>373889</INDICATOR_SEQ>
<PARENT_SEQ></PARENT_SEQ>
<AGENCY_NAME>MEtro-North Railroad</AGENCY_NAME>
<INDICATOR_NAME>Escalator Availability</INDICATOR_NAME>
<DESCRIPTION>Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009.</DESCRIPTION>
<PERIOD_YEAR>2011</PERIOD_YEAR>
<PERIOD_MONTH>12</PERIOD_MONTH>
<CATEGORY>Service Indicators</CATEGORY>
<FREQUENCY>M</FREQUENCY>
<DESIRED_CHANGE>U</DESIRED_CHANGE>
<INDICATOR_UNIT>%</INDICATOR_UNIT>
<DECIMAL_PLACES>1</DECIMAL_PLACES>
<YTD_TARGET>97.00</YTD_TARGET>
<YTD_ACTUAL></YTD_ACTUAL>
<MONTHLY_TARGET>97.00</MONTHLY_TARGET>
<MONTHLY_ACTUAL></MONTHLY_ACTUAL>
</INDICATOR>
Overwriting ch06/Performance_MNR.xml
from lxml import objectify
import urllib2
path = 'Performance_MNR.xml'
# online_path = 'http://www.mta.info/developers/data/lirr/lirr_gtfs.xml'
# data = urllib2.urlopen(online_path).read()
# f = open(path, 'w')
# f.write(data)
# f.close()
# objectify를 이용해서 파일 파싱
parsed = objectify.parse(open(path))
root = parsed.getroot()
--------------------------------------------------------------------------- IOError Traceback (most recent call last) <ipython-input-304-a212823090af> in <module>() 11 12 # objectify를 이용해서 파일 파싱 ---> 13 parsed = objectify.parse(open(path)) 14 root = parsed.getroot() IOError: [Errno 2] No such file or directory: 'Performance_MNR.xml'
# Flatten each record element under the XML root into a dict of
# tag -> value, skipping administrative fields we don't need.
# (Indentation restored — the notebook export had stripped it.)
data = []
skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
               'DESIRED_CHANGE', 'DECIMAL_PLACES']
# Iterating the objectify root yields each child record element
# (i.e. root.INDICATOR entries -> iterate root directly).
for elt in root:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        # .pyval is the element's value converted to a native Python type
        el_data[child.tag] = child.pyval
    data.append(el_data)
data
# 위의 값과 비교하기 위해 테스트 해본 것
for elt in root:
for child in elt.getchildren():
print child.tag, child.pyval
perf = DataFrame(data)
perf
frame = pd.read_csv('ch06/ex1.csv')
frame
frame.save('ch06/frame_pickle')
pd.load('ch06/frame_pickle')
디스크에 이진 형식으로 저장된 대용량의 과학 자료를 효율적으로 읽고 쓸 수 있는 다양한 도구 존재
산업 기준에 맞는 인기 라이브러리중 하나가 HDF5(Hierarchical Data Format), 계층적 데이터 형식
내부적으로 파일 시스템 같은 노드 구조
여러 개의 데이터셋을 저장하고 부가 정보 기록 가능
다양한 압축 기술을 사용해서 on-the-fly(실시간) 압축 지원
반복되는 패턴을 가진 데이터 좀 더 효과적 저장
메모리에 모두 적재할 수 없는 엄청나게 큰 데이터를 아주 큰 배열에서 필요한 만큼의 작은 부분들만 효과적으로 읽고 쓸 수 있는 훌륭한 선택
PyTables: HDF5를 추상화하여 여러가지 유연한 데이터 컨테이너와 테이블 색인, 질의 기능 그리고 외부 메모리 연산(out-of-core, external memory algorithm) 지원
h5py: 직접적이지만 고수준의 HDF5 API에 대한 인터페이스 제공
pandas는 PyTable를 이용한 HDFStore라는 가벼운 사전 클래스를 통해 pandas 객체를 저장
그럼 일반적인 사용자들은 numexpr 인가? 이 라이브러리가 또 없다고 한다. 그럼 이 라이브러리를 먼저 설치하고 tables를 설치한다.
설치 방법1: sudo easy_install tables(오류 확인)
설치 방법2: sudo easy_install numexpr(확실하지 않음. 확인 요함)
설치 방법3: sudo easy_install tables
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-135-35f4287dfd8f> in <module>()
----> 1 store = pd.HDFStore('ch06/mydata.h5')
/Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/io/pytables.pyc in __init__(self, path, mode, complevel, complib, fletcher32, **kwargs)
343 import tables as _
344 except ImportError: # pragma: no cover
--> 345 raise Exception('HDFStore requires PyTables')
346
347 self._path = path
Exception: HDFStore requires PyTables
# 라이브러리 설치해보고 테스트 해보라.
store = pd.HDFStore('ch06/mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
store
store['obj1']
xls_file = pd.ExcelFile('data.xls')
table = xls_file.parse('Sheet1')
import requests
url = 'http://search.twitter.com/search.json?q=python%20pandas'
resp = requests.get(url)
resp
<Response [401]>
resp.text
u'{"errors":[{"message":"The Twitter REST API v1 is no longer active. Please migrate to API v1.1. https://dev.twitter.com/docs/api/1.1/overview.","code":64}]}'
import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
c REAL, d INTEGER
);"""
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()
data = [('Atlanta', 'Georgia', 1.25, 6),
('Tallahassee', 'Florida', 2.6, 3),
('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()
대부분의 파이썬 SQL 드라이버(PyODBC, psycopg2, MySQLdb, pymssql 등)는 테이블에 대해 select 쿼리를 수행하면 튜플 리스트를 반환한다
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows
[(u'Atlanta', u'Georgia', 1.25, 6), (u'Tallahassee', u'Florida', 2.6, 3), (u'Sacramento', u'California', 1.7, 5)]
반환된 튜플 리스트를 DataFrame 생성자에 바로 전달해도 되지만 칼럼의 이름을 지정해주면 더 편하다. cursor의 description 속성을 활용하자.
This read-only attribute is a list of 7-item tuples, each containing (name, type_code, display_size, internal_size, precision, scale, null_ok). pyodbc only provides values for name, type_code, internal_size, and null_ok. The other values are set to None.
This attribute will be None for operations that do not return rows or if one of the execute methods has not been called.
The type_code member is the class type used to create the Python objects when reading rows. For example, a varchar column's type will be str.
cursor.description?
Type: tuple
String form: (('a', None, None, None, None, None, None), ('b', None, None, None, None, None, None), ('c', None, None, None, None, None, None), ('d', None, None, None, None, None, None))
Length: 4
Docstring:
tuple() -> empty tuple
tuple(iterable) -> tuple initialized from iterable's items
If the argument is a tuple, the return value is the same object.
cursor.description
(('a', None, None, None, None, None, None), ('b', None, None, None, None, None, None), ('c', None, None, None, None, None, None), ('d', None, None, None, None, None, None))
# cursor.description을 여러개 받아서 0번째 값들을 zip으로 묶는다.
zip(*cursor.description)[0]
('a', 'b', 'c', 'd')
DataFrame(rows, columns=zip(*cursor.description)[0])
a | b | c | d | |
---|---|---|---|---|
0 | Atlanta | Georgia | 1.25 | 6 |
1 | Tallahassee | Florida | 2.60 | 3 |
2 | Sacramento | California | 1.70 | 5 |
DataFrame(rows, columns=zip(*cursor.description)[1])
None | None | None | None | |
---|---|---|---|---|
0 | Atlanta | Georgia | 1.25 | 6 |
1 | Tallahassee | Florida | 2.60 | 3 |
2 | Sacramento | California | 1.70 | 5 |
# readonly attribute 란다.
# 난 cursor.description을 수정해서 내가 원하는 컬럼값으로 변경하려고 했는데..
# 그럼 어떻게 변경을 해야하지?
cursor.description = '1'
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-317-4bf5306f81d9> in <module>() 2 # 난 cursor.description을 수정해서 내가 원하는 컬럼값으로 변경하려고 했는데.. 3 # 그럼 어떻게 변경을 해야하지? ----> 4 cursor.description = '1' TypeError: readonly attribute
# 그냥 column에 내가 쓰고 싶은것 정하면 되네..
DataFrame(rows, columns=['country', 'state', 'grade1', 'grade2'])
country | state | grade1 | grade2 | |
---|---|---|---|---|
0 | Atlanta | Georgia | 1.25 | 6 |
1 | Tallahassee | Florida | 2.60 | 3 |
2 | Sacramento | California | 1.70 | 5 |
import pandas.io.sql as sql
sql.read_frame('select * from test', con)
a | b | c | d | |
---|---|---|---|---|
0 | Atlanta | Georgia | 1.25 | 6 |
1 | Tallahassee | Florida | 2.60 | 3 |
2 | Sacramento | California | 1.70 | 5 |