from pandas import DataFrame, Series
import pandas as pd

!cat ch06/ex1.csv

df = pd.read_csv('ch06/ex1.csv')
pd.read_csv('ch06/ex1.csv', header=None)

# Ignore the column names in the file and supply my own
pd.read_csv('ch06/ex1.csv', names=[5, 6, 7, 8, 9])
pd.read_csv('ch06/ex1.csv', names=['a1', 'b1', 'c1', 'd1', 'message1'])
df

# A CSV file is read in as a DataFrame
type(df)

pd.read_table('ch06/ex1.csv', sep=',')
pd.read_table('ch06/ex1.csv', sep=',', header=None)

!cat ch06/ex2.csv

# header=None generates column names automatically
pd.read_csv('ch06/ex2.csv', header=None)
# Without the header option, the first row is used as the header
pd.read_csv('ch06/ex2.csv')

# Supply column names
pd.read_csv('ch06/ex2.csv', names=['a', 'b', 'c', 'message'])
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('ch06/ex2.csv', names=names)

# Use the message column as the index
pd.read_csv('ch06/ex2.csv', names=names, index_col='message')
pd.read_csv('ch06/ex2.csv', names=names, index_col='a')

!cat ch06/csv_mindex.csv
parsed = pd.read_csv('ch06/csv_mindex.csv', index_col=['key1', 'key2'])
parsed

list(open('ch06/ex3.txt'))
result = pd.read_table('ch06/ex3.txt', sep='\s+')
result
pd.read_csv('ch06/ex3.txt', delimiter='\s+')

# Read CSV (comma-separated) file into DataFrame
pd.read_csv?

!cat ch06/ex4.csv
pd.read_csv('ch06/ex4.csv', skiprows=[0, 2, 3])

!cat ch06/ex5.csv
result = pd.read_csv('ch06/ex5.csv')
result
pd.isnull(result)

result = pd.read_csv('ch06/ex5.csv', na_values=['NULL'])
result

# Treating 'world' as an NA value makes it come out as NaN,
# so it seems any specific value can be marked as NA this way.
pd.read_csv('ch06/ex5.csv', na_values=['world'])

# Different NA sentinels per column
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('ch06/ex5.csv', na_values=sentinels)
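# Not from the book -- a minimal sketch of the same per-column na_values idea
# against an inline buffer instead of a file (the column names and values here
# are made up for illustration).
from StringIO import StringIO
buf = StringIO("""a,b,message
1,two,foo
2,three,bar""")
# 'two' is treated as NA only in column b; 'foo' and 'bar' only in message
pd.read_csv(buf, na_values={'b': ['two'], 'message': ['foo', 'bar']})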
# This command shows exactly what a function does and which parameters it takes.
# There's no real need to retype the output, but I pasted it here to study
# which parameters can be passed.
pd.read_csv?

Type:        function
String form:
File:        /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/io/parsers.py
Definition:  pd.read_csv(filepath_or_buffer, sep=',', dialect=None,
    compression=None, doublequote=True, escapechar=None, quotechar='"',
    quoting=0, skipinitialspace=False, lineterminator=None, header='infer',
    index_col=None, names=None, prefix=None, skiprows=None, skipfooter=None,
    skip_footer=0, na_values=None, na_fvalues=None, true_values=None,
    false_values=None, delimiter=None, converters=None, dtype=None,
    usecols=None, engine='c', delim_whitespace=False, as_recarray=False,
    na_filter=True, compact_ints=False, use_unsigned=False, low_memory=True,
    buffer_lines=None, warn_bad_lines=True, error_bad_lines=True,
    keep_default_na=True, thousands=None, comment=None, decimal='.',
    parse_dates=False, keep_date_col=False, dayfirst=False, date_parser=None,
    memory_map=False, nrows=None, iterator=False, chunksize=None,
    verbose=False, encoding=None, squeeze=False, mangle_dupe_cols=True,
    tupleize_cols=True)
Docstring:
Read CSV (comma-separated) file into DataFrame

Also supports optionally iterating or breaking of the file into chunks.

Parameters
----------
filepath_or_buffer : string or file handle / StringIO. The string could be
    a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs,
    a host is expected. For instance, a local file could be
    file://localhost/path/to/table.csv
sep : string, default ','
    Delimiter to use. If sep is None, will try to automatically determine
    this. Regular expressions are accepted.
lineterminator : string (length 1), default None
    Character to break file into lines. Only valid with C parser
quotechar : string
    The character used to denote the start and end of a quoted item. Quoted
    items can include the delimiter and it will be ignored.
quoting : int
    Controls whether quotes should be recognized. Values are taken from
    `csv.QUOTE_*` values. Acceptable values are 0, 1, 2, and 3 for
    QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONE, and QUOTE_NONNUMERIC, respectively.
skipinitialspace : boolean, default False
    Skip spaces after delimiter
escapechar : string
dtype : Type name or dict of column -> type
    Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
compression : {'gzip', 'bz2', None}, default None
    For on-the-fly decompression of on-disk data
dialect : string or csv.Dialect instance, default None
    If None defaults to Excel dialect. Ignored if sep longer than 1 char.
    See csv.Dialect documentation for more details
header : int, default 0 if names parameter not specified
    Row to use for the column labels of the parsed DataFrame. Specify None
    if there is no header row. Can be a list of integers that specify row
    locations for a multi-index on the columns, e.g. [0,1,3]. Intervening
    rows that are not specified (e.g. 2 in this example) are skipped
skiprows : list-like or integer
    Row numbers to skip (0-indexed) or number of rows to skip (int) at the
    start of the file
index_col : int or sequence or False, default None
    Column to use as the row labels of the DataFrame. If a sequence is
    given, a MultiIndex is used. If you have a malformed file with
    delimiters at the end of each line, you might consider index_col=False
    to force pandas to _not_ use the first column as the index (row names)
names : array-like
    List of column names to use. If file contains no header row, then you
    should explicitly pass header=None
prefix : string or None (default)
    Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
na_values : list-like or dict, default None
    Additional strings to recognize as NA/NaN. If dict passed, specific
    per-column NA values
true_values : list
    Values to consider as True
false_values : list
    Values to consider as False
keep_default_na : bool, default True
    If na_values are specified and keep_default_na is False the default NaN
    values are overridden, otherwise they're appended to
parse_dates : boolean, list of ints or names, list of lists, or dict
    If True -> try parsing the index.
    If [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column.
    If [[1, 3]] -> combine columns 1 and 3 and parse as a single date column.
    {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result 'foo'
keep_date_col : boolean, default False
    If True and parse_dates specifies combining multiple columns then keep
    the original columns.
date_parser : function
    Function to use for converting a sequence of string columns to an array
    of datetime instances. The default uses dateutil.parser.parser to do the
    conversion.
dayfirst : boolean, default False
    DD/MM format dates, international and European format
thousands : str, default None
    Thousands separator
comment : str, default None
    Indicates remainder of line should not be parsed. Does not support line
    commenting (will return empty line)
decimal : str, default '.'
    Character to recognize as decimal point. E.g. use ',' for European data
nrows : int, default None
    Number of rows of file to read. Useful for reading pieces of large files
iterator : boolean, default False
    Return TextFileReader object
chunksize : int, default None
    Return TextFileReader object for iteration
skipfooter : int, default 0
    Number of lines at bottom of file to skip
converters : dict, optional
    Dict of functions for converting values in certain columns. Keys can
    either be integers or column labels
verbose : boolean, default False
    Indicate number of NA values placed in non-numeric columns
delimiter : string, default None
    Alternative argument name for sep. Regular expressions are accepted.
encoding : string, default None
    Encoding to use for UTF when reading/writing (ex. 'utf-8')
squeeze : boolean, default False
    If the parsed data only contains one column then return a Series
na_filter : boolean, default True
    Detect missing value markers (empty strings and the value of na_values).
    In data without any NAs, passing na_filter=False can improve the
    performance of reading a large file
usecols : array-like
    Return a subset of the columns. Results in much faster parsing time and
    lower memory usage.
mangle_dupe_cols : boolean, default True
    Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X'
tupleize_cols : boolean, default False
    Leave a list of tuples on columns as is (default is to convert to a
    MultiIndex on the columns)

Returns
-------
result : DataFrame or TextParser
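# Not from the book -- a small sketch exercising a couple of the parameters
# documented above on ex5.csv (nrows limits how many rows are read;
# converters applies a function to column 'a' while parsing; both choices
# here are just for illustration).
pd.read_csv('ch06/ex5.csv', nrows=2, converters={'a': lambda x: int(x) * 10})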
result = pd.read_csv('ch06/ex6.csv')
result
pd.read_csv('ch06/ex6.csv', nrows=5)

chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)
chunker

chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)
tot = Series([])
for piece in chunker:
    # Count the occurrences of each value (E, X, L, ...) in piece['key'];
    # keys missing from a chunk are filled with 0.
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

# Sort by the counts (the values), not the keys, in descending order
tot = tot.order(ascending=False)
tot[:10]

data = pd.read_csv('ch06/ex5.csv')
data
data.to_csv('ch06/out.csv')
!cat ch06/out.csv

import sys

# Still CSV output, but written to standard output (the screen) with '|' as
# the separator
data.to_csv(sys.stdout, sep='|')

# Write DataFrame to a comma-separated values (csv) file
# na_rep -> missing data representation (NA REPresentation)
data.to_csv?
data.to_csv(sys.stdout, na_rep='NULL')
data.to_csv(sys.stdout, na_rep='NaN')
data.to_csv(sys.stdout, index=False, header=False)
data.to_csv(sys.stdout, index=False, cols=['a', 'b', 'c'])

import numpy as np

dates = pd.date_range('1/1/2000', periods=7)
ts = Series(np.arange(7), index=dates)
ts.to_csv('ch06/tseries.csv')
!cat ch06/tseries.csv

pd.DataFrame.to_csv?

Series.from_csv('ch06/tseries.csv', parse_dates=True)
type(Series.from_csv('ch06/tseries.csv', parse_dates=True))

# parse_dates : boolean, default True.
# Parse dates. Different default from read_table
Series.from_csv?

pd.read_csv('ch06/tseries.csv', header=None)
type(pd.read_csv('ch06/tseries.csv', header=None))
pd.read_csv?

!cat ch06/ex7.csv

import csv
f = open('ch06/ex7.csv')
reader = csv.reader(f)
for line in reader:
    print line

lines = list(csv.reader(open('ch06/ex7.csv')))
header, values = lines[0], lines[1:]
header
values

# header = a,b,c
# zip(*values) pairs the 1s together, the 2s together, and the 3s together;
# the trailing 4 is dropped because the header has only three entries (a, b, c).
data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict

class my_dialect(csv.Dialect):
    lineterminator = '\n'
    delimiter = ';'
    quotechar = '"'

csv.reader?

# Got a "quoting must be an integer" error and flailed around for a while
# before adding the quoting keyword at the end.
reader = csv.reader(f, dialect=my_dialect)
reader = csv.reader(f, dialect=my_dialect, quoting=csv.QUOTE_NONE)
csv.QUOTE_NONE
reader = csv.reader(f, delimiter='|')

# Tried to see which options are available, but it doesn't show the source...
csv.reader??

with open('ch06/mydata.csv', 'w') as f:
    writer = csv.writer(f, dialect=my_dialect, quoting=csv.QUOTE_NONE)
    writer.writerow(('one', 'two', 'three'))
    writer.writerow(('1', '2', '3'))
    writer.writerow(('4', '5', '6'))
    writer.writerow(('7', '8', '9'))

!cat ch06/mydata.csv
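# Not from the book -- round-tripping the file just written by reading it
# back with the same dialect (and the same quoting workaround found above).
with open('ch06/mydata.csv') as f:
    for row in csv.reader(f, dialect=my_dialect, quoting=csv.QUOTE_NONE):
        print row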
# Unlike Python, JSON does not allow single quotes around strings.
# The object below is only stored as a plain string (wrapped in triple
# quotes); parsing it fails because JSON does not recognize ' as a string
# delimiter.
obj = """
{
  'name': 'Wes',
  'places_lived': ['United States', 'Spain', 'Germany'],
  'pet': null,
  'siblings': [{'name': 'Scott', 'age': 25, 'pet': 'Zuko'},
               {'name': 'Katie', 'age': 33, 'pet': 'Cisco'}]
}
"""

import json

# ValueError: Expecting property name: line 3 column 5 (char 7)
result = json.loads(obj)

obj = """
{
  "name": "Wes",
  "places_lived": ["United States", "Spain", "Germany"],
  "pet": null,
  "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
               {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""
obj

# With double quotes, the same call parses without the ValueError
result = json.loads(obj)
result

asjson = json.dumps(result)
# Note the " instead of '
asjson

siblings = DataFrame(result['siblings'], columns=['name', 'age'])
siblings

# Trying something not shown in the book makes it easier to understand
siblings2 = DataFrame(result['siblings'], columns=['name', 'age', 'pet'])
siblings2

from lxml.html import parse
from urllib2 import urlopen

# Pass the URL to fetch, then parse the response
parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
doc = parsed.getroot()
links = doc.findall('.//a')

# These objects merely represent HTML elements. To get the URL and the link
# text, call get('href') on each element for the URL and text_content() for
# the link name.
links[15:20]
lnk = links[28]
lnk
lnk.get('href')
lnk.text_content()

urls = [lnk.get('href') for lnk in doc.findall('.//a')]
len(urls)
urls[-3:-1]
urls[-10:]

tables = doc.findall('.//table')
tables
calls = tables[9]
calls
puts = tables[13]

rows = calls.findall('.//tr')
rows

def _unpack(row, kind='td'):
    elts = row.findall('.//%s' % kind)
    return [val.text_content() for val in elts]

_unpack(rows[0], kind='th')
_unpack(rows[1], kind='td')

from pandas.io.parsers import TextParser

def parse_options_data(table):
    rows = table.findall('.//tr')
    # rows[0] is the header
    header = _unpack(rows[0], kind='th')
    # rows[1:] holds the actual data
    data = [_unpack(r) for r in rows[1:]]
    # Hand the data to TextParser, using header for the column names
    return TextParser(data, names=header).get_chunk()

# call option data
call_data = parse_options_data(calls)
# put option data
put_data = parse_options_data(puts)
call_data[:10]
put_data[:10]

%%writefile ch06/Performance_MNR.xml
<PERFORMANCE>
  <INDICATOR>
    <INDICATOR_SEQ>373889</INDICATOR_SEQ>
    <PARENT_SEQ></PARENT_SEQ>
    <AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>
    <INDICATOR_NAME>Escalator Availability</INDICATOR_NAME>
    <DESCRIPTION>Percent of the time that escalators are operational
    systemwide. The availability rate is based on physical observations
    performed the morning of regular business days only. This is a new
    indicator the agency began reporting in 2009.</DESCRIPTION>
    <PERIOD_YEAR>2011</PERIOD_YEAR>
    <PERIOD_MONTH>12</PERIOD_MONTH>
    <CATEGORY>Service Indicators</CATEGORY>
    <FREQUENCY>M</FREQUENCY>
    <DESIRED_CHANGE>U</DESIRED_CHANGE>
    <INDICATOR_UNIT>%</INDICATOR_UNIT>
    <DECIMAL_PLACES>1</DECIMAL_PLACES>
    <YTD_TARGET>97.00</YTD_TARGET>
    <MONTHLY_TARGET>97.00</MONTHLY_TARGET>
  </INDICATOR>
</PERFORMANCE>

from lxml import objectify
import urllib2

path = 'ch06/Performance_MNR.xml'
# online_path = 'http://www.mta.info/developers/data/lirr/lirr_gtfs.xml'
# data = urllib2.urlopen(online_path).read()
# f = open(path, 'w')
# f.write(data)
# f.close()

# Parse the file with objectify
parsed = objectify.parse(open(path))
root = parsed.getroot()

data = []
skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ', 'DESIRED_CHANGE',
               'DECIMAL_PLACES']

# The book iterates over root.INDICATOR; here the root's children are the
# INDICATOR elements, so iterate over root directly
for elt in root:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag] = child.pyval
    data.append(el_data)

data

# Printed once more to compare against the values above
for elt in root:
    for child in elt.getchildren():
        print child.tag, child.pyval

perf = DataFrame(data)
perf
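# Not from the book -- objectify also exposes the parsed tree via attribute
# access, so a single record's fields can be read directly (assuming the
# PERFORMANCE/INDICATOR layout written above).
ind = root.INDICATOR           # the first INDICATOR child of the root
print ind.AGENCY_NAME          # Metro-North Railroad
print ind.YTD_TARGET.pyval     # 97.0 -- pyval converts to a native Python type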
frame = pd.read_csv('ch06/ex1.csv')
frame
frame.save('ch06/frame_pickle')
pd.load('ch06/frame_pickle')

# HDF5 needs the PyTables library: install it and give this a try.
store = pd.HDFStore('ch06/mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
store
store['obj1']

xls_file = pd.ExcelFile('data.xls')
table = xls_file.parse('Sheet1')

import requests
url = 'http://search.twitter.com/search.json?q=python%20pandas'
resp = requests.get(url)
resp
resp.text

import sqlite3

query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL,        d INTEGER
);"""
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()

data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()

cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows

cursor.description?
cursor.description

# zip(*cursor.description) transposes the description tuples;
# [0] picks out the column names
zip(*cursor.description)[0]
DataFrame(rows, columns=zip(*cursor.description)[0])
DataFrame(rows, columns=zip(*cursor.description)[1])

# Turns out cursor.description is a read-only attribute.
# I wanted to modify it to get the column names I wanted -- so how do you
# change them?
cursor.description = '1'

# Just pass whatever names you want through the columns argument.
DataFrame(rows, columns=['country', 'state', 'grade1', 'grade2'])

import pandas.io.sql as sql
sql.read_frame('select * from test', con)
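# Not from the book -- the cursor -> DataFrame pattern above wrapped in a
# small helper (frame_from_query is a made-up name, just a sketch).
def frame_from_query(con, query):
    cursor = con.execute(query)
    # the first element of each description tuple is the column name
    cols = [desc[0] for desc in cursor.description]
    return DataFrame(cursor.fetchall(), columns=cols)

frame_from_query(con, 'select a, c from test')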