import pandas as pd
from StringIO import StringIO
from IPython.core.display import display
%pylab inline

## Specifying the index column
foo_csv = StringIO("""date,A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
""")

print '>> Without specifying an index column'
foo_csv.seek(0)
display(pd.read_csv(foo_csv))

print '>> index column = 0 (COLUMN INDEXING starts at 0)'
foo_csv.seek(0)
display(pd.read_csv(foo_csv, index_col=0))

print '>> index column = date'
foo_csv.seek(0)
display(pd.read_csv(foo_csv, index_col='date'))

print '>> parsing the index column date as dates'
foo_csv.seek(0)
display(pd.read_csv(foo_csv, index_col='date', parse_dates=['date']))

print ">> hierarchical index = [0, 'A']"
foo_csv.seek(0)
display(pd.read_csv(foo_csv, index_col=[0, 'A'], parse_dates=[0]))

## Speaking different CSV dialects
## Use the dialect keyword to specify the file format; by default
## read_csv uses the Excel dialect, but you can pass either a
## dialect name or a csv.Dialect instance.
data = """label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""

## By default, read_csv uses the excel dialect and treats the double
## quote " as the quote character, which causes it to fail when it
## finds a newline before it finds the closing double quote.
try:
    pd.read_csv(StringIO(data))
except:
    print 'CParserError raised'

## We can get around this by using a dialect
import csv
dia = csv.excel()
dia.quoting = csv.QUOTE_NONE
display(pd.read_csv(StringIO(data), dialect=dia))
## equivalently
display(pd.read_csv(StringIO(data), quoting=csv.QUOTE_NONE))

## THINGS TO NOTE: all of the dialect options can also be specified
## as separate keyword arguments, including
## *thousands* (str, the thousands separator; a short sketch appears
##   a few cells below),
## *lineterminator* (str of len 1, the char that breaks the file into lines),
## *quotechar* (str, the char denoting the start/end of a quoted item;
##   quoted items can include the delimiter, which is then ignored,
##   the main purpose of quoting),
## *quoting* (int: csv.QUOTE_MINIMAL, csv.QUOTE_ALL, csv.QUOTE_NONE,
##   or csv.QUOTE_NONNUMERIC),
## *skipinitialspace* (bool, default False; ignore spaces after the delimiter),
## *escapechar* (str, specifies how to escape quoted data),
## *comment* (str, char indicating the remainder of the line should
##   not be parsed),
## *encoding* (e.g., 'utf-8' or 'latin-1').
## LOOK UP the csv.Dialect documentation for details.

## e.g. extra whitespace after delimiters and a nonstandard line terminator
data = 'a, b, c~1, 2, 3~4, 5, 6'
display(pd.read_csv(StringIO(data), lineterminator='~', skipinitialspace=True))

## Specifying column data types
## You can indicate the data type for the whole DataFrame
## or for individual columns.
data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9'
print pd.read_csv(StringIO(data)).dtypes
print pd.read_csv(StringIO(data), dtype='object').dtypes
print pd.read_csv(StringIO(data), dtype={'a': 'str', 'b': 'float'}).dtypes
## NOTE: strings are objects in numpy

## Overwrite header names by specifying the 'names' param;
## you can choose to keep or throw away the original header
## by setting the 'header' param.
data = """a,b,c
1,2,3
4,5,6
7,8,9
"""
display(pd.read_csv(StringIO(data)))
display(pd.read_csv(StringIO(data), names=list('ABC'), header=None))
display(pd.read_csv(StringIO(data), names=list('ABC'), header=0))
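## As promised in the dialect notes above, a minimal sketch of the
## *thousands* option. The toy data below is my own (it mirrors the
## style of the pandas docs) and is not part of the original notebook:
data = 'ID|level|category\nPatient1|123,000|x\nPatient2|23,000|y\nPatient3|1,234,018|z'
df_t = pd.read_csv(StringIO(data), sep='|', thousands=',')
display(df_t)
print df_t.level.dtype   # the level column parses as integers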
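## Related to the dtype examples above: when a column needs an
## arbitrary transformation at parse time rather than a plain dtype,
## read_csv also accepts a 'converters' mapping from column to
## function. A minimal sketch; the upper-casing is just a toy
## transformation of my own:
data = 'a,b,c\n1,x,3\n4,y,6'
display(pd.read_csv(StringIO(data), converters={'b': lambda v: v.upper()}))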
## Filtering columns (usecols)
## The 'usecols' param lets you select any subset of the columns in a
## file, by column name or by position. According to the docs, this
## usually gives much faster parsing and lower memory usage.
data = 'a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz'
display(pd.read_csv(StringIO(data)))
display(pd.read_csv(StringIO(data), usecols=['b', 'd']))

## Dealing with Unicode data
## The encoding arg should be used for encoded unicode data,
## so that byte strings are decoded to unicode in the result.
data = 'word,length\nTr\xe4umen,7\nGr\xfc\xdfe,5'
df = pd.read_csv(StringIO(data), encoding='latin-1')
df

## Specifying date columns
## The 'parse_dates' and 'date_parser' params let users specify a
## variety of columns and date/time formats (a hedged date_parser
## sketch appears further below).
tmp_data = """KORD,19990127, 19:00:00, 18:56:00, 0.8100
KORD,19990127, 20:00:00, 19:56:00, 0.0100
KORD,19990127, 21:00:00, 20:56:00, -0.5900
KORD,19990127, 21:00:00, 21:18:00, -0.9900
KORD,19990127, 22:00:00, 21:56:00, -0.5900
KORD,19990127, 23:00:00, 22:56:00, -0.5900
"""
## COMBINE DIFFERENT COLS to parse dates
pd.read_csv(StringIO(tmp_data), header=None,
            parse_dates={'end': [1, 2], 'start': [1, 3]})

## Handling Boolean values
## Use true_values and false_values to specify the true/false tokens
data = 'a,b,c\n1,Yes,2\n3,No,4'
df1 = pd.read_csv(StringIO(data))
display(df1)
print df1.dtypes
df2 = pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No'])
display(df2)
print df2.dtypes

## AUTO-skipping bad lines
data = 'a,b,c\n1,2,3\n4,5,6,7\n8,9,10'
try:
    pd.read_csv(StringIO(data))
except Exception, ex:
    print ex
display(pd.read_csv(StringIO(data), error_bad_lines=False))

## Quoting and escaping characters
## Quotes (and other escape characters) in embedded fields can be
## handled in any number of ways. One way is to use backslashes; to
## properly parse such data you should pass the escapechar option.
data = 'a,b\n"hello, \\"Bob\\", nice to see you",5'
pd.read_csv(StringIO(data), escapechar='\\')

## ITERATING through files chunk by chunk
## Suppose you wish to iterate through a (potentially very large) file
## LAZILY rather than reading the entire file into memory.
## 1. BY SPECIFYING "chunksize" to read_csv() or read_table(),
##    THE RETURN VALUE WILL BE AN ITERABLE OBJECT OF TYPE
##    "TextFileReader"
tmp_sv = """|0|1|2|3
0|0.4691122999071863|-0.2828633443286633|-1.5090585031735124|-1.1356323710171934
1|1.2121120250208506|-0.17321464905330858|0.11920871129693428|-1.0442359662799567
2|-0.8618489633477999|-2.1045692188948086|-0.4949292740687813|1.071803807037338
3|0.7215551622443669|-0.7067711336300845|-1.0395749851146963|0.27185988554282986
4|-0.42497232978883753|0.567020349793672|0.27623201927771873|-1.0874006912859915
5|-0.6736897080883706|0.1136484096888855|-1.4784265524372235|0.5249876671147047
6|0.4047052186802365|0.5770459859204836|-1.7150020161146375|-1.0392684835147725
7|-0.3706468582364464|-1.1578922506419993|-1.344311812731667|0.8448851414248841
8|1.0757697837155533|-0.10904997528022223|1.6435630703622064|-1.4693879595399115
9|0.35702056413309086|-0.6746001037299882|-1.776903716971867|-0.9689138124473498
"""
reader = pd.read_table(StringIO(tmp_sv), sep='|', chunksize=4)
print type(reader)
for chunk in reader:
    print type(chunk)
    print chunk

## 2. SPECIFYING iterator=True WILL ALSO RETURN THE
##    TextFileReader object
reader = pd.read_table(StringIO(tmp_sv), sep='|', iterator=True)
chunk = reader.get_chunk(5)
print type(chunk)
print chunk

## WRITING to CSV
## Two BOOL parameters, 'header' and 'index', specify whether the
## header row and the index are written to the file.
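## A minimal sketch of to_csv with those two parameters, reusing df2
## from the boolean cell above (the path 'data/foo.csv' is my own
## choice; the data/ directory is the one used by the cells below):
df2.to_csv('data/foo.csv', header=True, index=False)
print open('data/foo.csv').read()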
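## As promised in the date-parsing cell above, a hedged sketch of
## 'date_parser': a function that receives the raw column values and
## returns parsed datetimes. The helper below is my own (this pandas
## vintage also ships ready-made converters in pandas.io.date_converters):
import datetime as dt
def parse_date_time(dates, times):
    ## str() guards against the date column arriving as integers
    return [dt.datetime.strptime('%s %s' % (d, str(t).strip()),
                                 '%Y%m%d %H:%M:%S')
            for d, t in zip(dates, times)]
pd.read_csv(StringIO(tmp_data), header=None,
            parse_dates={'end': [1, 2]}, date_parser=parse_date_time)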
from IPython.core.display import HTML
HTML(df.to_html())

## Pickling round-trip
df.to_pickle('data/foo.pkl')
!ls data/foo*
dff = pd.read_pickle('data/foo.pkl')
dff

## Create or read an HDF5 store
import pandas as pd
from numpy.random import randn
store = pd.HDFStore('data/store.h5', 'w')
print store

## Adding some objects
index = pd.date_range('1/1/2000', periods=8, freq='D')
s = pd.Series(randn(5), index=list('abcde'))
df = pd.DataFrame(randn(8, 3), index=index, columns=list('ABC'))
store['s'] = s
store['df'] = df
print store

## Reading an object back
print store['df']
store.close()

## Read back objects as from a dict
store = pd.HDFStore('data/store.h5', 'r')
print store.df
store.close()

## Delete an object as from a dict
store = pd.HDFStore('data/store.h5', 'a')  # reopen in append mode
del store['s']
store.close()

## Use HDF5 in a managed context (SO painful to remember to close the file)
with pd.get_store('data/store.h5') as store:
    print store.keys()

df = pd.DataFrame(dict(A=range(5), B=range(5)))
## The append param is significant here: it makes 'atable' a table
df.to_hdf('data/store.h5', 'atable', append=True)
pd.read_hdf('data/store.h5', 'atable', where=['index>2'])

with pd.get_store('data/store.h5') as store:
    print store

## A storer format will raise a TypeError if you try to
## retrieve it with a "where" parameter
pd.DataFrame(np.random.rand(10, 2)).to_hdf('data/simple_storer.h5', 'df')
try:
    pd.read_hdf('data/simple_storer.h5', 'df', where='index > 5')
except TypeError, ex:
    print 'TypeError raised:', ex

## Delete any df_table left over from a previous run
with pd.get_store('data/store.h5') as store:
    if 'df_table' in store:
        del store['df_table']

## On the other hand, the table format (created with append, or by
## passing append=True to to_hdf) supports appending and querying
with pd.get_store('data/store.h5') as store:
    df1 = df.iloc[:2]
    df2 = df.iloc[2:]
    store.append('df_table', df1)
    df2.to_hdf('data/store.h5', 'df_table', append=True)
    print store

## Now the table format in the store is queryable
with pd.get_store('data/store.h5') as store:
    display(store.df_table)
    display(store.select('df_table', where=['index > 2']))

## The hierarchical nature of the HDF store
with pd.get_store('data/store.h5') as store:
    store.put('foo/bar/bah', df)
    store.append('food/orange', df)
    store.append('food/apple', df)
    print '\n', store
    print '\n', store.keys()
    store.remove('food')
    print '\n', store

with pd.get_store('data/store.h5') as store:
    display(store.df_table)
    print 'QUERIES APPLY ONLY TO THE table FORMAT, NOT TO storer'
    print '>> Selecting column B only'
    display(store.select('df_table', columns=['B']))
    print '>> Equivalently, using where='
    display(store.select('df_table', where=[('columns', '=', 'B')]))

with pd.get_store('data/store.h5') as store:
    store.create_table_index('atable', optlevel=9, kind='full')
    print store.atable.dtypes
    print store.select('atable', where=['index > 1', 'index < 3'])
    ## NOT VALID yet, because B is not a data column:
    # print store.select('atable', where=pd.Term('B > 3'))
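## A minimal sketch making the commented-out line above explicit:
## querying a regular column fails until it is declared a data column
## (the exact exception type and text vary with the pandas version):
with pd.get_store('data/store.h5') as store:
    try:
        store.select('atable', where=pd.Term('B > 3'))
    except Exception, ex:
        print 'query on a non-data column failed:', ex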
## INDEXING DATA COLUMNS SO QUERIES CAN BE MADE ON THEM LATER
## BUILD THE INDEX ONCE AND IT WILL ALWAYS BE THERE!
## SINCE DATA COLUMNS CAN ONLY BE DECLARED WHEN YOU CREATE THE TABLE,
## YOU SHOULD INCLUDE ALL INTERESTING COLUMNS AT THAT MOMENT
## (to add new data columns later you must delete the table and
## recreate it, which is why any leftover table is dropped first).
with pd.get_store('data/store.h5') as store:
    if 'dff' in store:
        del store['dff']
    dff = store.atable.copy()
    dff.index = range(dff.shape[0])
    dff.ix[:3, 'A'] = 100
    dff['string'] = 'foo'
    dff.ix[1:3, 'string'] = np.nan
    ## .ix rather than chained indexing, which may silently fail to write
    dff.ix[7:8, 'string'] = 'bar'
    dff['string2'] = 'cool'
    print dff
    store.append('dff', dff, data_columns=['A', 'B', 'string', 'string2'])

## Selecting NaN values with the string 'nan'
with pd.get_store('data/store.h5') as store:
    print store.select('dff', where=['string == foo', 'B == 4'])
    print
    print store.select('dff', where=['string != nan'])

## Iterating mode: read a stored table back lazily, chunk by chunk
for df in pd.read_hdf('data/store.h5', 'dff', chunksize=3):
    print df

## SQL round-trip via sqlite3
import sqlite3
df = pd.DataFrame(dict(id=[26, 42, 63],
                       Date=pd.date_range('20121018', periods=3, freq='D'),
                       Col1=list('XYZ'),
                       Col2=[25.7, -12.4, 5.73],
                       Col3=[True, False, True]))
conn = sqlite3.connect(':memory:')
display(df)
df.to_sql('df', conn)
pd.read_sql('select Col1, Col2, Date from df where Date > "2012-10-19"', conn)
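## A hedged sketch of two related conveniences (check both against
## your own pandas version): if_exists controls what to_sql does when
## the table already exists, and params feeds query parameters through
## the DB-API driver instead of string interpolation.
df.to_sql('df', conn, if_exists='replace')  # 'fail' / 'replace' / 'append'
print pd.read_sql('select * from df where Col2 > ?', conn, params=(0,))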