import pandas as pd
from StringIO import StringIO
from IPython.core.display import display
%pylab inline

## Specifying the index column
foo_csv = StringIO("""date,A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
""")

print '>> Without specifying an index column'
foo_csv.seek(0)
display(pd.read_csv(foo_csv))

print '>> index column = 0 (COLUMN INDEXING starts at 0)'
foo_csv.seek(0)
display(pd.read_csv(foo_csv, index_col=0))

print '>> index column = date'
foo_csv.seek(0)
display(pd.read_csv(foo_csv, index_col='date'))

print '>> parsing the index column date as dates'
foo_csv.seek(0)
display(pd.read_csv(foo_csv, index_col='date', parse_dates=['date']))

print ">> hierarchical index = [0, 'A']"
foo_csv.seek(0)
display(pd.read_csv(foo_csv, index_col=[0, 'A'], parse_dates=[0]))

## Speaking different CSV dialects
## Use the dialect keyword to specify the file format; by default
## read_csv uses the Excel dialect, but you can pass either a
## dialect name or a csv.Dialect instance.
data = """label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""

## By default, read_csv uses the excel dialect and treats the double
## quote " as the quote character, which causes it to fail when it
## finds a newline before it finds the closing double quote.
try:
    pd.read_csv(StringIO(data))
except:
    print 'CParserError raised'

## We can get around this by using a dialect
import csv
dia = csv.excel()
dia.quoting = csv.QUOTE_NONE
display(pd.read_csv(StringIO(data), dialect=dia))
## equivalently
display(pd.read_csv(StringIO(data), quoting=csv.QUOTE_NONE))

## THINGS TO NOTE: all of the dialect options can also be specified
## as separate keyword arguments, including
## *thousands* (str, the thousands separator; a short sketch appears
##   a few cells below),
## *lineterminator* (str of len 1, the char that breaks the file into lines),
## *quotechar* (str, the char denoting the start/end of a quoted item;
##   quoted items can include the delimiter, which is then ignored,
##   the main purpose of quoting),
## *quoting* (int: csv.QUOTE_MINIMAL, csv.QUOTE_ALL, csv.QUOTE_NONE,
##   or csv.QUOTE_NONNUMERIC),
## *skipinitialspace* (bool, default False; ignore spaces after the delimiter),
## *escapechar* (str, specifies how to escape quoted data),
## *comment* (str, char indicating the remainder of the line should
##   not be parsed),
## *encoding* (e.g., 'utf-8' or 'latin-1').
## LOOK UP the csv.Dialect documentation for details.

## e.g. extra whitespace after delimiters and a nonstandard line terminator
data = 'a, b, c~1, 2, 3~4, 5, 6'
display(pd.read_csv(StringIO(data), lineterminator='~', skipinitialspace=True))

## Specifying column data types
## You can indicate the data type for the whole DataFrame
## or for individual columns.
data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9'
print pd.read_csv(StringIO(data)).dtypes
print pd.read_csv(StringIO(data), dtype='object').dtypes
print pd.read_csv(StringIO(data), dtype={'a': 'str', 'b': 'float'}).dtypes
## NOTE: strings are objects in numpy

## Overwrite header names by specifying the 'names' param;
## you can choose to keep or throw away the original header
## by setting the 'header' param.
data = """a,b,c
1,2,3
4,5,6
7,8,9
"""
display(pd.read_csv(StringIO(data)))
display(pd.read_csv(StringIO(data), names=list('ABC'), header=None))
display(pd.read_csv(StringIO(data), names=list('ABC'), header=0))
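## As promised in the dialect notes above, a minimal sketch of the
## *thousands* option. The toy data below is my own (it mirrors the
## style of the pandas docs) and is not part of the original notebook:
data = 'ID|level|category\nPatient1|123,000|x\nPatient2|23,000|y\nPatient3|1,234,018|z'
df_t = pd.read_csv(StringIO(data), sep='|', thousands=',')
display(df_t)
print df_t.level.dtype   # the level column parses as integers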
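## Related to the dtype examples above: when a column needs an
## arbitrary transformation at parse time rather than a plain dtype,
## read_csv also accepts a 'converters' mapping from column to
## function. A minimal sketch; the upper-casing is just a toy
## transformation of my own:
data = 'a,b,c\n1,x,3\n4,y,6'
display(pd.read_csv(StringIO(data), converters={'b': lambda v: v.upper()}))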
## Filtering columns (usecols)
## The 'usecols' param lets you select any subset of the columns in a
## file, by column name or by position. According to the docs, this
## usually gives much faster parsing and lower memory usage.
data = 'a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz'
display(pd.read_csv(StringIO(data)))
display(pd.read_csv(StringIO(data), usecols=['b', 'd']))

## Dealing with Unicode data
## The encoding arg should be used for encoded unicode data,
## so that byte strings are decoded to unicode in the result.
data = 'word,length\nTr\xe4umen,7\nGr\xfc\xdfe,5'
df = pd.read_csv(StringIO(data), encoding='latin-1')
df

## Specifying date columns
## The 'parse_dates' and 'date_parser' params let users specify a
## variety of columns and date/time formats (a hedged date_parser
## sketch appears further below).
tmp_data = """KORD,19990127, 19:00:00, 18:56:00, 0.8100
KORD,19990127, 20:00:00, 19:56:00, 0.0100
KORD,19990127, 21:00:00, 20:56:00, -0.5900
KORD,19990127, 21:00:00, 21:18:00, -0.9900
KORD,19990127, 22:00:00, 21:56:00, -0.5900
KORD,19990127, 23:00:00, 22:56:00, -0.5900
"""
## COMBINE DIFFERENT COLS to parse dates
pd.read_csv(StringIO(tmp_data), header=None,
            parse_dates={'end': [1, 2], 'start': [1, 3]})

## Handling Boolean values
## Use true_values and false_values to specify the true/false tokens
data = 'a,b,c\n1,Yes,2\n3,No,4'
df1 = pd.read_csv(StringIO(data))
display(df1)
print df1.dtypes
df2 = pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No'])
display(df2)
print df2.dtypes

## AUTO-skipping bad lines
data = 'a,b,c\n1,2,3\n4,5,6,7\n8,9,10'
try:
    pd.read_csv(StringIO(data))
except Exception, ex:
    print ex
display(pd.read_csv(StringIO(data), error_bad_lines=False))

## Quoting and escaping characters
## Quotes (and other escape characters) in embedded fields can be
## handled in any number of ways. One way is to use backslashes; to
## properly parse such data you should pass the escapechar option.
data = 'a,b\n"hello, \\"Bob\\", nice to see you",5'
pd.read_csv(StringIO(data), escapechar='\\')

## ITERATING through files chunk by chunk
## Suppose you wish to iterate through a (potentially very large) file
## LAZILY rather than reading the entire file into memory.
## 1. BY SPECIFYING "chunksize" to read_csv() or read_table(),
##    THE RETURN VALUE WILL BE AN ITERABLE OBJECT OF TYPE
##    "TextFileReader"
tmp_sv = """|0|1|2|3
0|0.4691122999071863|-0.2828633443286633|-1.5090585031735124|-1.1356323710171934
1|1.2121120250208506|-0.17321464905330858|0.11920871129693428|-1.0442359662799567
2|-0.8618489633477999|-2.1045692188948086|-0.4949292740687813|1.071803807037338
3|0.7215551622443669|-0.7067711336300845|-1.0395749851146963|0.27185988554282986
4|-0.42497232978883753|0.567020349793672|0.27623201927771873|-1.0874006912859915
5|-0.6736897080883706|0.1136484096888855|-1.4784265524372235|0.5249876671147047
6|0.4047052186802365|0.5770459859204836|-1.7150020161146375|-1.0392684835147725
7|-0.3706468582364464|-1.1578922506419993|-1.344311812731667|0.8448851414248841
8|1.0757697837155533|-0.10904997528022223|1.6435630703622064|-1.4693879595399115
9|0.35702056413309086|-0.6746001037299882|-1.776903716971867|-0.9689138124473498
"""
reader = pd.read_table(StringIO(tmp_sv), sep='|', chunksize=4)
print type(reader)
for chunk in reader:
    print type(chunk)
    print chunk

## 2. SPECIFYING iterator=True WILL ALSO RETURN THE
##    TextFileReader object
reader = pd.read_table(StringIO(tmp_sv), sep='|', iterator=True)
chunk = reader.get_chunk(5)
print type(chunk)
print chunk

## WRITING to CSV
## Two BOOL parameters, 'header' and 'index', specify whether the
## header row and the index are written to the file.
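## A minimal sketch of to_csv with those two parameters, reusing df2
## from the boolean cell above (the path 'data/foo.csv' is my own
## choice; the data/ directory is the one used by the cells below):
df2.to_csv('data/foo.csv', header=True, index=False)
print open('data/foo.csv').read()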
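## As promised in the date-parsing cell above, a hedged sketch of
## 'date_parser': a function that receives the raw column values and
## returns parsed datetimes. The helper below is my own (this pandas
## vintage also ships ready-made converters in pandas.io.date_converters):
import datetime as dt
def parse_date_time(dates, times):
    ## str() guards against the date column arriving as integers
    return [dt.datetime.strptime('%s %s' % (d, str(t).strip()),
                                 '%Y%m%d %H:%M:%S')
            for d, t in zip(dates, times)]
pd.read_csv(StringIO(tmp_data), header=None,
            parse_dates={'end': [1, 2]}, date_parser=parse_date_time)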
from IPython.core.display import HTML
HTML(df.to_html())

## Pickling round-trip
df.to_pickle('data/foo.pkl')
!ls data/foo*
dff = pd.read_pickle('data/foo.pkl')
dff

## Create or read an HDF5 store
import pandas as pd
from numpy.random import randn
store = pd.HDFStore('data/store.h5', 'w')
print store

## Adding some objects
index = pd.date_range('1/1/2000', periods=8, freq='D')
s = pd.Series(randn(5), index=list('abcde'))
df = pd.DataFrame(randn(8, 3), index=index, columns=list('ABC'))
store['s'] = s
store['df'] = df
print store

## Reading an object back
print store['df']
store.close()

## Read back objects as from a dict
store = pd.HDFStore('data/store.h5', 'r')
print store.df
store.close()

## Delete an object as from a dict
store = pd.HDFStore('data/store.h5', 'a')  # reopen in append mode
del store['s']
store.close()

## Use HDF5 in a managed context (SO painful to remember to close the file)
with pd.get_store('data/store.h5') as store:
    print store.keys()

df = pd.DataFrame(dict(A=range(5), B=range(5)))
## The append param is significant here: it makes 'atable' a table
df.to_hdf('data/store.h5', 'atable', append=True)
pd.read_hdf('data/store.h5', 'atable', where=['index>2'])

with pd.get_store('data/store.h5') as store:
    print store

## A storer format will raise a TypeError if you try to
## retrieve it with a "where" parameter
pd.DataFrame(np.random.rand(10, 2)).to_hdf('data/simple_storer.h5', 'df')
try:
    pd.read_hdf('data/simple_storer.h5', 'df', where='index > 5')
except TypeError, ex:
    print 'TypeError raised:', ex

## Delete any df_table left over from a previous run
with pd.get_store('data/store.h5') as store:
    if 'df_table' in store:
        del store['df_table']

## On the other hand, the table format (created with append, or by
## passing append=True to to_hdf) supports appending and querying
with pd.get_store('data/store.h5') as store:
    df1 = df.iloc[:2]
    df2 = df.iloc[2:]
    store.append('df_table', df1)
    df2.to_hdf('data/store.h5', 'df_table', append=True)
    print store

## Now the table format in the store is queryable
with pd.get_store('data/store.h5') as store:
    display(store.df_table)
    display(store.select('df_table', where=['index > 2']))

## The hierarchical nature of the HDF store
with pd.get_store('data/store.h5') as store:
    store.put('foo/bar/bah', df)
    store.append('food/orange', df)
    store.append('food/apple', df)
    print '\n', store
    print '\n', store.keys()
    store.remove('food')
    print '\n', store

with pd.get_store('data/store.h5') as store:
    display(store.df_table)
    print 'QUERIES APPLY ONLY TO THE table FORMAT, NOT TO storer'
    print '>> Selecting column B only'
    display(store.select('df_table', columns=['B']))
    print '>> Equivalently, using where='
    display(store.select('df_table', where=[('columns', '=', 'B')]))

with pd.get_store('data/store.h5') as store:
    store.create_table_index('atable', optlevel=9, kind='full')
    print store.atable.dtypes
    print store.select('atable', where=['index > 1', 'index < 3'])
    ## NOT VALID yet, because B is not a data column:
    # print store.select('atable', where=pd.Term('B > 3'))
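## A minimal sketch making the commented-out line above explicit:
## querying a regular column fails until it is declared a data column
## (the exact exception type and text vary with the pandas version):
with pd.get_store('data/store.h5') as store:
    try:
        store.select('atable', where=pd.Term('B > 3'))
    except Exception, ex:
        print 'query on a non-data column failed:', ex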
## INDEXING DATA COLUMNS SO QUERIES CAN BE MADE ON THEM LATER
## BUILD THE INDEX ONCE AND IT WILL ALWAYS BE THERE!
## SINCE DATA COLUMNS CAN ONLY BE DECLARED WHEN YOU CREATE THE TABLE,
## YOU SHOULD INCLUDE ALL INTERESTING COLUMNS AT THAT MOMENT
## (to add new data columns later you must delete the table and
## recreate it, which is why any leftover table is dropped first).
with pd.get_store('data/store.h5') as store:
    if 'dff' in store:
        del store['dff']
    dff = store.atable.copy()
    dff.index = range(dff.shape[0])
    dff.ix[:3, 'A'] = 100
    dff['string'] = 'foo'
    dff.ix[1:3, 'string'] = np.nan
    ## .ix rather than chained indexing, which may silently fail to write
    dff.ix[7:8, 'string'] = 'bar'
    dff['string2'] = 'cool'
    print dff
    store.append('dff', dff, data_columns=['A', 'B', 'string', 'string2'])

## Selecting NaN values with the string 'nan'
with pd.get_store('data/store.h5') as store:
    print store.select('dff', where=['string == foo', 'B == 4'])
    print
    print store.select('dff', where=['string != nan'])

## Iterating mode: read a stored table back lazily, chunk by chunk
for df in pd.read_hdf('data/store.h5', 'dff', chunksize=3):
    print df

## SQL round-trip via sqlite3
import sqlite3
df = pd.DataFrame(dict(id=[26, 42, 63],
                       Date=pd.date_range('20121018', periods=3, freq='D'),
                       Col1=list('XYZ'),
                       Col2=[25.7, -12.4, 5.73],
                       Col3=[True, False, True]))
conn = sqlite3.connect(':memory:')
display(df)
df.to_sql('df', conn)
pd.read_sql('select Col1, Col2, Date from df where Date > "2012-10-19"', conn)
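## A hedged sketch of two related conveniences (check both against
## your own pandas version): if_exists controls what to_sql does when
## the table already exists, and params feeds query parameters through
## the DB-API driver instead of string interpolation.
df.to_sql('df', conn, if_exists='replace')  # 'fail' / 'replace' / 'append'
print pd.read_sql('select * from df where Col2 > ?', conn, params=(0,))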