import pandas as pd
from StringIO import StringIO
from IPython.core.display import display
%pylab inline
Populating the interactive namespace from numpy and matplotlib
## Specifying index column
foo_csv = StringIO("""date,A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
""")
print '>> Without specifying index column'
foo_csv.seek(0)
display( pd.read_csv(foo_csv))
print '>> index column = 0, COLUMN INDEXING starts with 0'
foo_csv.seek(0)
display( pd.read_csv(foo_csv, index_col=0))
print '>> index column = date'
foo_csv.seek(0)
display( pd.read_csv(foo_csv, index_col='date'))
print '>> parsing index column date date'
foo_csv.seek(0)
display( pd.read_csv(foo_csv, index_col='date', parse_dates='date'))
print ">> hierachical index = [0, 'A']"
foo_csv.seek(0)
display( pd.read_csv(foo_csv, index_col = [0, 'A'],
parse_dates = 0))
>> Without specifying index column
date | A | B | C | |
---|---|---|---|---|
0 | 20090101 | a | 1 | 2 |
1 | 20090102 | b | 3 | 4 |
2 | 20090103 | c | 4 | 5 |
>> index column = 0, COLUMN INDEXING starts with 0
A | B | C | |
---|---|---|---|
date | |||
20090101 | a | 1 | 2 |
20090102 | b | 3 | 4 |
20090103 | c | 4 | 5 |
>> index column = date
A | B | C | |
---|---|---|---|
date | |||
20090101 | a | 1 | 2 |
20090102 | b | 3 | 4 |
20090103 | c | 4 | 5 |
>> parsing index column date date
A | B | C | |
---|---|---|---|
date | |||
2009-01-01 | a | 1 | 2 |
2009-01-02 | b | 3 | 4 |
2009-01-03 | c | 4 | 5 |
>> hierachical index = [0, 'A']
B | C | ||
---|---|---|---|
date | A | ||
2009-01-01 | a | 1 | 2 |
2009-01-02 | b | 3 | 4 |
2009-01-03 | c | 4 | 5 |
## Speaking different CSV dialects
## Use dialect keyword to specify the file format, by default
## it uses Excel dialect but you can specify either the
## dialect name or a csv.Dialect instace
data = """label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""
## By default, read_csv uses the excel dialect and treats
## the double quote "" as the quote character, which causes
## it to fail when it finds a newline before it finds the
## closing double quote.
try:
pd.read_csv(StringIO(data))
except:
print 'CParserError raised'
## we can get around this by using dialect
import csv
dia = csv.excel()
dia.quoting = csv.QUOTE_NONE
display(pd.read_csv(StringIO(data), dialect = dia))
## equivalently
display(pd.read_csv(StringIO(data), quoting=csv.QUOTE_NONE))
## THINGS TO NOTE: all of dialect options can be specified
## separately by keyword arguments including
## *thousands* (specifies the thousands separator),
## *lineterminator* (str of len 1, char to break files into lines)
## *quotechar* (str, the char to denote start/end of a quote item),
## Quoted items can include the delimiter and it will be ignored (main purpose of quoting)
## *quoting* (int, csv.QUOTE_MINIMAL, csv.QUOTE_ALL, csv.QUOTE_NONE, csv.QUOTE_NONNUMERIC)
## *skipinitialspace* (default=false, ignore spaces after delimiter)
## *escapechar* (str specifying how to escape quoted data)
## *comment* ()
## *encoding* (e.g., 'utf-8' or 'latin-1')
## LOOK UP csv.Dialect document for details
## e.g.
data = 'a, b, c~1, 2, 3~4, 5, 6' #extra whitespace, new liner
display(pd.read_csv(StringIO(data), lineterminator='~',
skipinitialspace=True))
CParserError raised
label1 | label2 | label3 | |
---|---|---|---|
index1 | "a | c | e |
index2 | b | d | f |
label1 | label2 | label3 | |
---|---|---|---|
index1 | "a | c | e |
index2 | b | d | f |
a | b | c | |
---|---|---|---|
0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 |
## Specifying column data types
## You can indicate the data type for the whole df
## or individual columns
data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9'
print pd.read_csv(StringIO(data)).dtypes
print pd.read_csv(StringIO(data), dtype='object').dtypes
print pd.read_csv(StringIO(data),
dtype = {'a':'str', 'b':'float'}).dtypes
## strings are objects in numpy
a int64 b int64 c int64 dtype: object a object b object c object dtype: object a object b float64 c int64 dtype: object
## Overwrite header names by specifying 'names' param
## you can choose to keep or throw away original headers
## by specifying 'header' param
data = """a,b,c
1,2,3
4,5,6
7,8,9
"""
display(pd.read_csv(StringIO(data)))
display(pd.read_csv(StringIO(data), names = list("ABC"), header = None))
display(pd.read_csv(StringIO(data), names = list("ABC"), header = 0))
a | b | c | |
---|---|---|---|
0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 |
2 | 7 | 8 | 9 |
A | B | C | |
---|---|---|---|
0 | a | b | c |
1 | 1 | 2 | 3 |
2 | 4 | 5 | 6 |
3 | 7 | 8 | 9 |
A | B | C | |
---|---|---|---|
0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 |
2 | 7 | 8 | 9 |
## Filtering columns (usecols)
## the 'usecols' param allows you to select any subset of the
## columns in a file, either the column names or position
## Based on doc, it usually results in much faster parsing time and lower memory usage.
data = 'a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz'
display(pd.read_csv(StringIO(data)))
display(pd.read_csv(StringIO(data), usecols=['b', 'd']))
a | b | c | d | |
---|---|---|---|---|
0 | 1 | 2 | 3 | foo |
1 | 4 | 5 | 6 | bar |
2 | 7 | 8 | 9 | baz |
b | d | |
---|---|---|
0 | 2 | foo |
1 | 5 | bar |
2 | 8 | baz |
## Dealing with Unicode data
## the encoding arg should be used for encoded unicode data
## which will result in byte strings being decoded to unicode
## in the result
data = 'word,length\nTr\xe4umen,7\nGr\xfc\xdfe,5'
df = pd.read_csv(StringIO(data), encoding='latin-1')
df
word | length | |
---|---|---|
0 | Träumen | 7 |
1 | Grüße | 5 |
## Specifying date columns
## the "parse_dates" and "date_parser" params allow
## users to specify a variety of cols and date/time formats
tmp_data = """KORD,19990127, 19:00:00, 18:56:00, 0.8100
KORD,19990127, 20:00:00, 19:56:00, 0.0100
KORD,19990127, 21:00:00, 20:56:00, -0.5900
KORD,19990127, 21:00:00, 21:18:00, -0.9900
KORD,19990127, 22:00:00, 21:56:00, -0.5900
KORD,19990127, 23:00:00, 22:56:00, -0.5900
"""
## COMBINE DIFFERENT COLS to parse date
pd.read_csv(StringIO(tmp_data), header = None,
parse_dates={'end': [1, 2], 'start': [1, 3]},)
start | end | 0 | 4 | |
---|---|---|---|---|
0 | 1999-01-27 18:56:00 | 1999-01-27 19:00:00 | KORD | 0.81 |
1 | 1999-01-27 19:56:00 | 1999-01-27 20:00:00 | KORD | 0.01 |
2 | 1999-01-27 20:56:00 | 1999-01-27 21:00:00 | KORD | -0.59 |
3 | 1999-01-27 21:18:00 | 1999-01-27 21:00:00 | KORD | -0.99 |
4 | 1999-01-27 21:56:00 | 1999-01-27 22:00:00 | KORD | -0.59 |
5 | 1999-01-27 22:56:00 | 1999-01-27 23:00:00 | KORD | -0.59 |
## Handling Boolean Values
## use true_values and false_values to specify true/false values
data = 'a,b,c\n1,Yes,2\n3,No,4'
df1 = pd.read_csv(StringIO(data))
display(df1)
print df1.dtypes
df2 = pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No'])
display(df2)
print df2.dtypes
a | b | c | |
---|---|---|---|
0 | 1 | Yes | 2 |
1 | 3 | No | 4 |
a int64 b object c int64 dtype: object
a | b | c | |
---|---|---|---|
0 | 1 | True | 2 |
1 | 3 | False | 4 |
a int64 b bool c int64 dtype: object
## AUTO skipping bad lines
data = 'a,b,c\n1,2,3\n4,5,6,7\n8,9,10'
try:
pd.read_csv(StringIO(data))
except Exception, ex:
print ex
display(pd.read_csv(StringIO(data), error_bad_lines=False))
Error tokenizing data. C error: Expected 3 fields in line 3, saw 4
Skipping line 3: expected 3 fields, saw 4
a | b | c | |
---|---|---|---|
0 | 1 | 2 | 3 |
1 | 8 | 9 | 10 |
## Quoting and Escaping characters
## Quotes (and other escape characters) in embedded fields can be handled in any number of ways.
## One way is to use backslashes;
## to properly parse this data, you should pass the escapechar option
data = 'a,b\n"hello, \\"Bob\\", nice to see you",5'
pd.read_csv(StringIO(data), escapechar='\\')
a | b | |
---|---|---|
0 | hello, "Bob", nice to see you | 5 |
## ITERATING through files chunk by chunk
## Suppose you wish to iterate through a (potentially very large) file
## LAZILY rather than reading the entire file into memory,
## such as the following:
## 1. BY SPECIFYING "chunksize" to read_csv() ror read_table(),
## THE RETURN VALUE WILL BE AN ITERABLE OBJECT OF TYPE
## "TextFileReader"
tmp_sv = """|0|1|2|3
0|0.4691122999071863|-0.2828633443286633|-1.5090585031735124|-1.1356323710171934
1|1.2121120250208506|-0.17321464905330858|0.11920871129693428|-1.0442359662799567
2|-0.8618489633477999|-2.1045692188948086|-0.4949292740687813|1.071803807037338
3|0.7215551622443669|-0.7067711336300845|-1.0395749851146963|0.27185988554282986
4|-0.42497232978883753|0.567020349793672|0.27623201927771873|-1.0874006912859915
5|-0.6736897080883706|0.1136484096888855|-1.4784265524372235|0.5249876671147047
6|0.4047052186802365|0.5770459859204836|-1.7150020161146375|-1.0392684835147725
7|-0.3706468582364464|-1.1578922506419993|-1.344311812731667|0.8448851414248841
8|1.0757697837155533|-0.10904997528022223|1.6435630703622064|-1.4693879595399115
9|0.35702056413309086|-0.6746001037299882|-1.776903716971867|-0.9689138124473498
"""
reader = pd.read_table(StringIO(tmp_sv), sep = '|', chunksize=4)
print type(reader)
for chunk in reader:
print type(chunk)
print chunk
<class 'pandas.io.parsers.TextFileReader'> <class 'pandas.core.frame.DataFrame'> Unnamed: 0 0 1 2 3 0 0 0.469112 -0.282863 -1.509059 -1.135632 1 1 1.212112 -0.173215 0.119209 -1.044236 2 2 -0.861849 -2.104569 -0.494929 1.071804 3 3 0.721555 -0.706771 -1.039575 0.271860 <class 'pandas.core.frame.DataFrame'> Unnamed: 0 0 1 2 3 0 4 -0.424972 0.567020 0.276232 -1.087401 1 5 -0.673690 0.113648 -1.478427 0.524988 2 6 0.404705 0.577046 -1.715002 -1.039268 3 7 -0.370647 -1.157892 -1.344312 0.844885 <class 'pandas.core.frame.DataFrame'> Unnamed: 0 0 1 2 3 0 8 1.075770 -0.10905 1.643563 -1.469388 1 9 0.357021 -0.67460 -1.776904 -0.968914
## 2. SPECIFYING iterator = True WILL ALSO RETURN THE
## TextFileReader object
reader = pd.read_table(StringIO(tmp_sv), sep="|", iterator=True, )
chunk = reader.get_chunk(5)
print type(chunk)
print chunk
<class 'pandas.core.frame.DataFrame'> Unnamed: 0 0 1 2 3 0 0 0.469112 -0.282863 -1.509059 -1.135632 1 1 1.212112 -0.173215 0.119209 -1.044236 2 2 -0.861849 -2.104569 -0.494929 1.071804 3 3 0.721555 -0.706771 -1.039575 0.271860 4 4 -0.424972 0.567020 0.276232 -1.087401
## WRITING to CSV
## two BOOL parameters 'header' and 'index' specifying
## whether writing header and index information to the file
The format of the JSON string
split: dict like {index -> [index], columns -> [columns], data -> [values]}
records: list like [{column -> value}, ... , {column -> value}]
index: dict like {index -> {column -> value}}
columns: dict like {column -> {index -> value}}
values: just the values array
from IPython.core.display import HTML
HTML (df.to_html())
word | length | |
---|---|---|
0 | Träumen | 7 |
1 | Grüße | 5 |
df.to_pickle('data/foo.pkl')
!ls data/foo*
data/foo.pkl
dff = pd.read_pickle('data/foo.pkl')
dff
word | length | |
---|---|---|
0 | Träumen | 7 |
1 | Grüße | 5 |
## create or read a store
import pandas as pd
from numpy.random import randn
store = pd.HDFStore('data/store.h5', 'w')
print store
## adding some objects
index = pd.date_range('1/1/2000', periods=8, freq='D')
s = pd.Series(randn(5), index = list("abcde"))
df = pd.DataFrame(randn(8, 3), index = index, columns = list("ABC"))
store['s'] = s
store['df'] = df
print store
## read object
print store['df']
store.close()
<class 'pandas.io.pytables.HDFStore'> File path: data/store.h5 Empty <class 'pandas.io.pytables.HDFStore'> File path: data/store.h5 /df frame (shape->[8,3]) /s series (shape->[5]) A B C 2000-01-01 -0.855040 0.713266 0.053736 2000-01-02 1.434090 1.091144 1.169743 2000-01-03 -0.650549 -1.112457 -1.901135 2000-01-04 0.047931 0.809756 -0.221455 2000-01-05 0.536011 -0.320523 -0.321210 2000-01-06 -0.326509 -2.570665 -0.559427 2000-01-07 1.207612 1.922518 -0.475120 2000-01-08 -0.484949 1.387578 1.840412
## read back objects as from a dict
store = pd.HDFStore('data/store.h5', 'r')
print store.df
A B C 2000-01-01 -0.855040 0.713266 0.053736 2000-01-02 1.434090 1.091144 1.169743 2000-01-03 -0.650549 -1.112457 -1.901135 2000-01-04 0.047931 0.809756 -0.221455 2000-01-05 0.536011 -0.320523 -0.321210 2000-01-06 -0.326509 -2.570665 -0.559427 2000-01-07 1.207612 1.922518 -0.475120 2000-01-08 -0.484949 1.387578 1.840412
## delete an object as from a dict
store.close()
store = pd.HDFStore('data/store.h5', 'a') # reopen in a mode
del store['s']
store.close()
## use hdf5 in a managed context (SO painful to rmbr to close the file)
with pd.get_store('data/store.h5') as store:
print store.keys()
['/df']
df = pd.DataFrame(dict(A = range(5), B = range(5)))
## Append param is significant here
df.to_hdf('data/store.h5', 'atable', append = True) ## make it a table
pd.read_hdf('data/store.h5', 'atable', where = ['index>2'])
with pd.get_store('data/store.h5') as store:
print store
<class 'pandas.io.pytables.HDFStore'> File path: data/store.h5 /atable frame_table (typ->appendable,nrows->10,ncols->2,indexers->[index]) /df frame (shape->[8,3])
put()
, it writed the HDF5 to PyTables in a fixed array format, called the storer format. These types of sotres are not appendable once written (though you can simply remove them and rewrite). Nor are they queryable. They must be retrieved in their entirety. These offer very fast writing and sligthly faster reading than table
storestable
format. Conecputually a table is shaped very much like a DataFrame, with rows and columns. A table
may be appended in the same or other sessions. In addition, delete & query type operations are supportedstorer
format is assumped by default. table
format is stipulated by using append()
method or append=True
parameter## A storer format will raise a TypeError if you try to
## tetrieve using a "where" parameter
DataFrame(rand(10, 2)).to_hdf('data/simple_storer.h5', 'df')
pd.read_hdf('data/simple_storer.h5', 'df', where = 'index > 5')
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-25-d06e9b79f1e5> in <module>() 1 ## A storer format will raise a TypeError if you try to 2 ## tetrieve using a "where" parameter ----> 3 DataFrame(rand(10, 2)).to_hdf('data/simple_storer.h5', 'df') 4 pd.read_hdf('data/simple_storer.h5', 'df', where = 'index > 5') NameError: name 'DataFrame' is not defined
## On the other hand, table format (by using append or append parameter)
## supports appending and query
with pd.get_store('data/store.h5') as store:
df1 = df.iloc[:2]
df2 = df.iloc[2:]
store.append('df_table', df1)
df2.to_hdf('data/store.h5', 'df_table', append=True)
print store
<class 'pandas.io.pytables.HDFStore'> File path: data/store.h5 /atable frame_table (typ->appendable,nrows->10,ncols->2,indexers->[index]) /df frame (shape->[8,3]) /df_table frame_table (typ->appendable,nrows->5,ncols->2,indexers->[index])
## Now the table format in store is querable
with pd.get_store('data/store.h5') as store:
display(store.df_table)
display(store.select('df_table', where=['index > 2']))
A | B | |
---|---|---|
0 | 0 | 0 |
1 | 1 | 1 |
2 | 2 | 2 |
3 | 3 | 3 |
4 | 4 | 4 |
A | B | |
---|---|---|
3 | 3 | 3 |
4 | 4 | 4 |
foo/bar/bah
), which will generate a herarchy of sub-stores (or Groups
in PyTables parlance). Keys can be specified with out the leading '/' and are ALWAYS absolute (e.g. 'foo' refers to '/foo'). Removal operations can remove everything in the sub-store and BELOW, so be careful.## Hierarchical nature of HDF store
with pd.get_store('data/store.h5') as store:
store.put('foo/bar/bah', df)
store.append('food/orange', df)
store.append('food/apple', df)
print '\n', store
print '\n', store.keys()
store.remove('food')
print '\n', store
<class 'pandas.io.pytables.HDFStore'> File path: data/store.h5 /atable frame_table (typ->appendable,nrows->10,ncols->2,indexers->[index]) /df frame (shape->[8,3]) /df_table frame_table (typ->appendable,nrows->5,ncols->2,indexers->[index]) /food/apple frame_table (typ->appendable,nrows->5,ncols->2,indexers->[index]) /food/orange frame_table (typ->appendable,nrows->5,ncols->2,indexers->[index]) /foo/bar/bah frame (shape->[5,2]) ['/atable', '/df', '/df_table', '/food/apple', '/food/orange', '/foo/bar/bah'] <class 'pandas.io.pytables.HDFStore'> File path: data/store.h5 /atable frame_table (typ->appendable,nrows->10,ncols->2,indexers->[index]) /df frame (shape->[8,3]) /df_table frame_table (typ->appendable,nrows->5,ncols->2,indexers->[index]) /foo/bar/bah frame (shape->[5,2])
select()
and delete()
operations have optional criterion that can be specified to select/delete only a subset of data. This allows one to have a very large on-disk table and retrieve only a portion of the data .Term
class under the hood.index
and columns
are supported indexers of a DataFramemajor_axis
, minor_axis
, and items
are supported indexers of the PanelTerm
can be created from dict
, list
, tuple
or string
. Allowed operations are <, <=, >, >=, =,!=. And = will be inferred as an implicit set operation (e.g., if two or more values are provided). The following are all valid terms:Queries
are built up using a list of Terms
(currently only adding of terms is supported). E.g., ['major_axis > 20000102', ('minor_axis', '=', ['A', 'B'])]. This is roughly translated to : major_axis must be greater than the date 20000102 and the minor_axis must be A or Bcolumns
keyword can be supplied to select a list of columns to be returned, this is equivalent to passing a Term('columns', list_of_columns_to_filter)data_columns
parameter to create indexing for columns in dataframe (table format only)*: There is some performance degredation by making lots of columns into data columns, so it is up to the user to designate these. In addition, you cannot change data columns (nor indexables) after the first append/put operation (Of course you can simply read in the data and create a new table!)You can also use the iterator with read_hdf which will open, then automatically close the store when finished iterating..
select_column()
to select a single column from table in a faster waywith pd.get_store('data/store.h5') as store:
display(store.df_table)
print 'QUERY CAN ONLY BE APPLIED TO table BUT NOT TO storer'
print '>> Selecting columns B and C only'
display(store.select('df_table', columns=['B']))
print '>> Equivalently using where='
display(store.select('df_table',
where = [('columns', '=', 'B')]))
A | B | |
---|---|---|
0 | 0 | 0 |
1 | 1 | 1 |
2 | 2 | 2 |
3 | 3 | 3 |
4 | 4 | 4 |
QUERY CAN ONLY BE APPLIED TO table BUT NOT TO storer >> Selecting columns B and C only
B | |
---|---|
0 | 0 |
1 | 1 |
2 | 2 |
3 | 3 |
4 | 4 |
>> Equivalently using where=
B | |
---|---|
0 | 0 |
1 | 1 |
2 | 2 |
3 | 3 |
4 | 4 |
with pd.get_store('data/store.h5') as store:
store.create_table_index('atable', optlevel=9, kind='full')
print store.atable.dtypes
print store.select('atable', where = ['index > 1', 'index < 3'])
## NOT VALID - BECAUSE DATA_COLUMNS ARE NOT INDEX YET
#print store.select('atable', where = pd.Term('B > 3'))
A int64 B int64 dtype: object A B 2 2 2 2 2 2
## INDEX DATA COLUMNS SO QUERY CAN BE MADE ON THEM LATER
with pd.get_store('data/store.h5') as store:
dff = store.atable.copy()
dff.index = range(dff.shape[0])
dff.ix[:3, 'A'] = 100
dff['string'] = 'foo'
dff.ix[1:3, 'string'] = np.nan
dff.iloc[7:9]['string'] = 'bar'
dff['string2'] = 'cool'
print dff
## indexing data columns
## BUILD ONCE AND IT WILL ALWAYS THERE!!
## SINCE IT SEEMS THAT YOU CAN ONLY DO THE INDEXING OF COLUMNS
## ONCE WHEN YOU CREATE THE TABLE, PROBABLY YOU SHOULD INCLUDE
## ALL INTERESTING COLUMNS AT THIS MOMENT.
store.append('dff', dff, data_columns=['A', 'B', 'string', 'string2'])
A B string string2 0 100 0 foo cool 1 100 1 NaN cool 2 100 2 NaN cool 3 100 3 NaN cool 4 4 4 foo cool 5 0 0 foo cool 6 1 1 foo cool 7 2 2 bar cool 8 3 3 bar cool 9 4 4 foo cool
## Selecting NaN values with string 'nan'
with pd.get_store('data/store.h5') as store:
print store.select('dff', where=['string == foo', 'B == 4'])
print
print store.select('dff', where = ('string != nan'))
A B string string2 4 4 4 foo cool 9 4 4 foo cool 4 4 4 foo cool 9 4 4 foo cool 4 4 4 foo cool 9 4 4 foo cool 4 4 4 foo cool 9 4 4 foo cool 4 4 4 foo cool 9 4 4 foo cool A B string string2 0 0 0 foo cool 4 4 4 foo cool 5 0 0 foo cool 6 1 1 foo cool 7 2 2 bar cool 8 3 3 bar cool 9 4 4 foo cool 0 0 0 foo cool 4 4 4 foo cool 5 0 0 foo cool 6 1 1 foo cool 7 2 2 bar cool 8 3 3 bar cool 9 4 4 foo cool 0 0 0 foo cool 4 4 4 foo cool 5 0 0 foo cool 6 1 1 foo cool 7 2 2 bar cool 8 3 3 bar cool 9 4 4 foo cool 0 -9223372036854775808 0 foo cool 4 4 4 foo cool 5 0 0 foo cool 6 1 1 foo cool 7 2 2 bar cool 8 3 3 bar cool 9 4 4 foo cool 0 100 0 foo cool 4 4 4 foo cool 5 0 0 foo cool 6 1 1 foo cool 7 2 2 bar cool 8 3 3 bar cool 9 4 4 foo cool
## Iterating mode - the in-context version of iterating
for df in pd.read_hdf('data/store.h5', 'dff', chunksize = 3):
print df
A B string string2 0 0 0 foo cool 1 1 1 NaN cool 2 2 2 NaN cool A B string string2 3 3 3 NaN cool 4 4 4 foo cool 5 0 0 foo cool A B string string2 6 1 1 foo cool 7 2 2 bar cool 8 3 3 bar cool A B string string2 9 4 4 foo cool 0 0 0 foo cool 1 1 1 NaN cool A B string string2 2 2 2 NaN cool 3 3 3 NaN cool 4 4 4 foo cool A B string string2 5 0 0 foo cool 6 1 1 foo cool 7 2 2 bar cool A B string string2 8 3 3 bar cool 9 4 4 foo cool 0 0 0 foo cool A B string string2 1 1 1 NaN cool 2 2 2 NaN cool 3 3 3 NaN cool A B string string2 4 4 4 foo cool 5 0 0 foo cool 6 1 1 foo cool A B string string2 7 2 2 bar cool 8 3 3 bar cool 9 4 4 foo cool A B string string2 0 -9223372036854775808 0 foo cool 1 -9223372036854775808 1 NaN cool 2 -9223372036854775808 2 NaN cool A B string string2 3 -9223372036854775808 3 NaN cool 4 4 4 foo cool 5 0 0 foo cool A B string string2 6 1 1 foo cool 7 2 2 bar cool 8 3 3 bar cool A B string string2 9 4 4 foo cool 0 100 0 foo cool 1 100 1 NaN cool A B string string2 2 100 2 NaN cool 3 100 3 NaN cool 4 4 4 foo cool A B string string2 5 0 0 foo cool 6 1 1 foo cool 7 2 2 bar cool A B string string2 8 3 3 bar cool 9 4 4 foo cool
import sqlite3
from pandas.io import sql
df = pd.DataFrame(dict(id = [26, 42, 63],
Date = pd.date_range("20121018", periods=3, freq='D'),
Col1 = list('XYZ'),
Col2 = [25.7, -12.4, 5.73],
Col3 = [True, False, True]))
conn = sqlite3.connect(":memory:")
display(df)
df.to_sql('df', conn)
pd.read_sql('select Col1, Col2, Date from df where Date > "2012-10-19"', conn)
Col1 | Col2 | Col3 | Date | id | |
---|---|---|---|---|---|
0 | X | 25.70 | True | 2012-10-18 00:00:00 | 26 |
1 | Y | -12.40 | False | 2012-10-19 00:00:00 | 42 |
2 | Z | 5.73 | True | 2012-10-20 00:00:00 | 63 |
Col1 | Col2 | Date | |
---|---|---|---|
0 | Y | -12.40 | 2012-10-19 00:00:00 |
1 | Z | 5.73 | 2012-10-20 00:00:00 |