#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import sqlite3

# # 9.1 Reading data from SQL databases

# So far we've only talked about reading data from CSV files. That's a pretty common way to store data, but there are many others! Pandas can read from HTML, JSON, SQL, Excel (!!!), HDF5, Stata, and a few other things. In this chapter we'll talk about reading data from SQL databases.
#
# You can read data from a SQL database using the `pd.read_sql` function. `read_sql` will automatically convert SQL column names to DataFrame column names.
#
# `read_sql` takes 2 arguments: a `SELECT` statement, and a database connection object. This is great because it means you can read from *any* kind of SQL database -- it doesn't matter if it's MySQL, SQLite, PostgreSQL, or something else.
#
# This example reads from a SQLite database, but any other database would work the same way.

# In[2]:

con = sqlite3.connect("../data/weather_2012.sqlite")
df = pd.read_sql("SELECT * from weather_2012 LIMIT 3", con)
df

# `read_sql` doesn't automatically set the primary key (`id`) to be the index of the dataframe. You can make it do that by adding an `index_col` argument to `read_sql`.
#
# If you've used `read_csv` a lot, you may have seen that it has an `index_col` argument as well. This one behaves the same way.

# In[3]:

df = pd.read_sql("SELECT * from weather_2012 LIMIT 3", con, index_col='id')
df

# If you want your dataframe to be indexed by more than one column, you can give a list of columns to `index_col`:

# In[4]:

df = pd.read_sql("SELECT * from weather_2012 LIMIT 3", con,
                 index_col=['id', 'date_time'])
df

# # 9.2 Writing to a SQLite database

# Dataframes have a `to_sql` method which creates a database table from a dataframe. Let's use it to move our 2012 weather data into SQL. (With a plain connection object like the one we're using here, this only works for SQLite; to write to other databases you'd pass a SQLAlchemy engine instead.)
#
# Under the hood, the reading and writing machinery lives in `pd.io.sql`. There are a ton of useful functions for reading and writing various kinds of data in `pd.io`, and it's worth spending some time exploring them. ([see the documentation!](http://pandas.pydata.org/pandas-docs/stable/io.html))

# In[5]:

weather_df = pd.read_csv('../data/weather_2012.csv')
con = sqlite3.connect("../data/test_db.sqlite")
con.execute("DROP TABLE IF EXISTS weather_2012")
weather_df.to_sql("weather_2012", con)

# We can now read from the `weather_2012` table in `test_db.sqlite`, and we see that we get the same data back:

# In[6]:

con = sqlite3.connect("../data/test_db.sqlite")
df = pd.read_sql("SELECT * from weather_2012 LIMIT 3", con)
df

# The nice thing about having your data in a database is that you can run arbitrary SQL queries. This is especially cool if you're more familiar with SQL than with pandas. Here's an example of sorting by the Weather column:

# In[7]:

con = sqlite3.connect("../data/test_db.sqlite")
df = pd.read_sql("SELECT * from weather_2012 ORDER BY Weather LIMIT 3", con)
df
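# One more thing worth knowing: if you want to plug values into a query, `read_sql` takes a `params` argument, which is much safer than pasting strings together yourself. Here's a minimal sketch -- `'Fog'` is just an illustrative guess at a value in the Weather column, and the `?` placeholder syntax is specific to SQLite (other database drivers have their own placeholder styles).

# In[ ]:

con = sqlite3.connect("../data/test_db.sqlite")
# `?` is SQLite's placeholder; the tuple in `params` fills it in safely
df = pd.read_sql("SELECT * from weather_2012 WHERE Weather = ? LIMIT 3",
                 con, params=('Fog',))
df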
# If you have a PostgreSQL database or MySQL database, reading from it works exactly the same way as reading from a SQLite database. You create a connection using `psycopg2.connect()` or `MySQLdb.connect()`, and then use
#
# `pd.read_sql("SELECT whatever from your_table", con)`

# # 9.3 Connecting to other kinds of database

# To connect to a MySQL database:
#
# *Note: For these to work, you will need a working MySQL / PostgreSQL database, with the correct host, database name, username, password, and so on.*

# In[ ]:

import MySQLdb
con = MySQLdb.connect(host="localhost", db="test")

# To connect to a PostgreSQL database:

# In[ ]:

import psycopg2
con = psycopg2.connect(host="localhost")
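# Once you're connected, everything from earlier in this chapter works the same way. Here's a minimal end-to-end sketch for PostgreSQL -- the host, database name, credentials, and table name below are all placeholders, so substitute your own:

# In[ ]:

import psycopg2

# Placeholder connection details -- replace with your database's own
con = psycopg2.connect(host="localhost", dbname="test",
                       user="postgres", password="secret")
df = pd.read_sql("SELECT * from your_table LIMIT 3", con)
df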