Data Loading, Storage,

In [ ]:
import numpy as np
import pandas as pd
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

Reading and Writing Data in Text Format

In [ ]:
!cat examples/ex1.csv
In [ ]:
df = pd.read_csv('examples/ex1.csv')
df
In [ ]:
pd.read_table('examples/ex1.csv', sep=',')
In [ ]:
!cat examples/ex2.csv
In [ ]:
pd.read_csv('examples/ex2.csv', header=None)
pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])
In [ ]:
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('examples/ex2.csv', names=names, index_col='message')
In [ ]:
!cat examples/csv_mindex.csv
parsed = pd.read_csv('examples/csv_mindex.csv',
                     index_col=['key1', 'key2'])
parsed
In [ ]:
list(open('examples/ex3.txt'))
In [ ]:
result = pd.read_table('examples/ex3.txt', sep='\s+')
result
In [ ]:
!cat examples/ex4.csv
pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])
In [ ]:
!cat examples/ex5.csv
result = pd.read_csv('examples/ex5.csv')
result
pd.isnull(result)
In [ ]:
result = pd.read_csv('examples/ex5.csv', na_values=['NULL'])
result
In [ ]:
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('examples/ex5.csv', na_values=sentinels)

Reading Text Files in Pieces

In [ ]:
pd.options.display.max_rows = 10
In [ ]:
result = pd.read_csv('examples/ex6.csv')
result
In [ ]:
pd.read_csv('examples/ex6.csv', nrows=5)
In [ ]:
chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)
chunker
In [ ]:
chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)

tot = pd.Series([])
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

tot = tot.sort_values(ascending=False)
In [ ]:
tot[:10]

Writing Data to Text Format

In [ ]:
data = pd.read_csv('examples/ex5.csv')
data
In [ ]:
data.to_csv('examples/out.csv')
!cat examples/out.csv
In [ ]:
import sys
data.to_csv(sys.stdout, sep='|')
In [ ]:
data.to_csv(sys.stdout, na_rep='NULL')
In [ ]:
data.to_csv(sys.stdout, index=False, header=False)
In [ ]:
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])
In [ ]:
dates = pd.date_range('1/1/2000', periods=7)
ts = pd.Series(np.arange(7), index=dates)
ts.to_csv('examples/tseries.csv')
!cat examples/tseries.csv

Working with Delimited Formats

In [ ]:
!cat examples/ex7.csv
In [ ]:
import csv
f = open('examples/ex7.csv')

reader = csv.reader(f)
In [ ]:
for line in reader:
    print(line)
In [ ]:
with open('examples/ex7.csv') as f:
    lines = list(csv.reader(f))
In [ ]:
header, values = lines[0], lines[1:]
In [ ]:
data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict

class my_dialect(csv.Dialect): lineterminator = '\n' delimiter = ';' quotechar = '"' quoting = csv.QUOTE_MINIMAL

reader = csv.reader(f, dialect=my_dialect)

reader = csv.reader(f, delimiter='|')

with open('mydata.csv', 'w') as f: writer = csv.writer(f, dialect=my_dialect) writer.writerow(('one', 'two', 'three')) writer.writerow(('1', '2', '3')) writer.writerow(('4', '5', '6')) writer.writerow(('7', '8', '9'))

JSON Data

In [ ]:
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
              {"name": "Katie", "age": 38,
               "pets": ["Sixes", "Stache", "Cisco"]}]
}
"""
In [ ]:
import json
result = json.loads(obj)
result
In [ ]:
asjson = json.dumps(result)
In [ ]:
siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])
siblings
In [ ]:
!cat examples/example.json
In [ ]:
data = pd.read_json('examples/example.json')
data
In [ ]:
print(data.to_json())
print(data.to_json(orient='records'))

XML and HTML: Web Scraping

conda install lxml pip install beautifulsoup4 html5lib

In [ ]:
tables = pd.read_html('examples/fdic_failed_bank_list.html')
len(tables)
failures = tables[0]
failures.head()
In [ ]:
close_timestamps = pd.to_datetime(failures['Closing Date'])
close_timestamps.dt.year.value_counts()

Parsing XML with lxml.objectify

373889 Metro-North Railroad Escalator Availability Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009. 2011 12 Service Indicators M U % 1 97.00 97.00
In [ ]:
from lxml import objectify

path = 'datasets/mta_perf/Performance_MNR.xml'
parsed = objectify.parse(open(path))
root = parsed.getroot()
In [ ]:
data = []

skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
               'DESIRED_CHANGE', 'DECIMAL_PLACES']

for elt in root.INDICATOR:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag] = child.pyval
    data.append(el_data)
In [ ]:
perf = pd.DataFrame(data)
perf.head()
In [ ]:
from io import StringIO
tag = '<a href="http://www.google.com">Google</a>'
root = objectify.parse(StringIO(tag)).getroot()
In [ ]:
root
root.get('href')
root.text

Binary Data Formats

In [ ]:
frame = pd.read_csv('examples/ex1.csv')
frame
frame.to_pickle('examples/frame_pickle')
In [ ]:
pd.read_pickle('examples/frame_pickle')
In [ ]:
!rm examples/frame_pickle

Using HDF5 Format

In [ ]:
frame = pd.DataFrame({'a': np.random.randn(100)})
store = pd.HDFStore('mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
store
In [ ]:
store['obj1']
In [ ]:
store.put('obj2', frame, format='table')
store.select('obj2', where=['index >= 10 and index <= 15'])
store.close()
In [ ]:
frame.to_hdf('mydata.h5', 'obj3', format='table')
pd.read_hdf('mydata.h5', 'obj3', where=['index < 5'])
In [ ]:
os.remove('mydata.h5')

Reading Microsoft Excel Files

In [ ]:
xlsx = pd.ExcelFile('examples/ex1.xlsx')
In [ ]:
pd.read_excel(xlsx, 'Sheet1')
In [ ]:
frame = pd.read_excel('examples/ex1.xlsx', 'Sheet1')
frame
In [ ]:
writer = pd.ExcelWriter('examples/ex2.xlsx')
frame.to_excel(writer, 'Sheet1')
writer.save()
In [ ]:
frame.to_excel('examples/ex2.xlsx')
In [ ]:
!rm examples/ex2.xlsx

Interacting with Web APIs

In [ ]:
import requests
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
resp = requests.get(url)
resp
In [ ]:
data = resp.json()
data[0]['title']
In [ ]:
issues = pd.DataFrame(data, columns=['number', 'title',
                                     'labels', 'state'])
issues

Interacting with Databases

In [ ]:
import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL,        d INTEGER
);"""
con = sqlite3.connect('mydata.sqlite')
con.execute(query)
con.commit()
In [ ]:
data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()
In [ ]:
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows
In [ ]:
cursor.description
pd.DataFrame(rows, columns=[x[0] for x in cursor.description])
In [ ]:
import sqlalchemy as sqla
db = sqla.create_engine('sqlite:///mydata.sqlite')
pd.read_sql('select * from test', db)
In [ ]:
!rm mydata.sqlite

Conclusion