In [1]:
# import
import pandas as pd
import numpy as np

Parsing functions in pandas

read_csv              Load delimited data from a file, URL, or file-like object. Use comma as default delimiter
read_table            Load delimited data from a file, URL, or file-like object. Use tab ('\t') as default delimiter
read_fwf              Read data in fixed-width column format (that is, no delimiters)
read_clipboard      Version of read_table that reads data from the clipboard. Useful for converting tables from web pages
In [2]:
# reading csv file, first check the content of it
!more "dataset/Employees2Head.csv"
LNAME,FNAME,JOB TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY,ESTIMATED ANNUAL SALARY MINUS FURLOUGHS
AARON,ELVIA J,WATER RATE TAKER,WATER MGMNT,$81000.00,$73862.00
AARON,JEFFERY M,POLICE OFFICER,POLICE,$74628.00,$74628.00
AARON,KIMBERLEI R,CHIEF CONTRACT EXPEDITER,FLEET MANAGEMNT,$77280.00,$70174.00
ABAD JR,VICENTE M,CIVIL ENGINEER IV,WATER MGMNT,$96276.00,$96276.00
ABBATACOLA,ROBERT J,ELECTRICAL MECHANIC,WATER MGMNT,$84032.00,$76627.00
ABBATE,TERRY M,POLICE OFFICER,POLICE,$79926.00,$79926.00
ABBATEMARCO,JAMES J,FIREFIGHTER,FIRE,$77238.00,$77238.00
ABBOTT,BETTY L,FOSTER GRANDPARENT,FAMILY & SUPPORT,$2756.00,$2756.00
ABBOTT,LYNISE M,CLERK III,FAMILY & SUPPORT,$38568.00,$38568.00
ABBOTT,SAM J,ELECTRICAL MECHANIC,TRANSPORTN,$84032.00,$76627.00
ABDELHADI,ABDALMAHD,POLICE OFFICER,POLICE,$71040.00,$71040.00
ABDELLATIF,AREF R,FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC,FIRE,$86922.00,$86922.00
In [3]:
# this file is having header and delimiter is comma (,), we can read this with read_csv method
fh = pd.read_csv("dataset/Employees2Head.csv")
len(fh)   # of lines in file excluding header
Out[3]:
12
In [4]:
print(fh)
          LNAME        FNAME                                      JOB TITLE  \
0         AARON      ELVIA J                               WATER RATE TAKER   
1         AARON    JEFFERY M                                 POLICE OFFICER   
2         AARON  KIMBERLEI R                       CHIEF CONTRACT EXPEDITER   
3       ABAD JR    VICENTE M                              CIVIL ENGINEER IV   
4    ABBATACOLA     ROBERT J                            ELECTRICAL MECHANIC   
5        ABBATE      TERRY M                                 POLICE OFFICER   
6   ABBATEMARCO      JAMES J                                    FIREFIGHTER   
7        ABBOTT      BETTY L                             FOSTER GRANDPARENT   
8        ABBOTT     LYNISE M                                      CLERK III   
9        ABBOTT        SAM J                            ELECTRICAL MECHANIC   
10    ABDELHADI    ABDALMAHD                                 POLICE OFFICER   
11   ABDELLATIF       AREF R  FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC   

          DEPARTMENT EMPLOYEE ANNUAL SALARY  \
0        WATER MGMNT              $81000.00   
1             POLICE              $74628.00   
2    FLEET MANAGEMNT              $77280.00   
3        WATER MGMNT              $96276.00   
4        WATER MGMNT              $84032.00   
5             POLICE              $79926.00   
6               FIRE              $77238.00   
7   FAMILY & SUPPORT               $2756.00   
8   FAMILY & SUPPORT              $38568.00   
9         TRANSPORTN              $84032.00   
10            POLICE              $71040.00   
11              FIRE              $86922.00   

   ESTIMATED ANNUAL SALARY MINUS FURLOUGHS  
0                                $73862.00  
1                                $74628.00  
2                                $70174.00  
3                                $96276.00  
4                                $76627.00  
5                                $79926.00  
6                                $77238.00  
7                                 $2756.00  
8                                $38568.00  
9                                $76627.00  
10                               $71040.00  
11                               $86922.00  
In [5]:
# we can read the same file with read_table method with passing an option sep=
fh = pd.read_table("dataset/Employees2Head.csv", sep=",")
len(fh)
Out[5]:
12
In [6]:
# let's check how to read a file without header
!more "dataset/Employees2.csv"
AARON,ELVIA J,WATER RATE TAKER,WATER MGMNT,$81000.00,$73862.00
AARON,JEFFERY M,POLICE OFFICER,POLICE,$74628.00,$74628.00
AARON,KIMBERLEI R,CHIEF CONTRACT EXPEDITER,FLEET MANAGEMNT,$77280.00,$70174.00
ABAD JR,VICENTE M,CIVIL ENGINEER IV,WATER MGMNT,$96276.00,$96276.00
ABBATACOLA,ROBERT J,ELECTRICAL MECHANIC,WATER MGMNT,$84032.00,$76627.00
ABBATE,TERRY M,POLICE OFFICER,POLICE,$79926.00,$79926.00
ABBATEMARCO,JAMES J,FIREFIGHTER,FIRE,$77238.00,$77238.00
ABBOTT,BETTY L,FOSTER GRANDPARENT,FAMILY & SUPPORT,$2756.00,$2756.00
ABBOTT,LYNISE M,CLERK III,FAMILY & SUPPORT,$38568.00,$38568.00
ABBOTT,SAM J,ELECTRICAL MECHANIC,TRANSPORTN,$84032.00,$76627.00
ABDELHADI,ABDALMAHD,POLICE OFFICER,POLICE,$71040.00,$71040.00
ABDELLATIF,AREF R,FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC,FIRE,$86922.00,$86922.00
In [7]:
fh = pd.read_csv("dataset/Employees2.csv", header=None)
len(fh)
Out[7]:
12
In [8]:
print(fh) # check the column names, it've been assigned by pandas itself
              0            1                                              2  \
0         AARON      ELVIA J                               WATER RATE TAKER   
1         AARON    JEFFERY M                                 POLICE OFFICER   
2         AARON  KIMBERLEI R                       CHIEF CONTRACT EXPEDITER   
3       ABAD JR    VICENTE M                              CIVIL ENGINEER IV   
4    ABBATACOLA     ROBERT J                            ELECTRICAL MECHANIC   
5        ABBATE      TERRY M                                 POLICE OFFICER   
6   ABBATEMARCO      JAMES J                                    FIREFIGHTER   
7        ABBOTT      BETTY L                             FOSTER GRANDPARENT   
8        ABBOTT     LYNISE M                                      CLERK III   
9        ABBOTT        SAM J                            ELECTRICAL MECHANIC   
10    ABDELHADI    ABDALMAHD                                 POLICE OFFICER   
11   ABDELLATIF       AREF R  FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC   

                   3          4          5  
0        WATER MGMNT  $81000.00  $73862.00  
1             POLICE  $74628.00  $74628.00  
2    FLEET MANAGEMNT  $77280.00  $70174.00  
3        WATER MGMNT  $96276.00  $96276.00  
4        WATER MGMNT  $84032.00  $76627.00  
5             POLICE  $79926.00  $79926.00  
6               FIRE  $77238.00  $77238.00  
7   FAMILY & SUPPORT   $2756.00   $2756.00  
8   FAMILY & SUPPORT  $38568.00  $38568.00  
9         TRANSPORTN  $84032.00  $76627.00  
10            POLICE  $71040.00  $71040.00  
11              FIRE  $86922.00  $86922.00  
In [9]:
# we can assign file dataframe the column names
fh.columns
Out[9]:
Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')
In [10]:
fh.columns = ['LNAME','FNAME','JOB TITLE','DEPARTMENT','EMPLOYEE ANNUAL SALARY','ESTIMATED ANNUAL SALARY MINUS FURLOUGHS'
]
print(fh)
          LNAME        FNAME                                      JOB TITLE  \
0         AARON      ELVIA J                               WATER RATE TAKER   
1         AARON    JEFFERY M                                 POLICE OFFICER   
2         AARON  KIMBERLEI R                       CHIEF CONTRACT EXPEDITER   
3       ABAD JR    VICENTE M                              CIVIL ENGINEER IV   
4    ABBATACOLA     ROBERT J                            ELECTRICAL MECHANIC   
5        ABBATE      TERRY M                                 POLICE OFFICER   
6   ABBATEMARCO      JAMES J                                    FIREFIGHTER   
7        ABBOTT      BETTY L                             FOSTER GRANDPARENT   
8        ABBOTT     LYNISE M                                      CLERK III   
9        ABBOTT        SAM J                            ELECTRICAL MECHANIC   
10    ABDELHADI    ABDALMAHD                                 POLICE OFFICER   
11   ABDELLATIF       AREF R  FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC   

          DEPARTMENT EMPLOYEE ANNUAL SALARY  \
0        WATER MGMNT              $81000.00   
1             POLICE              $74628.00   
2    FLEET MANAGEMNT              $77280.00   
3        WATER MGMNT              $96276.00   
4        WATER MGMNT              $84032.00   
5             POLICE              $79926.00   
6               FIRE              $77238.00   
7   FAMILY & SUPPORT               $2756.00   
8   FAMILY & SUPPORT              $38568.00   
9         TRANSPORTN              $84032.00   
10            POLICE              $71040.00   
11              FIRE              $86922.00   

   ESTIMATED ANNUAL SALARY MINUS FURLOUGHS  
0                                $73862.00  
1                                $74628.00  
2                                $70174.00  
3                                $96276.00  
4                                $76627.00  
5                                $79926.00  
6                                $77238.00  
7                                 $2756.00  
8                                $38568.00  
9                                $76627.00  
10                               $71040.00  
11                               $86922.00  
In [11]:
# or we can assign the names while reading the files
fh = pd.read_table("dataset/Employees2.csv", header=None, sep=',', 
                   names=['LNAME','FNAME','JOB TITLE','DEPARTMENT','EMPLOYEE ANNUAL SALARY',
                          'ESTIMATED ANNUAL SALARY MINUS FURLOUGHS'])
len(fh)
Out[11]:
12
In [12]:
print(fh)
          LNAME        FNAME                                      JOB TITLE  \
0         AARON      ELVIA J                               WATER RATE TAKER   
1         AARON    JEFFERY M                                 POLICE OFFICER   
2         AARON  KIMBERLEI R                       CHIEF CONTRACT EXPEDITER   
3       ABAD JR    VICENTE M                              CIVIL ENGINEER IV   
4    ABBATACOLA     ROBERT J                            ELECTRICAL MECHANIC   
5        ABBATE      TERRY M                                 POLICE OFFICER   
6   ABBATEMARCO      JAMES J                                    FIREFIGHTER   
7        ABBOTT      BETTY L                             FOSTER GRANDPARENT   
8        ABBOTT     LYNISE M                                      CLERK III   
9        ABBOTT        SAM J                            ELECTRICAL MECHANIC   
10    ABDELHADI    ABDALMAHD                                 POLICE OFFICER   
11   ABDELLATIF       AREF R  FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC   

          DEPARTMENT EMPLOYEE ANNUAL SALARY  \
0        WATER MGMNT              $81000.00   
1             POLICE              $74628.00   
2    FLEET MANAGEMNT              $77280.00   
3        WATER MGMNT              $96276.00   
4        WATER MGMNT              $84032.00   
5             POLICE              $79926.00   
6               FIRE              $77238.00   
7   FAMILY & SUPPORT               $2756.00   
8   FAMILY & SUPPORT              $38568.00   
9         TRANSPORTN              $84032.00   
10            POLICE              $71040.00   
11              FIRE              $86922.00   

   ESTIMATED ANNUAL SALARY MINUS FURLOUGHS  
0                                $73862.00  
1                                $74628.00  
2                                $70174.00  
3                                $96276.00  
4                                $76627.00  
5                                $79926.00  
6                                $77238.00  
7                                 $2756.00  
8                                $38568.00  
9                                $76627.00  
10                               $71040.00  
11                               $86922.00  
In [13]:
# In some cases, a table might not have a fixed delimiter, using whitespace or some other
# pattern to separate fields. In these cases, you can pass a regular expression as a delimiter
# for read_table.
# pd.read_table('ch06/ex3.txt', sep='\s+')
In [14]:
# Skip the rows to read
fh = pd.read_table("dataset/Employees2Head.csv", sep=",", skiprows=[0,1,2])
len(fh)  # total was 12 rows but skipped 3 rows
Out[14]:
9

Handling missing data while reading files

In [15]:
!more "dataset/Emp2HeadMiss.csv"
LNAME,FNAME,JOB TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY,ESTIMATED ANNUAL SALARY MINUS FURLOUGHS
AARON,ELVIA J,,WATER MGMNT,NULL,$73862.00
,JEFFERY M,POLICE OFFICER,POLICE,$74628.00,$74628.00
AARON,,CHIEF CONTRACT EXPEDITER,NaN,$77280.00,$70174.00
ABAD JR,VICENTE M,CIVIL ENGINEER IV,WATER MGMNT,$96276.00,NaN
|,ROBERT J,,NULL,$84032.00,$76627.00
ABBATE,TERRY M,POLICE OFFICER,POLICE,$79926.00,NaN
NULL,JAMES J,NA,FIRE,NULL,$77238.00
NaN,,FOSTER GRANDPARENT,FAMILY & SUPPORT,NA,$2756.00
NA,LYNISE M,CLERK III,FAMILY & SUPPORT,$38568.00,$38568.00
|,SAM J,ELECTRICAL MECHANIC,TRANSPORTN,$84032.00,$76627.00
ABDELHADI,ABDALMAHD,POLICE OFFICER,POLICE,$71040.00,$71040.00
,AREF R,NA,FIRE,,$86922.00
In [16]:
# reading the file
fh = pd.read_csv("dataset/Emp2HeadMiss.csv")
print(fh)
        LNAME      FNAME                 JOB TITLE        DEPARTMENT  \
0       AARON    ELVIA J                       NaN       WATER MGMNT   
1         NaN  JEFFERY M            POLICE OFFICER            POLICE   
2       AARON        NaN  CHIEF CONTRACT EXPEDITER               NaN   
3     ABAD JR  VICENTE M         CIVIL ENGINEER IV       WATER MGMNT   
4           |   ROBERT J                       NaN               NaN   
5      ABBATE    TERRY M            POLICE OFFICER            POLICE   
6         NaN    JAMES J                       NaN              FIRE   
7         NaN        NaN        FOSTER GRANDPARENT  FAMILY & SUPPORT   
8         NaN   LYNISE M                 CLERK III  FAMILY & SUPPORT   
9           |      SAM J       ELECTRICAL MECHANIC        TRANSPORTN   
10  ABDELHADI  ABDALMAHD            POLICE OFFICER            POLICE   
11        NaN     AREF R                       NaN              FIRE   

   EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS  
0                     NaN                               $73862.00  
1               $74628.00                               $74628.00  
2               $77280.00                               $70174.00  
3               $96276.00                                     NaN  
4               $84032.00                               $76627.00  
5               $79926.00                                     NaN  
6                     NaN                               $77238.00  
7                     NaN                                $2756.00  
8               $38568.00                               $38568.00  
9               $84032.00                               $76627.00  
10              $71040.00                               $71040.00  
11                    NaN                               $86922.00  
In [17]:
pd.isnull(fh)  # TRUE denotes the NULL value
Out[17]:
LNAME FNAME JOB TITLE DEPARTMENT EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS
0 False False True False True False
1 True False False False False False
2 False True False True False False
3 False False False False False True
4 False False True True False False
5 False False False False False True
6 True False True False True False
7 True True False False True False
8 True False False False False False
9 False False False False False False
10 False False False False False False
11 True False True False True False
In [18]:
# The na_values option can take either a list or set of strings to consider missing values
fh = pd.read_csv("dataset/Emp2HeadMiss.csv", na_values=['NULL'])
In [19]:
# we can define NA values for each columns as well
naValues = {'LNAME':['NA', 'NaN', 'NULL', '|'], 'FNAME':['NULL']}
fh = pd.read_csv("dataset/Emp2HeadMiss.csv", na_values=naValues)
print(fh)
        LNAME      FNAME                 JOB TITLE        DEPARTMENT  \
0       AARON    ELVIA J                       NaN       WATER MGMNT   
1         NaN  JEFFERY M            POLICE OFFICER            POLICE   
2       AARON        NaN  CHIEF CONTRACT EXPEDITER               NaN   
3     ABAD JR  VICENTE M         CIVIL ENGINEER IV       WATER MGMNT   
4         NaN   ROBERT J                       NaN               NaN   
5      ABBATE    TERRY M            POLICE OFFICER            POLICE   
6         NaN    JAMES J                       NaN              FIRE   
7         NaN        NaN        FOSTER GRANDPARENT  FAMILY & SUPPORT   
8         NaN   LYNISE M                 CLERK III  FAMILY & SUPPORT   
9         NaN      SAM J       ELECTRICAL MECHANIC        TRANSPORTN   
10  ABDELHADI  ABDALMAHD            POLICE OFFICER            POLICE   
11        NaN     AREF R                       NaN              FIRE   

   EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS  
0                     NaN                               $73862.00  
1               $74628.00                               $74628.00  
2               $77280.00                               $70174.00  
3               $96276.00                                     NaN  
4               $84032.00                               $76627.00  
5               $79926.00                                     NaN  
6                     NaN                               $77238.00  
7                     NaN                                $2756.00  
8               $38568.00                               $38568.00  
9               $84032.00                               $76627.00  
10              $71040.00                               $71040.00  
11                    NaN                               $86922.00  

read_csv /read_table function arguments

path                        String indicating filesystem location, URL, or file-like object
sep or delimiter            Character sequence or regular expression to use to split fields in each row
header                      Row number to use as column names. Defaults to 0 (first row), but should be None if there is no header row
index_col                   Column numbers or names to use as the row index in the result. Can be a single name/number or a list of them for a hierarchical index
names                       List of column names for result, combine with header=None
skiprows                    Number of rows at beginning of file to ignore or list of row numbers (starting from 0) to skip
na_values                   Sequence of values to replace with NA
comment                     Character or characters to split comments off the end of lines
parse_dates                 Attempt to parse data to datetime; False by default. If True, will attempt to parse all columns. Otherwise can specify a list of column numbers or name to parse. If element of list is tuple or list, will combine multiple columns together and parse to date (for example if date/time split across two columns)
keep_date_col               If joining columns to parse date, drop the joined columns. Default True
converters                  Dict containing column number of name mapping to functions. For example {'foo': f} would apply the function f to all values in the 'foo' column
dayfirst                    When parsing potentially ambiguous dates, treat as international format (e.g. 7/6/2012 -> June 7, 2012). Default False
date_parser                 Function to use to parse dates
nrows                       Number of rows to read from beginning of file
iterator                    Return a TextParser object for reading file piecemeal
chunksize                   For iteration, size of file chunks
skip_footer                 Number of lines to ignore at end of file
verbose                     Print various parser output information, like the number of missing values placed in non-numeric columns
encoding                    Text encoding for unicode. For example 'utf-8' for UTF-8 encoded text
squeeze                     If the parsed data only contains one column return a Series
thousands                   Separator for thousands, e.g. ',' or '.'

Reading Text Files in Pieces

In [20]:
fh = pd.read_csv("dataset/Employees.csv")
len(fh)
Out[20]:
34218
In [21]:
fh.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34218 entries, 0 to 34217
Data columns (total 6 columns):
LNAME                                      34218 non-null object
FNAME                                      34218 non-null object
JOB TITLE                                  34218 non-null object
DEPARTMENT                                 34218 non-null object
EMPLOYEE ANNUAL SALARY                     34218 non-null object
ESTIMATED ANNUAL SALARY MINUS FURLOUGHS    34218 non-null object
dtypes: object(6)
memory usage: 1.6+ MB
In [22]:
fh.describe()
Out[22]:
LNAME FNAME JOB TITLE DEPARTMENT EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS
count 34218 34218 34218 34218 34218 34218
unique 14431 12188 1124 39 1028 1088
top WILLIAMS MICHAEL J POLICE OFFICER POLICE $77238.00 $77238.00
freq 277 309 10918 14147 3028 3028
In [23]:
# if want to read few lines from file
fh = pd.read_csv("dataset/Employees.csv", nrows=5)
print(fh)
        LNAME        FNAME                 JOB TITLE       DEPARTMENT  \
0       AARON      ELVIA J          WATER RATE TAKER      WATER MGMNT   
1       AARON    JEFFERY M            POLICE OFFICER           POLICE   
2       AARON  KIMBERLEI R  CHIEF CONTRACT EXPEDITER  FLEET MANAGEMNT   
3     ABAD JR    VICENTE M         CIVIL ENGINEER IV      WATER MGMNT   
4  ABBATACOLA     ROBERT J       ELECTRICAL MECHANIC      WATER MGMNT   

  EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS  
0              $81000.00                               $73862.00  
1              $74628.00                               $74628.00  
2              $77280.00                               $70174.00  
3              $96276.00                               $96276.00  
4              $84032.00                               $76627.00  
In [24]:
# To read out a file in pieces, specify a chunksize as a number of rows
chunker = pd.read_csv("dataset/Employees.csv", chunksize=1000)
chunker
Out[24]:
<pandas.io.parsers.TextFileReader at 0x178479579e8>
In [25]:
line = 0
for data in chunker:
    line = line + 1
print(line)
35

Writing Data Out to Text Format

In [26]:
fh = pd.read_csv("dataset/Employees.csv")
len(fh)
Out[26]:
34218
In [27]:
# save the fh as csv
fh.to_csv("dataset/Emp.csv")
In [28]:
# save the fh as excel
fh.to_excel("dataset/Emp.xls")
In [29]:
# Other delimiters can be used
# fh.to_csv("dataset/Emp.csv", sep="|")
In [30]:
# Missing values appear as empty strings in the output. You might want to denote them
# by some other sentinel value:
# sentinel = ['NULL']
# data.to_csv(sys.stdout, na_rep=sentinel)
In [31]:
# With no other options specified, both the row and column labels are written. Both of
# these can be disabled:
fh.to_csv("dataset/Emp.csv", header = False, index = False)
In [32]:
# can write column subset as well
cols = ['LNAME', 'FNAME']
fh.to_csv("dataset/Emp.csv", index = False, columns=cols)

Save Series

In [33]:
dates = pd.date_range('1/1/2000', periods=7)
ts = pd.Series(np.arange(7), index=dates)
ts.to_csv("dataset/times.psv", sep="|")
In [34]:
!more "dataset/times.psv"
2000-01-01|0
2000-01-02|1
2000-01-03|2
2000-01-04|3
2000-01-05|4
2000-01-06|5
2000-01-07|6
In [35]:
# With a bit of wrangling (no header, first column as index), you can read a CSV version
# of a Series with read_csv, but there is also a from_csv convenience method that makes
# it a bit simpler:
pd.Series.from_csv('dataset/times.psv', parse_dates=True, sep="|")
Out[35]:
2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
dtype: int64

Manually Working with Delimited Formats

In [36]:
# import csv
import csv
fh = open("dataset/Emp.csv")
reader = csv.reader(fh)

count=0
for line in reader:
    count = count+1
print(count)
34219
In [37]:
lines = list(csv.reader(open("dataset/Emp.csv")))
header, values = lines[0], lines[1:]
header
Out[37]:
['LNAME', 'FNAME']
In [38]:
data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict.keys()
Out[38]:
dict_keys(['LNAME', 'FNAME'])
In [39]:
# Defining a new format with a different delimiter, string quoting convention, 
# or line terminator is done by defining a simple subclass of csv.Dialect
f = open("dataset/times.psv")

class my_dialect(csv.Dialect):
    lineterminator = '\n'
    delimiter = '|'
    quoting = csv.QUOTE_NONE
    
reader = csv.reader(f, dialect=my_dialect)
print(reader)
<_csv.reader object at 0x0000017847958D48>
In [40]:
# or we can define as
reader = csv.reader(open("dataset/times.psv"), delimiter = '|')
for line in reader:
    print(line)
['2000-01-01', '0']
['2000-01-02', '1']
['2000-01-03', '2']
['2000-01-04', '3']
['2000-01-05', '4']
['2000-01-06', '5']
['2000-01-07', '6']

CSV dialect options

delimiter                   One-character string to separate fields. Defaults to ','.
lineterminator              Line terminator for writing, defaults to '\r\n'. Reader ignores this and recognizes cross-platform line terminators.
quotechar                   Quote character for fields with special characters (like a delimiter). Default is '"'.
quoting                     Quoting convention. Options include csv.QUOTE_ALL (quote all fields),
                            csv.QUOTE_MINIMAL (only fields with special characters like the delimiter),
                            csv.QUOTE_NONNUMERIC, and csv.QUOTE_NON (no quoting). See Python’s documentation for full details. Defaults to QUOTE_MINIMAL.
skipinitialspace            Ignore whitespace after each delimiter. Default False.
doublequote                 How to handle quoting character inside a field. If True, it is doubled. 
escapechar                  String to escape the delimiter if quoting is set to csv.QUOTE_NONE. Disabled by default
In [41]:
# To write delimited files manually, use csv.writer.
with open('dataset/mydata.csv', 'w') as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerow(('one', 'two', 'three'))
    writer.writerow(('1', '2', '3'))
    writer.writerow(('4', '5', '6'))
    writer.writerow(('7', '8', '9'))

JSON Data

In [42]:
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
              {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""
obj
Out[42]:
'\n{"name": "Wes",\n "places_lived": ["United States", "Spain", "Germany"],\n "pet": null,\n "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},\n              {"name": "Katie", "age": 33, "pet": "Cisco"}]\n}\n'
In [43]:
import json
data = json.loads(obj)
data
Out[43]:
{'name': 'Wes',
 'pet': None,
 'places_lived': ['United States', 'Spain', 'Germany'],
 'siblings': [{'age': 25, 'name': 'Scott', 'pet': 'Zuko'},
  {'age': 33, 'name': 'Katie', 'pet': 'Cisco'}]}
In [44]:
asjson = json.dumps(data)
asjson
Out[44]:
'{"name": "Wes", "siblings": [{"name": "Scott", "pet": "Zuko", "age": 25}, {"name": "Katie", "pet": "Cisco", "age": 33}], "places_lived": ["United States", "Spain", "Germany"], "pet": null}'
In [45]:
siblings = pd.DataFrame(data['siblings'], columns=['name', 'age'])
siblings
Out[45]:
name age
0 Scott 25
1 Katie 33

XML and HTML: Web Scraping

In [46]:
# lxml pkg
from lxml.html import parse
# from urllib2 import urlopen  # for python 2
from urllib.request import urlopen
In [47]:
# this will download some data from net
#parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
parsed = parse(urlopen('http://finance.yahoo.com'))
#parsed = parse(urlopen('http://www.omdbapi.com/'))
doc = parsed.getroot()
In [48]:
print(doc)
<Element html at 0x17847b09958>
In [49]:
# to fetch all hyperlinks <a
links = doc.findall('.//a')
len(links)
Out[49]:
49
In [50]:
lnk = links[18]
lnk.get('href')
Out[50]:
'https://www.yahoo.com/tech'
In [51]:
lnk.text_content()
Out[51]:
'Technology'
In [52]:
# to get all the urls in one liner
urls = [lnk.get('href') for lnk in doc.findall('.//a')]
print(urls[:10])
['https://www.yahoo.com/', 'https://mail.yahoo.com/?.intl=us&.lang=en-US', 'https://www.flickr.com/', 'https://www.tumblr.com/', 'https://www.yahoo.com/news/', 'http://sports.yahoo.com/', 'http://finance.yahoo.com/', 'https://www.yahoo.com/celebrity/', 'https://answers.yahoo.com/', 'https://groups.yahoo.com/']
In [53]:
## Finding the tables in html 
tables = doc.findall('.//table')
print(len(tables))

table = tables[0]
print(table)
1
<Element table at 0x17847b21098>
In [54]:
# table will have only 1 table
# get td tag
rows = table.findall('.//tr')
print(rows)
[<Element tr at 0x17847a70a98>]
In [55]:
# For the header as well as the data rows, we want to extract the text from each cell; in
# the case of the header these are th cells and td cells for the data:
def _unpack(row, kind='td'):
    elts = row.findall('.//%s' % kind)
    return [val.text_content() for val in elts]
In [56]:
# printing table header th tag
_unpack(rows[0], kind='th')
Out[56]:
[]
In [57]:
# printing table data td tag
_unpack(rows[0], kind='td')
Out[57]:
['', 'Search']
In [58]:
# combining all the code into one func
from pandas.io.parsers import TextParser
def parse_options_data(table):
    rows = table.findall('.//tr')
    header = _unpack(rows[0], kind='th')
    data = [_unpack(r) for r in rows[1:]]
    return TextParser(data, names=header).get_chunk()
In [59]:
data = parse_options_data(table)
print(data)
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in _next_line(self)
   1986                 try:
-> 1987                     line = self._check_comments([self.data[self.pos]])[0]
   1988                     self.pos += 1

IndexError: list index out of range

During handling of the above exception, another exception occurred:

StopIteration                             Traceback (most recent call last)
C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in _infer_columns(self)
   1906             try:
-> 1907                 line = self._buffered_line()
   1908 

C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in _buffered_line(self)
   1974         else:
-> 1975             return self._next_line()
   1976 

C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in _next_line(self)
   1998                 except IndexError:
-> 1999                     raise StopIteration
   2000         else:

StopIteration: 

During handling of the above exception, another exception occurred:

EmptyDataError                            Traceback (most recent call last)
<ipython-input-59-88fac389ba91> in <module>()
----> 1 data = parse_options_data(table)
      2 print(data)

<ipython-input-58-92759a96dbe8> in parse_options_data(table)
      5     header = _unpack(rows[0], kind='th')
      6     data = [_unpack(r) for r in rows[1:]]
----> 7     return TextParser(data, names=header).get_chunk()

C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in TextParser(*args, **kwds)
   1470     """
   1471     kwds['engine'] = 'python'
-> 1472     return TextFileReader(*args, **kwds)
   1473 
   1474 

C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds)
    643             self.options['has_index_names'] = kwds['has_index_names']
    644 
--> 645         self._make_engine(self.engine)
    646 
    647     def close(self):

C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine)
    803             elif engine == 'python-fwf':
    804                 klass = FixedWidthFieldParser
--> 805             self._engine = klass(self.f, **self.options)
    806 
    807     def _failover_to_python(self):

C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, f, **kwds)
   1606         # infer column indices from self.usecols if is is specified.
   1607         self._col_indices = None
-> 1608         self.columns, self.num_original_columns = self._infer_columns()
   1609 
   1610         # Now self.columns has the set of columns that we will process.

C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in _infer_columns(self)
   1910                 if not names:
   1911                     raise EmptyDataError(
-> 1912                         "No columns to parse from file")
   1913 
   1914                 line = names[:]

EmptyDataError: No columns to parse from file

Parsing XML with lxml.objectify

In [60]:
from lxml import objectify
parsed = objectify.parse(open("dataset/Performance_MNR.xml"))
root = parsed.getroot()
In [61]:
print(root)

In [62]:
data = []
skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
               'DESIRED_CHANGE', 'DECIMAL_PLACES']
for elt in root.INDICATOR:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag] = child.pyval
    data.append(el_data)
In [63]:
perf = pd.DataFrame(data)
In [64]:
perf.head()
Out[64]:
AGENCY_NAME CATEGORY DESCRIPTION FREQUENCY INDICATOR_NAME INDICATOR_UNIT MONTHLY_ACTUAL MONTHLY_TARGET PERIOD_MONTH PERIOD_YEAR YTD_ACTUAL YTD_TARGET
0 Metro-North Railroad Service Indicators Percent of commuter trains that arrive at thei... M On-Time Performance (West of Hudson) % 96.9 95 1 2008 96.9 95
1 Metro-North Railroad Service Indicators Percent of commuter trains that arrive at thei... M On-Time Performance (West of Hudson) % 95 95 2 2008 96 95
2 Metro-North Railroad Service Indicators Percent of commuter trains that arrive at thei... M On-Time Performance (West of Hudson) % 96.9 95 3 2008 96.3 95
3 Metro-North Railroad Service Indicators Percent of commuter trains that arrive at thei... M On-Time Performance (West of Hudson) % 98.3 95 4 2008 96.8 95
4 Metro-North Railroad Service Indicators Percent of commuter trains that arrive at thei... M On-Time Performance (West of Hudson) % 95.8 95 5 2008 96.6 95