# import
import pandas as pd
import numpy as np
read_csv Load delimited data from a file, URL, or file-like object. Use comma as default delimiter
read_table Load delimited data from a file, URL, or file-like object. Use tab ('\t') as default delimiter
read_fwf Read data in fixed-width column format (that is, no delimiters)
read_clipboard Version of read_table that reads data from the clipboard. Useful for converting tables from web pages
# reading csv file, first check the content of it
!more "dataset/Employees2Head.csv"
LNAME,FNAME,JOB TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY,ESTIMATED ANNUAL SALARY MINUS FURLOUGHS AARON,ELVIA J,WATER RATE TAKER,WATER MGMNT,$81000.00,$73862.00 AARON,JEFFERY M,POLICE OFFICER,POLICE,$74628.00,$74628.00 AARON,KIMBERLEI R,CHIEF CONTRACT EXPEDITER,FLEET MANAGEMNT,$77280.00,$70174.00 ABAD JR,VICENTE M,CIVIL ENGINEER IV,WATER MGMNT,$96276.00,$96276.00 ABBATACOLA,ROBERT J,ELECTRICAL MECHANIC,WATER MGMNT,$84032.00,$76627.00 ABBATE,TERRY M,POLICE OFFICER,POLICE,$79926.00,$79926.00 ABBATEMARCO,JAMES J,FIREFIGHTER,FIRE,$77238.00,$77238.00 ABBOTT,BETTY L,FOSTER GRANDPARENT,FAMILY & SUPPORT,$2756.00,$2756.00 ABBOTT,LYNISE M,CLERK III,FAMILY & SUPPORT,$38568.00,$38568.00 ABBOTT,SAM J,ELECTRICAL MECHANIC,TRANSPORTN,$84032.00,$76627.00 ABDELHADI,ABDALMAHD,POLICE OFFICER,POLICE,$71040.00,$71040.00 ABDELLATIF,AREF R,FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC,FIRE,$86922.00,$86922.00
# this file is having header and delimiter is comma (,), we can read this with read_csv method
fh = pd.read_csv("dataset/Employees2Head.csv")
len(fh)  # number of data rows in the file (the header line is not counted)
12
print(fh)
LNAME FNAME JOB TITLE \ 0 AARON ELVIA J WATER RATE TAKER 1 AARON JEFFERY M POLICE OFFICER 2 AARON KIMBERLEI R CHIEF CONTRACT EXPEDITER 3 ABAD JR VICENTE M CIVIL ENGINEER IV 4 ABBATACOLA ROBERT J ELECTRICAL MECHANIC 5 ABBATE TERRY M POLICE OFFICER 6 ABBATEMARCO JAMES J FIREFIGHTER 7 ABBOTT BETTY L FOSTER GRANDPARENT 8 ABBOTT LYNISE M CLERK III 9 ABBOTT SAM J ELECTRICAL MECHANIC 10 ABDELHADI ABDALMAHD POLICE OFFICER 11 ABDELLATIF AREF R FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC DEPARTMENT EMPLOYEE ANNUAL SALARY \ 0 WATER MGMNT $81000.00 1 POLICE $74628.00 2 FLEET MANAGEMNT $77280.00 3 WATER MGMNT $96276.00 4 WATER MGMNT $84032.00 5 POLICE $79926.00 6 FIRE $77238.00 7 FAMILY & SUPPORT $2756.00 8 FAMILY & SUPPORT $38568.00 9 TRANSPORTN $84032.00 10 POLICE $71040.00 11 FIRE $86922.00 ESTIMATED ANNUAL SALARY MINUS FURLOUGHS 0 $73862.00 1 $74628.00 2 $70174.00 3 $96276.00 4 $76627.00 5 $79926.00 6 $77238.00 7 $2756.00 8 $38568.00 9 $76627.00 10 $71040.00 11 $86922.00
# we can read the same file with read_table method with passing an option sep=
fh = pd.read_table("dataset/Employees2Head.csv", sep=",")
len(fh)
12
# let's check how to read a file without header
!more "dataset/Employees2.csv"
AARON,ELVIA J,WATER RATE TAKER,WATER MGMNT,$81000.00,$73862.00 AARON,JEFFERY M,POLICE OFFICER,POLICE,$74628.00,$74628.00 AARON,KIMBERLEI R,CHIEF CONTRACT EXPEDITER,FLEET MANAGEMNT,$77280.00,$70174.00 ABAD JR,VICENTE M,CIVIL ENGINEER IV,WATER MGMNT,$96276.00,$96276.00 ABBATACOLA,ROBERT J,ELECTRICAL MECHANIC,WATER MGMNT,$84032.00,$76627.00 ABBATE,TERRY M,POLICE OFFICER,POLICE,$79926.00,$79926.00 ABBATEMARCO,JAMES J,FIREFIGHTER,FIRE,$77238.00,$77238.00 ABBOTT,BETTY L,FOSTER GRANDPARENT,FAMILY & SUPPORT,$2756.00,$2756.00 ABBOTT,LYNISE M,CLERK III,FAMILY & SUPPORT,$38568.00,$38568.00 ABBOTT,SAM J,ELECTRICAL MECHANIC,TRANSPORTN,$84032.00,$76627.00 ABDELHADI,ABDALMAHD,POLICE OFFICER,POLICE,$71040.00,$71040.00 ABDELLATIF,AREF R,FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC,FIRE,$86922.00,$86922.00
fh = pd.read_csv("dataset/Employees2.csv", header=None)
len(fh)
12
print(fh) # check the column names, it've been assigned by pandas itself
0 1 2 \ 0 AARON ELVIA J WATER RATE TAKER 1 AARON JEFFERY M POLICE OFFICER 2 AARON KIMBERLEI R CHIEF CONTRACT EXPEDITER 3 ABAD JR VICENTE M CIVIL ENGINEER IV 4 ABBATACOLA ROBERT J ELECTRICAL MECHANIC 5 ABBATE TERRY M POLICE OFFICER 6 ABBATEMARCO JAMES J FIREFIGHTER 7 ABBOTT BETTY L FOSTER GRANDPARENT 8 ABBOTT LYNISE M CLERK III 9 ABBOTT SAM J ELECTRICAL MECHANIC 10 ABDELHADI ABDALMAHD POLICE OFFICER 11 ABDELLATIF AREF R FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC 3 4 5 0 WATER MGMNT $81000.00 $73862.00 1 POLICE $74628.00 $74628.00 2 FLEET MANAGEMNT $77280.00 $70174.00 3 WATER MGMNT $96276.00 $96276.00 4 WATER MGMNT $84032.00 $76627.00 5 POLICE $79926.00 $79926.00 6 FIRE $77238.00 $77238.00 7 FAMILY & SUPPORT $2756.00 $2756.00 8 FAMILY & SUPPORT $38568.00 $38568.00 9 TRANSPORTN $84032.00 $76627.00 10 POLICE $71040.00 $71040.00 11 FIRE $86922.00 $86922.00
# we can assign file dataframe the column names
fh.columns
Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')
fh.columns = ['LNAME','FNAME','JOB TITLE','DEPARTMENT','EMPLOYEE ANNUAL SALARY','ESTIMATED ANNUAL SALARY MINUS FURLOUGHS'
]
print(fh)
LNAME FNAME JOB TITLE \ 0 AARON ELVIA J WATER RATE TAKER 1 AARON JEFFERY M POLICE OFFICER 2 AARON KIMBERLEI R CHIEF CONTRACT EXPEDITER 3 ABAD JR VICENTE M CIVIL ENGINEER IV 4 ABBATACOLA ROBERT J ELECTRICAL MECHANIC 5 ABBATE TERRY M POLICE OFFICER 6 ABBATEMARCO JAMES J FIREFIGHTER 7 ABBOTT BETTY L FOSTER GRANDPARENT 8 ABBOTT LYNISE M CLERK III 9 ABBOTT SAM J ELECTRICAL MECHANIC 10 ABDELHADI ABDALMAHD POLICE OFFICER 11 ABDELLATIF AREF R FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC DEPARTMENT EMPLOYEE ANNUAL SALARY \ 0 WATER MGMNT $81000.00 1 POLICE $74628.00 2 FLEET MANAGEMNT $77280.00 3 WATER MGMNT $96276.00 4 WATER MGMNT $84032.00 5 POLICE $79926.00 6 FIRE $77238.00 7 FAMILY & SUPPORT $2756.00 8 FAMILY & SUPPORT $38568.00 9 TRANSPORTN $84032.00 10 POLICE $71040.00 11 FIRE $86922.00 ESTIMATED ANNUAL SALARY MINUS FURLOUGHS 0 $73862.00 1 $74628.00 2 $70174.00 3 $96276.00 4 $76627.00 5 $79926.00 6 $77238.00 7 $2756.00 8 $38568.00 9 $76627.00 10 $71040.00 11 $86922.00
# or we can assign the names while reading the files
fh = pd.read_table("dataset/Employees2.csv", header=None, sep=',',
names=['LNAME','FNAME','JOB TITLE','DEPARTMENT','EMPLOYEE ANNUAL SALARY',
'ESTIMATED ANNUAL SALARY MINUS FURLOUGHS'])
len(fh)
12
print(fh)
LNAME FNAME JOB TITLE \ 0 AARON ELVIA J WATER RATE TAKER 1 AARON JEFFERY M POLICE OFFICER 2 AARON KIMBERLEI R CHIEF CONTRACT EXPEDITER 3 ABAD JR VICENTE M CIVIL ENGINEER IV 4 ABBATACOLA ROBERT J ELECTRICAL MECHANIC 5 ABBATE TERRY M POLICE OFFICER 6 ABBATEMARCO JAMES J FIREFIGHTER 7 ABBOTT BETTY L FOSTER GRANDPARENT 8 ABBOTT LYNISE M CLERK III 9 ABBOTT SAM J ELECTRICAL MECHANIC 10 ABDELHADI ABDALMAHD POLICE OFFICER 11 ABDELLATIF AREF R FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC DEPARTMENT EMPLOYEE ANNUAL SALARY \ 0 WATER MGMNT $81000.00 1 POLICE $74628.00 2 FLEET MANAGEMNT $77280.00 3 WATER MGMNT $96276.00 4 WATER MGMNT $84032.00 5 POLICE $79926.00 6 FIRE $77238.00 7 FAMILY & SUPPORT $2756.00 8 FAMILY & SUPPORT $38568.00 9 TRANSPORTN $84032.00 10 POLICE $71040.00 11 FIRE $86922.00 ESTIMATED ANNUAL SALARY MINUS FURLOUGHS 0 $73862.00 1 $74628.00 2 $70174.00 3 $96276.00 4 $76627.00 5 $79926.00 6 $77238.00 7 $2756.00 8 $38568.00 9 $76627.00 10 $71040.00 11 $86922.00
# In some cases, a table might not have a fixed delimiter, using whitespace or some other
# pattern to separate fields. In these cases, you can pass a regular expression as a delimiter
# for read_table.
# pd.read_table('ch06/ex3.txt', sep='\s+')
# Skip the rows to read
fh = pd.read_table("dataset/Employees2Head.csv", sep=",", skiprows=[0,1,2])
len(fh) # total was 12 rows but skipped 3 rows
9
!more "dataset/Emp2HeadMiss.csv"
LNAME,FNAME,JOB TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY,ESTIMATED ANNUAL SALARY MINUS FURLOUGHS AARON,ELVIA J,,WATER MGMNT,NULL,$73862.00 ,JEFFERY M,POLICE OFFICER,POLICE,$74628.00,$74628.00 AARON,,CHIEF CONTRACT EXPEDITER,NaN,$77280.00,$70174.00 ABAD JR,VICENTE M,CIVIL ENGINEER IV,WATER MGMNT,$96276.00,NaN |,ROBERT J,,NULL,$84032.00,$76627.00 ABBATE,TERRY M,POLICE OFFICER,POLICE,$79926.00,NaN NULL,JAMES J,NA,FIRE,NULL,$77238.00 NaN,,FOSTER GRANDPARENT,FAMILY & SUPPORT,NA,$2756.00 NA,LYNISE M,CLERK III,FAMILY & SUPPORT,$38568.00,$38568.00 |,SAM J,ELECTRICAL MECHANIC,TRANSPORTN,$84032.00,$76627.00 ABDELHADI,ABDALMAHD,POLICE OFFICER,POLICE,$71040.00,$71040.00 ,AREF R,NA,FIRE,,$86922.00
# reading the file
fh = pd.read_csv("dataset/Emp2HeadMiss.csv")
print(fh)
LNAME FNAME JOB TITLE DEPARTMENT \ 0 AARON ELVIA J NaN WATER MGMNT 1 NaN JEFFERY M POLICE OFFICER POLICE 2 AARON NaN CHIEF CONTRACT EXPEDITER NaN 3 ABAD JR VICENTE M CIVIL ENGINEER IV WATER MGMNT 4 | ROBERT J NaN NaN 5 ABBATE TERRY M POLICE OFFICER POLICE 6 NaN JAMES J NaN FIRE 7 NaN NaN FOSTER GRANDPARENT FAMILY & SUPPORT 8 NaN LYNISE M CLERK III FAMILY & SUPPORT 9 | SAM J ELECTRICAL MECHANIC TRANSPORTN 10 ABDELHADI ABDALMAHD POLICE OFFICER POLICE 11 NaN AREF R NaN FIRE EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS 0 NaN $73862.00 1 $74628.00 $74628.00 2 $77280.00 $70174.00 3 $96276.00 NaN 4 $84032.00 $76627.00 5 $79926.00 NaN 6 NaN $77238.00 7 NaN $2756.00 8 $38568.00 $38568.00 9 $84032.00 $76627.00 10 $71040.00 $71040.00 11 NaN $86922.00
pd.isnull(fh) # TRUE denotes the NULL value
LNAME | FNAME | JOB TITLE | DEPARTMENT | EMPLOYEE ANNUAL SALARY | ESTIMATED ANNUAL SALARY MINUS FURLOUGHS | |
---|---|---|---|---|---|---|
0 | False | False | True | False | True | False |
1 | True | False | False | False | False | False |
2 | False | True | False | True | False | False |
3 | False | False | False | False | False | True |
4 | False | False | True | True | False | False |
5 | False | False | False | False | False | True |
6 | True | False | True | False | True | False |
7 | True | True | False | False | True | False |
8 | True | False | False | False | False | False |
9 | False | False | False | False | False | False |
10 | False | False | False | False | False | False |
11 | True | False | True | False | True | False |
# The na_values option can take either a list or set of strings to consider missing values
fh = pd.read_csv("dataset/Emp2HeadMiss.csv", na_values=['NULL'])
# we can define NA values for each columns as well
naValues = {'LNAME':['NA', 'NaN', 'NULL', '|'], 'FNAME':['NULL']}
fh = pd.read_csv("dataset/Emp2HeadMiss.csv", na_values=naValues)
print(fh)
LNAME FNAME JOB TITLE DEPARTMENT \ 0 AARON ELVIA J NaN WATER MGMNT 1 NaN JEFFERY M POLICE OFFICER POLICE 2 AARON NaN CHIEF CONTRACT EXPEDITER NaN 3 ABAD JR VICENTE M CIVIL ENGINEER IV WATER MGMNT 4 NaN ROBERT J NaN NaN 5 ABBATE TERRY M POLICE OFFICER POLICE 6 NaN JAMES J NaN FIRE 7 NaN NaN FOSTER GRANDPARENT FAMILY & SUPPORT 8 NaN LYNISE M CLERK III FAMILY & SUPPORT 9 NaN SAM J ELECTRICAL MECHANIC TRANSPORTN 10 ABDELHADI ABDALMAHD POLICE OFFICER POLICE 11 NaN AREF R NaN FIRE EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS 0 NaN $73862.00 1 $74628.00 $74628.00 2 $77280.00 $70174.00 3 $96276.00 NaN 4 $84032.00 $76627.00 5 $79926.00 NaN 6 NaN $77238.00 7 NaN $2756.00 8 $38568.00 $38568.00 9 $84032.00 $76627.00 10 $71040.00 $71040.00 11 NaN $86922.00
path String indicating filesystem location, URL, or file-like object
sep or delimiter Character sequence or regular expression to use to split fields in each row
header Row number to use as column names. Defaults to 0 (first row), but should be None if there is no header row
index_col Column numbers or names to use as the row index in the result. Can be a single name/number or a list of them for a hierarchical index
names List of column names for result, combine with header=None
skiprows Number of rows at beginning of file to ignore or list of row numbers (starting from 0) to skip
na_values Sequence of values to replace with NA
comment Character or characters to split comments off the end of lines
parse_dates Attempt to parse data to datetime; False by default. If True, will attempt to parse all columns. Otherwise can specify a list of column numbers or name to parse. If element of list is tuple or list, will combine multiple columns together and parse to date (for example if date/time split across two columns)
keep_date_col If joining columns to parse a date, keep the original joined columns in the result. Default False
converters Dict containing column number of name mapping to functions. For example {'foo': f} would apply the function f to all values in the 'foo' column
dayfirst When parsing potentially ambiguous dates, treat as international format (e.g. 7/6/2012 -> June 7, 2012). Default False
date_parser Function to use to parse dates
nrows Number of rows to read from beginning of file
iterator Return a TextParser object for reading file piecemeal
chunksize For iteration, size of file chunks
skip_footer Number of lines to ignore at end of file
verbose Print various parser output information, like the number of missing values placed in non-numeric columns
encoding Text encoding for unicode. For example 'utf-8' for UTF-8 encoded text
squeeze If the parsed data only contains one column return a Series
thousands Separator for thousands, e.g. ',' or '.'
fh = pd.read_csv("dataset/Employees.csv")
len(fh)
34218
fh.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 34218 entries, 0 to 34217 Data columns (total 6 columns): LNAME 34218 non-null object FNAME 34218 non-null object JOB TITLE 34218 non-null object DEPARTMENT 34218 non-null object EMPLOYEE ANNUAL SALARY 34218 non-null object ESTIMATED ANNUAL SALARY MINUS FURLOUGHS 34218 non-null object dtypes: object(6) memory usage: 1.6+ MB
fh.describe()
LNAME | FNAME | JOB TITLE | DEPARTMENT | EMPLOYEE ANNUAL SALARY | ESTIMATED ANNUAL SALARY MINUS FURLOUGHS | |
---|---|---|---|---|---|---|
count | 34218 | 34218 | 34218 | 34218 | 34218 | 34218 |
unique | 14431 | 12188 | 1124 | 39 | 1028 | 1088 |
top | WILLIAMS | MICHAEL J | POLICE OFFICER | POLICE | $77238.00 | $77238.00 |
freq | 277 | 309 | 10918 | 14147 | 3028 | 3028 |
# if want to read few lines from file
fh = pd.read_csv("dataset/Employees.csv", nrows=5)
print(fh)
LNAME FNAME JOB TITLE DEPARTMENT \ 0 AARON ELVIA J WATER RATE TAKER WATER MGMNT 1 AARON JEFFERY M POLICE OFFICER POLICE 2 AARON KIMBERLEI R CHIEF CONTRACT EXPEDITER FLEET MANAGEMNT 3 ABAD JR VICENTE M CIVIL ENGINEER IV WATER MGMNT 4 ABBATACOLA ROBERT J ELECTRICAL MECHANIC WATER MGMNT EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS 0 $81000.00 $73862.00 1 $74628.00 $74628.00 2 $77280.00 $70174.00 3 $96276.00 $96276.00 4 $84032.00 $76627.00
# To read out a file in pieces, specify a chunksize as a number of rows
chunker = pd.read_csv("dataset/Employees.csv", chunksize=1000)
chunker
<pandas.io.parsers.TextFileReader at 0x178479579e8>
# Count how many 1000-row chunks the TextFileReader yields.
line = 0
for chunk in chunker:
    line += 1
print(line)
35
fh = pd.read_csv("dataset/Employees.csv")
len(fh)
34218
# save the fh as csv (the row index is written as an extra first column by default)
fh.to_csv("dataset/Emp.csv")
# save the fh as excel
# NOTE(review): writing the legacy .xls format requires the xlwt package, which
# modern pandas no longer supports -- prefer an .xlsx target (openpyxl engine).
fh.to_excel("dataset/Emp.xls")
# Other delimiters can be used
# fh.to_csv("dataset/Emp.csv", sep="|")
# Missing values appear as empty strings in the output. You might want to denote them
# by some other sentinel value:
# sentinel = ['NULL']
# data.to_csv(sys.stdout, na_rep=sentinel)
# With no other options specified, both the row and column labels are written. Both of
# these can be disabled:
fh.to_csv("dataset/Emp.csv", header = False, index = False)
# can write column subset as well
cols = ['LNAME', 'FNAME']
fh.to_csv("dataset/Emp.csv", index = False, columns=cols)
dates = pd.date_range('1/1/2000', periods=7)
ts = pd.Series(np.arange(7), index=dates)
ts.to_csv("dataset/times.psv", sep="|")
!more "dataset/times.psv"
2000-01-01|0 2000-01-02|1 2000-01-03|2 2000-01-04|3 2000-01-05|4 2000-01-06|5 2000-01-07|6
# With a bit of wrangling (no header, first column as index), you can read a CSV
# version of a Series back with read_csv.  pandas.Series.from_csv was deprecated
# and removed in pandas 1.0, so use the equivalent read_csv call and take the
# single data column as a Series:
pd.read_csv('dataset/times.psv', sep="|", header=None, index_col=0,
            parse_dates=True).iloc[:, 0]
2000-01-01 0 2000-01-02 1 2000-01-03 2 2000-01-04 3 2000-01-05 4 2000-01-06 5 2000-01-07 6 dtype: int64
# import csv
import csv

# Count every record in the file with the stdlib csv reader.  The 'with'
# statement guarantees the handle is closed; the original open() leaked it.
with open("dataset/Emp.csv") as fh:
    reader = csv.reader(fh)
    count = 0
    for line in reader:
        count = count + 1
print(count)
34219
lines = list(csv.reader(open("dataset/Emp.csv")))
header, values = lines[0], lines[1:]
header
['LNAME', 'FNAME']
data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict.keys()
dict_keys(['LNAME', 'FNAME'])
# Defining a new format with a different delimiter, string quoting convention,
# or line terminator is done by defining a simple subclass of csv.Dialect
f = open("dataset/times.psv")
class my_dialect(csv.Dialect):
    """Pipe-delimited CSV dialect: no quoting, bare-newline line terminator."""
    # Fields are separated by the pipe character.
    delimiter = '|'
    # Never wrap fields in quote characters.
    quoting = csv.QUOTE_NONE
    # Written lines end with '\n'; readers recognize any line ending anyway.
    lineterminator = '\n'
reader = csv.reader(f, dialect=my_dialect)
print(reader)
<_csv.reader object at 0x0000017847958D48>
# or we can define as
reader = csv.reader(open("dataset/times.psv"), delimiter = '|')
for line in reader:
print(line)
['2000-01-01', '0'] ['2000-01-02', '1'] ['2000-01-03', '2'] ['2000-01-04', '3'] ['2000-01-05', '4'] ['2000-01-06', '5'] ['2000-01-07', '6']
delimiter One-character string to separate fields. Defaults to ','.
lineterminator Line terminator for writing, defaults to '\r\n'. Reader ignores this and recognizes cross-platform line terminators.
quotechar Quote character for fields with special characters (like a delimiter). Default is '"'.
quoting Quoting convention. Options include csv.QUOTE_ALL (quote all fields),
csv.QUOTE_MINIMAL (only fields with special characters like the delimiter),
csv.QUOTE_NONNUMERIC, and csv.QUOTE_NONE (no quoting). See Python’s documentation for full details. Defaults to QUOTE_MINIMAL.
skipinitialspace Ignore whitespace after each delimiter. Default False.
doublequote How to handle quoting character inside a field. If True, it is doubled.
escapechar String to escape the delimiter if quoting is set to csv.QUOTE_NONE. Disabled by default
# To write delimited files manually, use csv.writer.
with open('dataset/mydata.csv', 'w') as f:
    # writerows emits the same four records the original wrote one by one.
    csv.writer(f, dialect=my_dialect).writerows([
        ('one', 'two', 'three'),
        ('1', '2', '3'),
        ('4', '5', '6'),
        ('7', '8', '9'),
    ])
obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
{"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""
obj
'\n{"name": "Wes",\n "places_lived": ["United States", "Spain", "Germany"],\n "pet": null,\n "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},\n {"name": "Katie", "age": 33, "pet": "Cisco"}]\n}\n'
import json
data = json.loads(obj)
data
{'name': 'Wes', 'pet': None, 'places_lived': ['United States', 'Spain', 'Germany'], 'siblings': [{'age': 25, 'name': 'Scott', 'pet': 'Zuko'}, {'age': 33, 'name': 'Katie', 'pet': 'Cisco'}]}
asjson = json.dumps(data)
asjson
'{"name": "Wes", "siblings": [{"name": "Scott", "pet": "Zuko", "age": 25}, {"name": "Katie", "pet": "Cisco", "age": 33}], "places_lived": ["United States", "Spain", "Germany"], "pet": null}'
siblings = pd.DataFrame(data['siblings'], columns=['name', 'age'])
siblings
name | age | |
---|---|---|
0 | Scott | 25 |
1 | Katie | 33 |
# lxml pkg
from lxml.html import parse
# from urllib2 import urlopen # for python 2
from urllib.request import urlopen
# this will download some data from net
#parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
parsed = parse(urlopen('http://finance.yahoo.com'))
#parsed = parse(urlopen('http://www.omdbapi.com/'))
doc = parsed.getroot()
print(doc)
<Element html at 0x17847b09958>
# to fetch all hyperlinks <a
links = doc.findall('.//a')
len(links)
49
lnk = links[18]
lnk.get('href')
'https://www.yahoo.com/tech'
lnk.text_content()
'Technology'
# to get all the urls in one liner
urls = [lnk.get('href') for lnk in doc.findall('.//a')]
print(urls[:10])
['https://www.yahoo.com/', 'https://mail.yahoo.com/?.intl=us&.lang=en-US', 'https://www.flickr.com/', 'https://www.tumblr.com/', 'https://www.yahoo.com/news/', 'http://sports.yahoo.com/', 'http://finance.yahoo.com/', 'https://www.yahoo.com/celebrity/', 'https://answers.yahoo.com/', 'https://groups.yahoo.com/']
## Finding the tables in html
tables = doc.findall('.//table')
print(len(tables))
table = tables[0]
print(table)
1 <Element table at 0x17847b21098>
# table will have only 1 table
# get td tag
rows = table.findall('.//tr')
print(rows)
[<Element tr at 0x17847a70a98>]
# For the header as well as the data rows, we want to extract the text from each cell; in
# the case of the header these are th cells and td cells for the data:
def _unpack(row, kind='td'):
elts = row.findall('.//%s' % kind)
return [val.text_content() for val in elts]
# printing table header th tag
_unpack(rows[0], kind='th')
[]
# printing table data td tag
_unpack(rows[0], kind='td')
['', 'Search']
# combining all the code into one func
from pandas.io.parsers import TextParser
def parse_options_data(table):
    """Convert an lxml <table> element into a DataFrame.

    The first <tr>'s <th> cells become the column names; every later
    <tr>'s <td> cells become data rows.  Tables with no data rows made
    TextParser raise EmptyDataError (exactly the traceback shown below
    for Yahoo's single-row search table), so those cases now return an
    empty DataFrame instead of crashing.
    """
    rows = table.findall('.//tr')
    if not rows:
        # No <tr> elements at all -- nothing to parse.
        return pd.DataFrame()
    header = _unpack(rows[0], kind='th')
    data = [_unpack(r) for r in rows[1:]]
    if not data:
        # Header-only table: keep the column names, zero rows.
        return pd.DataFrame(columns=header)
    return TextParser(data, names=header).get_chunk()
data = parse_options_data(table)
print(data)
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in _next_line(self) 1986 try: -> 1987 line = self._check_comments([self.data[self.pos]])[0] 1988 self.pos += 1 IndexError: list index out of range During handling of the above exception, another exception occurred: StopIteration Traceback (most recent call last) C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in _infer_columns(self) 1906 try: -> 1907 line = self._buffered_line() 1908 C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in _buffered_line(self) 1974 else: -> 1975 return self._next_line() 1976 C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in _next_line(self) 1998 except IndexError: -> 1999 raise StopIteration 2000 else: StopIteration: During handling of the above exception, another exception occurred: EmptyDataError Traceback (most recent call last) <ipython-input-59-88fac389ba91> in <module>() ----> 1 data = parse_options_data(table) 2 print(data) <ipython-input-58-92759a96dbe8> in parse_options_data(table) 5 header = _unpack(rows[0], kind='th') 6 data = [_unpack(r) for r in rows[1:]] ----> 7 return TextParser(data, names=header).get_chunk() C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in TextParser(*args, **kwds) 1470 """ 1471 kwds['engine'] = 'python' -> 1472 return TextFileReader(*args, **kwds) 1473 1474 C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds) 643 self.options['has_index_names'] = kwds['has_index_names'] 644 --> 645 self._make_engine(self.engine) 646 647 def close(self): C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine) 803 elif engine == 'python-fwf': 804 klass = FixedWidthFieldParser --> 805 self._engine = klass(self.f, **self.options) 806 807 def _failover_to_python(self): C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in 
__init__(self, f, **kwds) 1606 # infer column indices from self.usecols if is is specified. 1607 self._col_indices = None -> 1608 self.columns, self.num_original_columns = self._infer_columns() 1609 1610 # Now self.columns has the set of columns that we will process. C:\tools\Anaconda3\lib\site-packages\pandas\io\parsers.py in _infer_columns(self) 1910 if not names: 1911 raise EmptyDataError( -> 1912 "No columns to parse from file") 1913 1914 line = names[:] EmptyDataError: No columns to parse from file
from lxml import objectify

# Parse the XML performance report.  Passing the path lets lxml manage the
# file handle itself; the original open() call was never closed.
parsed = objectify.parse("dataset/Performance_MNR.xml")
root = parsed.getroot()
print(root)
data = []
# Bookkeeping fields dropped from every record.
skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
               'DESIRED_CHANGE', 'DECIMAL_PLACES']
# Each INDICATOR element becomes one dict of tag -> typed value (pyval).
for elt in root.INDICATOR:
    el_data = {}
    # Iterating the element directly replaces the deprecated getchildren().
    for child in elt:
        if child.tag in skip_fields:
            continue
        el_data[child.tag] = child.pyval
    data.append(el_data)
perf = pd.DataFrame(data)
perf.head()
AGENCY_NAME | CATEGORY | DESCRIPTION | FREQUENCY | INDICATOR_NAME | INDICATOR_UNIT | MONTHLY_ACTUAL | MONTHLY_TARGET | PERIOD_MONTH | PERIOD_YEAR | YTD_ACTUAL | YTD_TARGET | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Metro-North Railroad | Service Indicators | Percent of commuter trains that arrive at thei... | M | On-Time Performance (West of Hudson) | % | 96.9 | 95 | 1 | 2008 | 96.9 | 95 |
1 | Metro-North Railroad | Service Indicators | Percent of commuter trains that arrive at thei... | M | On-Time Performance (West of Hudson) | % | 95 | 95 | 2 | 2008 | 96 | 95 |
2 | Metro-North Railroad | Service Indicators | Percent of commuter trains that arrive at thei... | M | On-Time Performance (West of Hudson) | % | 96.9 | 95 | 3 | 2008 | 96.3 | 95 |
3 | Metro-North Railroad | Service Indicators | Percent of commuter trains that arrive at thei... | M | On-Time Performance (West of Hudson) | % | 98.3 | 95 | 4 | 2008 | 96.8 | 95 |
4 | Metro-North Railroad | Service Indicators | Percent of commuter trains that arrive at thei... | M | On-Time Performance (West of Hudson) | % | 95.8 | 95 | 5 | 2008 | 96.6 | 95 |