#!/usr/bin/env python
# coding: utf-8

# In[1]:

# imports
import pandas as pd
import numpy as np


# #### Parsing functions in pandas

# ```
# read_csv        Load delimited data from a file, URL, or file-like object. Uses comma as the default delimiter
# read_table      Load delimited data from a file, URL, or file-like object. Uses tab ('\t') as the default delimiter
# read_fwf        Read data in fixed-width column format (that is, no delimiters)
# read_clipboard  Version of read_table that reads data from the clipboard. Useful for converting tables from web pages
# ```

# In[2]:

# reading a CSV file; first check its contents
get_ipython().system('more "dataset/Employees2Head.csv"')


# In[3]:

# this file has a header and the delimiter is a comma (,), so we can read it with read_csv
fh = pd.read_csv("dataset/Employees2Head.csv")
len(fh)  # number of rows in the file, excluding the header


# In[4]:

print(fh)


# In[5]:

# we can read the same file with read_table by passing the sep= option
fh = pd.read_table("dataset/Employees2Head.csv", sep=",")
len(fh)


# In[6]:

# now let's see how to read a file without a header
get_ipython().system('more "dataset/Employees2.csv"')


# In[7]:

fh = pd.read_csv("dataset/Employees2.csv", header=None)
len(fh)


# In[8]:

print(fh)  # check the column names; they have been assigned by pandas itself


# In[9]:

# we can assign our own column names to the dataframe
fh.columns


# In[10]:

fh.columns = ['LNAME', 'FNAME', 'JOB TITLE', 'DEPARTMENT',
              'EMPLOYEE ANNUAL SALARY', 'ESTIMATED ANNUAL SALARY MINUS FURLOUGHS']
print(fh)


# In[11]:

# or we can assign the names while reading the file
fh = pd.read_table("dataset/Employees2.csv", header=None, sep=',',
                   names=['LNAME', 'FNAME', 'JOB TITLE', 'DEPARTMENT',
                          'EMPLOYEE ANNUAL SALARY', 'ESTIMATED ANNUAL SALARY MINUS FURLOUGHS'])
len(fh)


# In[12]:

print(fh)


# In[13]:

# In some cases, a table might not have a fixed delimiter, using whitespace or some other
# pattern to separate fields. In these cases, you can pass a regular expression as the
# delimiter for read_table, for example:
# pd.read_table('ch06/ex3.txt', sep='\s+')


# In[14]:

# skip specific rows while reading
fh = pd.read_table("dataset/Employees2Head.csv", sep=",", skiprows=[0, 1, 2])
len(fh)  # the file has 12 rows in total, but 3 rows were skipped


# #### Handling missing data while reading files

# In[15]:

get_ipython().system('more "dataset/Emp2HeadMiss.csv"')


# In[16]:

# reading the file
fh = pd.read_csv("dataset/Emp2HeadMiss.csv")
print(fh)


# In[17]:

pd.isnull(fh)  # True denotes a NULL value


# In[18]:

# the na_values option can take a list or set of strings to treat as missing values
fh = pd.read_csv("dataset/Emp2HeadMiss.csv", na_values=['NULL'])


# In[19]:

# we can also define the NA values for each column with a dict
naValues = {'LNAME': ['NA', 'NaN', 'NULL', '|'], 'FNAME': ['NULL']}
fh = pd.read_csv("dataset/Emp2HeadMiss.csv", na_values=naValues)
print(fh)
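# In[ ]:

# A minimal, self-contained sketch of the regular-expression delimiter mentioned in
# In[13]. Since ch06/ex3.txt is not part of this dataset, the example builds an
# in-memory whitespace-separated table with io.StringIO as a stand-in for a file:
from io import StringIO

raw = """A   B   C
aaa 1.0 2.5
bbb 3.2 4.1
ccc 5.5 6.0"""
# r'\s+' matches one or more whitespace characters, so ragged spacing still parses
df_ws = pd.read_table(StringIO(raw), sep=r'\s+')
print(df_ws)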
# #### read_csv / read_table function arguments

# ```
# path             String indicating filesystem location, URL, or file-like object
# sep or delimiter Character sequence or regular expression to use to split fields in each row
# header           Row number to use as column names. Defaults to 0 (first row); should be None if there is no header row
# index_col        Column numbers or names to use as the row index in the result. Can be a single name/number or a list of them for a hierarchical index
# names            List of column names for the result; combine with header=None
# skiprows         Number of rows at the beginning of the file to ignore, or a list of row numbers (starting from 0) to skip
# na_values        Sequence of values to replace with NA
# comment          Character or characters to split comments off the end of lines
# parse_dates      Attempt to parse data to datetime; False by default. If True, will attempt to parse all columns. Otherwise can specify a list of column numbers or names to parse. If an element of the list is a tuple or list, will combine multiple columns together and parse to date (for example, if date/time is split across two columns)
# keep_date_col    If joining columns to parse a date, keep the original columns. Default False, so the source columns are dropped after combining
# converters       Dict mapping column numbers or names to functions. For example, {'foo': f} would apply the function f to all values in the 'foo' column
# dayfirst         When parsing potentially ambiguous dates, treat them as international format (e.g. 7/6/2012 -> June 7, 2012). Default False
# date_parser      Function to use to parse dates
# nrows            Number of rows to read from the beginning of the file
# iterator         Return a TextParser object for reading the file piecemeal
# chunksize        For iteration, the size of file chunks
# skip_footer      Number of lines to ignore at the end of the file
# verbose          Print various parser output information, like the number of missing values placed in non-numeric columns
# encoding         Text encoding for unicode. For example, 'utf-8' for UTF-8 encoded text
# squeeze          If the parsed data contains only one column, return a Series
# thousands        Separator for thousands, e.g. ',' or '.'
# ```

# ### Reading Text Files in Pieces

# In[20]:

fh = pd.read_csv("dataset/Employees.csv")
len(fh)


# In[21]:

fh.info()


# In[22]:

fh.describe()


# In[23]:

# if we want to read only a few lines from the file
fh = pd.read_csv("dataset/Employees.csv", nrows=5)
print(fh)


# In[24]:

# to read the file in pieces, specify a chunksize as a number of rows
chunker = pd.read_csv("dataset/Employees.csv", chunksize=1000)
chunker


# In[25]:

# each iteration yields a DataFrame of up to 1000 rows; count the chunks
chunks = 0
for data in chunker:
    chunks = chunks + 1
print(chunks)


# #### Writing Data Out to Text Format

# In[26]:

fh = pd.read_csv("dataset/Employees.csv")
len(fh)


# In[27]:

# save fh as CSV
fh.to_csv("dataset/Emp.csv")


# In[28]:

# save fh as an Excel file
fh.to_excel("dataset/Emp.xls")


# In[29]:

# other delimiters can be used
# fh.to_csv("dataset/Emp.csv", sep="|")


# In[30]:

# Missing values appear as empty strings in the output. You might want to denote them
# by some other sentinel value; na_rep takes a string:
# data.to_csv(sys.stdout, na_rep='NULL')
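# In[ ]:

# A runnable sketch of the na_rep option shown commented out above, using a small
# made-up frame instead of the Employees data:
import sys

demo = pd.DataFrame({'a': [1.0, None], 'b': ['x', None]})
# missing values are written as the sentinel string instead of empty fields
demo.to_csv(sys.stdout, na_rep='NULL')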
# In[31]:

# With no other options specified, both the row and column labels are written.
# Both of these can be disabled:
fh.to_csv("dataset/Emp.csv", header=False, index=False)


# In[32]:

# we can also write only a subset of the columns
cols = ['LNAME', 'FNAME']
fh.to_csv("dataset/Emp.csv", index=False, columns=cols)


# #### Save Series

# In[33]:

dates = pd.date_range('1/1/2000', periods=7)
ts = pd.Series(np.arange(7), index=dates)
ts.to_csv("dataset/times.psv", sep="|")


# In[34]:

get_ipython().system('more "dataset/times.psv"')


# In[35]:

# With a bit of wrangling (no header, first column as the index), you can read a CSV
# version of a Series with read_csv, but there is also a from_csv convenience method
# that makes it a bit simpler. (Note: Series.from_csv was deprecated and later removed
# in newer pandas; there, use pd.read_csv('dataset/times.psv', sep='|', index_col=0,
# parse_dates=True).squeeze() instead.)
pd.Series.from_csv('dataset/times.psv', parse_dates=True, sep="|")


# #### Manually Working with Delimited Formats

# In[36]:

# the csv module handles any file with a single-character delimiter
import csv

fh = open("dataset/Emp.csv")
reader = csv.reader(fh)
count = 0
for line in reader:
    count = count + 1
print(count)


# In[37]:

lines = list(csv.reader(open("dataset/Emp.csv")))
header, values = lines[0], lines[1:]
header


# In[38]:

# zip(*values) transposes the rows into columns, pairing each column with its header
data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict.keys()


# In[39]:

# Defining a new format with a different delimiter, string quoting convention,
# or line terminator is done by defining a simple subclass of csv.Dialect
f = open("dataset/times.psv")

class my_dialect(csv.Dialect):
    lineterminator = '\n'
    delimiter = '|'
    quoting = csv.QUOTE_NONE

reader = csv.reader(f, dialect=my_dialect)
print(reader)


# In[40]:

# or we can pass the dialect parameters directly as keyword arguments
reader = csv.reader(open("dataset/times.psv"), delimiter='|')
for line in reader:
    print(line)


# #### CSV dialect options
#
# ```
# delimiter        One-character string to separate fields. Defaults to ','
# lineterminator   Line terminator for writing; defaults to '\r\n'. The reader ignores this and recognizes cross-platform line terminators
# quotechar        Quote character for fields with special characters (like a delimiter). Default is '"'
# quoting          Quoting convention. Options include csv.QUOTE_ALL (quote all fields), csv.QUOTE_MINIMAL (only fields with special characters like the delimiter), csv.QUOTE_NONNUMERIC, and csv.QUOTE_NONE (no quoting). See Python's documentation for full details. Defaults to QUOTE_MINIMAL
# skipinitialspace Ignore whitespace after each delimiter. Default False
# doublequote      How to handle the quoting character inside a field. If True, it is doubled
# escapechar       String to escape the delimiter if quoting is set to csv.QUOTE_NONE. Disabled by default
# ```
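# In[ ]:

# Dialects can also be registered by name with csv.register_dialect, which avoids
# writing a subclass; the name 'pipes' here is just illustrative:
csv.register_dialect('pipes', delimiter='|', quoting=csv.QUOTE_NONE,
                     lineterminator='\n')
with open("dataset/times.psv") as f:
    for row in csv.reader(f, dialect='pipes'):
        print(row)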
# In[41]:

# To write delimited files manually, use csv.writer
with open('dataset/mydata.csv', 'w') as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerow(('one', 'two', 'three'))
    writer.writerow(('1', '2', '3'))
    writer.writerow(('4', '5', '6'))
    writer.writerow(('7', '8', '9'))


# ### JSON Data

# In[42]:

obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
              {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""
obj


# In[43]:

import json
data = json.loads(obj)
data


# In[44]:

# json.dumps converts a Python object back to a JSON string
asjson = json.dumps(data)
asjson


# In[45]:

# a list of dicts (here, the siblings) converts naturally to a DataFrame
siblings = pd.DataFrame(data['siblings'], columns=['name', 'age'])
siblings


# ### XML and HTML: Web Scraping

# In[46]:

# the lxml package
from lxml.html import parse
# from urllib2 import urlopen  # for Python 2
from urllib.request import urlopen


# In[47]:

# this will download some data from the net
# parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
parsed = parse(urlopen('http://finance.yahoo.com'))
# parsed = parse(urlopen('http://www.omdbapi.com/'))
doc = parsed.getroot()


# In[48]:

print(doc)


# In[49]:

# to fetch all hyperlinks (every <a> element in the document)
links = doc.findall('.//a')
len(links)
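# In[ ]:

# A short follow-on sketch, assuming the download in In[47] succeeded and links was
# built above: each link's target is read with lxml's element attribute API
hrefs = [lnk.get('href') for lnk in links if lnk.get('href') is not None]
hrefs[:10]  # the first ten link targets on the page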