#!/usr/bin/env python
# coding: utf-8

# In[1]:

# imports
import pandas as pd
import numpy as np


# #### Parsing functions in pandas

# ```
# read_csv        Load delimited data from a file, URL, or file-like object. Uses comma as the default delimiter
# read_table      Load delimited data from a file, URL, or file-like object. Uses tab ('\t') as the default delimiter
# read_fwf        Read data in fixed-width column format (that is, no delimiters)
# read_clipboard  Version of read_table that reads data from the clipboard. Useful for converting tables from web pages
# ```

# In[2]:

# reading a CSV file; first check its contents
get_ipython().system('more "dataset/Employees2Head.csv"')


# In[3]:

# this file has a header and the delimiter is a comma (,), so we can read it with read_csv
fh = pd.read_csv("dataset/Employees2Head.csv")
len(fh)  # number of rows in the file, excluding the header


# In[4]:

print(fh)


# In[5]:

# we can read the same file with read_table by passing the sep= option
fh = pd.read_table("dataset/Employees2Head.csv", sep=",")
len(fh)


# In[6]:

# now let's see how to read a file without a header
get_ipython().system('more "dataset/Employees2.csv"')


# In[7]:

fh = pd.read_csv("dataset/Employees2.csv", header=None)
len(fh)


# In[8]:

print(fh)  # check the column names; they have been assigned by pandas itself


# In[9]:

# we can assign our own column names to the dataframe
fh.columns


# In[10]:

fh.columns = ['LNAME', 'FNAME', 'JOB TITLE', 'DEPARTMENT',
              'EMPLOYEE ANNUAL SALARY', 'ESTIMATED ANNUAL SALARY MINUS FURLOUGHS']
print(fh)


# In[11]:

# or we can assign the names while reading the file
fh = pd.read_table("dataset/Employees2.csv", header=None, sep=',',
                   names=['LNAME', 'FNAME', 'JOB TITLE', 'DEPARTMENT',
                          'EMPLOYEE ANNUAL SALARY', 'ESTIMATED ANNUAL SALARY MINUS FURLOUGHS'])
len(fh)


# In[12]:

print(fh)


# In[13]:

# In some cases, a table might not have a fixed delimiter, using whitespace or some other
# pattern to separate fields. In these cases, you can pass a regular expression as the
# delimiter for read_table, for example:
# pd.read_table('ch06/ex3.txt', sep='\s+')


# In[14]:

# skip specific rows while reading
fh = pd.read_table("dataset/Employees2Head.csv", sep=",", skiprows=[0, 1, 2])
len(fh)  # the file has 12 rows in total, but 3 rows were skipped


# #### Handling missing data while reading files

# In[15]:

get_ipython().system('more "dataset/Emp2HeadMiss.csv"')


# In[16]:

# reading the file
fh = pd.read_csv("dataset/Emp2HeadMiss.csv")
print(fh)


# In[17]:

pd.isnull(fh)  # True denotes a NULL value


# In[18]:

# the na_values option can take a list or set of strings to treat as missing values
fh = pd.read_csv("dataset/Emp2HeadMiss.csv", na_values=['NULL'])


# In[19]:

# we can also define the NA values for each column with a dict
naValues = {'LNAME': ['NA', 'NaN', 'NULL', '|'], 'FNAME': ['NULL']}
fh = pd.read_csv("dataset/Emp2HeadMiss.csv", na_values=naValues)
print(fh)
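# In[ ]:

# A minimal, self-contained sketch of the regular-expression delimiter mentioned in
# In[13]. Since ch06/ex3.txt is not part of this dataset, the example builds an
# in-memory whitespace-separated table with io.StringIO as a stand-in for a file:
from io import StringIO

raw = """A   B   C
aaa 1.0 2.5
bbb 3.2 4.1
ccc 5.5 6.0"""
# r'\s+' matches one or more whitespace characters, so ragged spacing still parses
df_ws = pd.read_table(StringIO(raw), sep=r'\s+')
print(df_ws)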
# #### read_csv / read_table function arguments

# ```
# path             String indicating filesystem location, URL, or file-like object
# sep or delimiter Character sequence or regular expression to use to split fields in each row
# header           Row number to use as column names. Defaults to 0 (first row); should be None if there is no header row
# index_col        Column numbers or names to use as the row index in the result. Can be a single name/number or a list of them for a hierarchical index
# names            List of column names for the result; combine with header=None
# skiprows         Number of rows at the beginning of the file to ignore, or a list of row numbers (starting from 0) to skip
# na_values        Sequence of values to replace with NA
# comment          Character or characters to split comments off the end of lines
# parse_dates      Attempt to parse data to datetime; False by default. If True, will attempt to parse all columns. Otherwise can specify a list of column numbers or names to parse. If an element of the list is a tuple or list, will combine multiple columns together and parse to date (for example, if date/time is split across two columns)
# keep_date_col    If joining columns to parse a date, keep the original columns. Default False, so the source columns are dropped after combining
# converters       Dict mapping column numbers or names to functions. For example, {'foo': f} would apply the function f to all values in the 'foo' column
# dayfirst         When parsing potentially ambiguous dates, treat them as international format (e.g. 7/6/2012 -> June 7, 2012). Default False
# date_parser      Function to use to parse dates
# nrows            Number of rows to read from the beginning of the file
# iterator         Return a TextParser object for reading the file piecemeal
# chunksize        For iteration, the size of file chunks
# skip_footer      Number of lines to ignore at the end of the file
# verbose          Print various parser output information, like the number of missing values placed in non-numeric columns
# encoding         Text encoding for unicode. For example, 'utf-8' for UTF-8 encoded text
# squeeze          If the parsed data contains only one column, return a Series
# thousands        Separator for thousands, e.g. ',' or '.'
# ```

# ### Reading Text Files in Pieces

# In[20]:

fh = pd.read_csv("dataset/Employees.csv")
len(fh)


# In[21]:

fh.info()


# In[22]:

fh.describe()


# In[23]:

# if we want to read only a few lines from the file
fh = pd.read_csv("dataset/Employees.csv", nrows=5)
print(fh)


# In[24]:

# to read the file in pieces, specify a chunksize as a number of rows
chunker = pd.read_csv("dataset/Employees.csv", chunksize=1000)
chunker


# In[25]:

# each iteration yields a DataFrame of up to 1000 rows; count the chunks
chunks = 0
for data in chunker:
    chunks = chunks + 1
print(chunks)


# #### Writing Data Out to Text Format

# In[26]:

fh = pd.read_csv("dataset/Employees.csv")
len(fh)


# In[27]:

# save fh as CSV
fh.to_csv("dataset/Emp.csv")


# In[28]:

# save fh as an Excel file
fh.to_excel("dataset/Emp.xls")


# In[29]:

# other delimiters can be used
# fh.to_csv("dataset/Emp.csv", sep="|")


# In[30]:

# Missing values appear as empty strings in the output. You might want to denote them
# by some other sentinel value; na_rep takes a string:
# data.to_csv(sys.stdout, na_rep='NULL')
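# In[ ]:

# A runnable sketch of the na_rep option shown commented out above, using a small
# made-up frame instead of the Employees data:
import sys

demo = pd.DataFrame({'a': [1.0, None], 'b': ['x', None]})
# missing values are written as the sentinel string instead of empty fields
demo.to_csv(sys.stdout, na_rep='NULL')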
# In[31]:

# With no other options specified, both the row and column labels are written.
# Both of these can be disabled:
fh.to_csv("dataset/Emp.csv", header=False, index=False)


# In[32]:

# we can also write only a subset of the columns
cols = ['LNAME', 'FNAME']
fh.to_csv("dataset/Emp.csv", index=False, columns=cols)


# #### Save Series

# In[33]:

dates = pd.date_range('1/1/2000', periods=7)
ts = pd.Series(np.arange(7), index=dates)
ts.to_csv("dataset/times.psv", sep="|")


# In[34]:

get_ipython().system('more "dataset/times.psv"')


# In[35]:

# With a bit of wrangling (no header, first column as the index), you can read a CSV
# version of a Series with read_csv, but there is also a from_csv convenience method
# that makes it a bit simpler. (Note: Series.from_csv was deprecated and later removed
# in newer pandas; there, use pd.read_csv('dataset/times.psv', sep='|', index_col=0,
# parse_dates=True).squeeze() instead.)
pd.Series.from_csv('dataset/times.psv', parse_dates=True, sep="|")


# #### Manually Working with Delimited Formats

# In[36]:

# the csv module handles any file with a single-character delimiter
import csv

fh = open("dataset/Emp.csv")
reader = csv.reader(fh)
count = 0
for line in reader:
    count = count + 1
print(count)


# In[37]:

lines = list(csv.reader(open("dataset/Emp.csv")))
header, values = lines[0], lines[1:]
header


# In[38]:

# zip(*values) transposes the rows into columns, pairing each column with its header
data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict.keys()


# In[39]:

# Defining a new format with a different delimiter, string quoting convention,
# or line terminator is done by defining a simple subclass of csv.Dialect
f = open("dataset/times.psv")

class my_dialect(csv.Dialect):
    lineterminator = '\n'
    delimiter = '|'
    quoting = csv.QUOTE_NONE

reader = csv.reader(f, dialect=my_dialect)
print(reader)


# In[40]:

# or we can pass the dialect parameters directly as keyword arguments
reader = csv.reader(open("dataset/times.psv"), delimiter='|')
for line in reader:
    print(line)


# #### CSV dialect options
#
# ```
# delimiter        One-character string to separate fields. Defaults to ','
# lineterminator   Line terminator for writing; defaults to '\r\n'. The reader ignores this and recognizes cross-platform line terminators
# quotechar        Quote character for fields with special characters (like a delimiter). Default is '"'
# quoting          Quoting convention. Options include csv.QUOTE_ALL (quote all fields), csv.QUOTE_MINIMAL (only fields with special characters like the delimiter), csv.QUOTE_NONNUMERIC, and csv.QUOTE_NONE (no quoting). See Python's documentation for full details. Defaults to QUOTE_MINIMAL
# skipinitialspace Ignore whitespace after each delimiter. Default False
# doublequote      How to handle the quoting character inside a field. If True, it is doubled
# escapechar       String to escape the delimiter if quoting is set to csv.QUOTE_NONE. Disabled by default
# ```
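# In[ ]:

# Dialects can also be registered by name with csv.register_dialect, which avoids
# writing a subclass; the name 'pipes' here is just illustrative:
csv.register_dialect('pipes', delimiter='|', quoting=csv.QUOTE_NONE,
                     lineterminator='\n')
with open("dataset/times.psv") as f:
    for row in csv.reader(f, dialect='pipes'):
        print(row)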
# In[41]:

# To write delimited files manually, use csv.writer
with open('dataset/mydata.csv', 'w') as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerow(('one', 'two', 'three'))
    writer.writerow(('1', '2', '3'))
    writer.writerow(('4', '5', '6'))
    writer.writerow(('7', '8', '9'))


# ### JSON Data

# In[42]:

obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
              {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""
obj


# In[43]:

import json
data = json.loads(obj)
data


# In[44]:

# json.dumps converts a Python object back to a JSON string
asjson = json.dumps(data)
asjson


# In[45]:

# a list of dicts (here, the siblings) converts naturally to a DataFrame
siblings = pd.DataFrame(data['siblings'], columns=['name', 'age'])
siblings


# ### XML and HTML: Web Scraping

# In[46]:

# the lxml package
from lxml.html import parse
# from urllib2 import urlopen  # for Python 2
from urllib.request import urlopen


# In[47]:

# this will download some data from the net
# parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
parsed = parse(urlopen('http://finance.yahoo.com'))
# parsed = parse(urlopen('http://www.omdbapi.com/'))
doc = parsed.getroot()


# In[48]:

print(doc)


# In[49]:

# to fetch all hyperlinks (every <a> element in the document)
links = doc.findall('.//a')
len(links)
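# In[ ]:

# A short follow-on sketch, assuming the download in In[47] succeeded and links was
# built above: each link's target is read with lxml's element attribute API
hrefs = [lnk.get('href') for lnk in links if lnk.get('href') is not None]
hrefs[:10]  # the first ten link targets on the page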