012_importing_datasets¶

In [ ]:

# Required Imports
import pandas as pd
import sklearn as sk
import sqlite3
from pandas.io import sql

In [ ]:

# Load a CSV file from the local filesystem.
# NOTE: each path below must really contain a file named 'whereisthatdataset.csv'.

# Path given relative to the current working directory
df1 = pd.read_csv('./assets/whereisthatdataset.csv')
# Path given in full, starting from the filesystem root
df2 = pd.read_csv('/home/arunava/Datasets/whereisthatdataset.csv')

df1.head(n=3)

In [ ]:

# A file that ships without a header row must be read with `header=None`.
# NOTE: this dataset does come with a header row, so passing `header=None`
#       here makes pandas keep that first row as part of the data.

df1 = pd.read_csv(
    './assets/whereisthatdataset.csv',
    header=None,
)
df1.head(n=3)

In [ ]:

# Supply your own column names while importing.
#   * File already has a header row (as this dataset does): pass `header=0`
#     together with `names`, so the original header line is replaced instead
#     of being kept as the first data row.
#   * File has no header row: pass `header=None` together with `names`.
df1 = pd.read_csv(
    './assets/whereisthatdataset.csv',
    header=0,
    names=['Where', 'on', 'earth', 'did', 'you', 'got', 'this', 'dataset', 'of', 'Pigeons', 'racing'],
)
df1.head(3)

In [ ]:

# read_csv also accepts a URL in place of a local path
dataset_url = 'https://raw.githubusercontent.com/iArunava/Python-TheNoTheoryGuide/master/assets/whereisthatdataset.csv'
df1 = pd.read_csv(dataset_url)
df1.head(n=3)

In [ ]:

# Load delimited data from a plain text file.
# NOTE: `sep` tells pandas which character separates the fields.

txt_path = './assets/whereisthatdataset.txt'
df1 = pd.read_table(txt_path, sep=',')
df2 = pd.read_csv(txt_path, sep=',')
df1.head(n=3)

In [ ]:

# Read an Excel file
# NOTE: you need the 'xlrd' module to read legacy .xls files.
# `sheet_name` selects the worksheet; the older `sheetname` spelling was
# deprecated and later removed from pandas, so it no longer works.
# `skiprows=1` skips the first row of the sheet before parsing.

df1 = pd.read_excel('./assets/whereisthatdataset.xls', sheet_name='whereisthatdataset', skiprows=1)
df1.head(3)

In [ ]:

# Load a SAS dataset (.sas7bdat file)
sas_path = './assets/whereisthatdataset.sas7bdat'
df1 = pd.read_sas(sas_path)
df1.head(n=3)

In [ ]:

# Read a SQL table into a DataFrame
# The whole table is fetched with a single SELECT. The connection is closed
# in `finally`, so the database file handle is released even if the query fails.
# NOTE(review): `conn` is closed after this cell; reopen it if a later cell needs it.

conn  = sqlite3.connect ('./assets/whereisthatdataset.db')
query = 'SELECT * FROM whereisthattable;'
try:
    df1 = pd.read_sql(query, con=conn)
finally:
    conn.close()
df1.head(3)

In [ ]:

# Load only a sample of the rows and columns
# nrows:   how many rows of the file to read
# usecols: the columns to keep, given either all as integer positions
#          or all as column-label strings

csv_path = './assets/whereisthatdataset.csv'
sdf1 = pd.read_csv(csv_path, nrows=4, usecols=[1, 5, 7])                      # by position
sdf2 = pd.read_csv(csv_path, nrows=4, usecols=['Breeder', 'Sex', 'Arrival'])  # by label
sdf1

In [ ]:

# Skip leading rows while importing
# NOTE: without `header=None`, pandas would use the first row that remains
#       after skipping as the header row.

df1 = pd.read_csv(
    './assets/whereisthatdataset.csv',
    header=None,
    skiprows=5,
)
df1.head(n=3)

In [ ]:

# Declare which values count as missing
# na_values: a list of values; any of them found in the dataset is read as missing

missing_markers = ['NaN']
df1 = pd.read_csv('./assets/whereisthatdataset.csv', na_values=missing_markers)
df1.head(n=3)