import os
import numpy as np
import csv
# Report the versions of the csv module and NumPy used for this walkthrough.
# The parenthesised single-argument form prints identically on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(csv.__version__)
print(np.__version__)
1.0 1.14.2
# Open the CSV file, parse it with ';' as the field delimiter, and
# materialise every row (header included) into the list "wines".
DATA_DIR = '../data'
with open(os.path.abspath(os.path.join(DATA_DIR, 'day7/winequality-white.csv')), 'r') as datafile:
    reader = csv.reader(datafile, delimiter=";")
    # csv.reader is lazy, so the rows must be consumed while the file is still open.
    wines = list(reader)
# The first row holds the column names.
wines[0]
['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']
# Subtract 1 so that the header row is not counted as a record.
# print(...) with a single argument works on both Python 2 and Python 3.
print('Total records: {}'.format(len(wines) - 1))
Total records: 4898
# Average of the quality variable: the last field of every row, header skipped.
# The csv module yields strings, so each value is converted to float first.
qualities = [float(item[-1]) for item in wines[1:]]
# print(...) with a single argument works on both Python 2 and Python 3.
print(sum(qualities) / len(qualities))
5.87790935076
# Drop the header row, then turn the remaining list of string rows into a
# 2-D NumPy array whose cells are parsed as floats.
wines = wines[1:]
wines_np = np.array(wines, dtype=float)
# (number of rows, number of columns)
wines_np.shape
(4898, 12)
# Display the 2-D wines array built from the csv rows.
wines_np
array([[ 7. , 0.27, 0.36, ..., 0.45, 8.8 , 6. ], [ 6.3 , 0.3 , 0.34, ..., 0.49, 9.5 , 6. ], [ 8.1 , 0.28, 0.4 , ..., 0.44, 10.1 , 6. ], ..., [ 6.5 , 0.24, 0.19, ..., 0.46, 9.4 , 6. ], [ 5.5 , 0.29, 0.3 , ..., 0.38, 12.8 , 7. ], [ 6. , 0.21, 0.38, ..., 0.32, 11.8 , 6. ]])
# Create a 2x2 NumPy array in which every element is zero (float dtype).
# print(...) with a single argument works on both Python 2 and Python 3.
print(np.zeros((2, 2), dtype='float'))
[[0. 0.] [0. 0.]]
# Create a 2x2 NumPy array filled with random numbers.
# print(...) with a single argument works on both Python 2 and Python 3.
print(np.random.rand(2, 2))
[[0.79301112 0.95735914] [0.29987175 0.96456405]]
# Load the same dataset directly with NumPy: skip the single header line
# and split the fields on ';'.
wines = np.genfromtxt(
    os.path.abspath(os.path.join(DATA_DIR, 'day7/winequality-white.csv')),
    skip_header=1,
    delimiter=";",
)
# Element-wise comparison with the array built earlier via the csv module.
wines == wines_np
array([[ True, True, True, ..., True, True, True], [ True, True, True, ..., True, True, True], [ True, True, True, ..., True, True, True], ..., [ True, True, True, ..., True, True, True], [ True, True, True, ..., True, True, True], [ True, True, True, ..., True, True, True]])
# Access the value in the 3rd row, 2nd column of wines (indices are zero-based),
# then check that it equals the same cell of the array built via the csv module.
# print(...) with a single argument works on both Python 2 and Python 3.
print(wines[2, 1])
print(wines[2, 1] == wines_np[2, 1])
0.28 True
# Access the first 4 rows of the 3rd column.
# Syntax: wines[start:end, column_index]
# Indices start at zero and the end of a slice is exclusive, so :4 selects rows 0, 1, 2 and 3.
wines[:4,2]
array([0.36, 0.34, 0.4 , 0.32])
# Overwrite the existing values of the 3rd column (index 2) with 10.0 for all rows,
# then show the first 4 rows of that column to confirm the assignment.
wines[:, 2] = 10.0
wines[:4, 2]
array([10., 10., 10., 10.])
# Build a 1-D NumPy array holding 5 random floats, then display it.
random_1d = np.random.random_sample((5,))
random_1d
array([0.28855438, 0.95129591, 0.80747318, 0.89765623, 0.98632739])
# Build a 3-D NumPy array of random floats.
# One way to read the shape: 2 years x 4 quarters per year x 3 months per
# quarter, i.e. 2 * 4 * 3 = 24 monthly values.
random_3d = np.random.random_sample(size=(2, 4, 3))
random_3d.shape
(2, 4, 3)
# Data types in NumPy:
# cast every cell of wines to an integer dtype and display the result.
wines.astype(int)
array([[ 7, 0, 10, ..., 0, 8, 6], [ 6, 0, 10, ..., 0, 9, 6], [ 8, 0, 10, ..., 0, 10, 6], ..., [ 6, 0, 10, ..., 0, 9, 6], [ 5, 0, 10, ..., 0, 12, 7], [ 6, 0, 10, ..., 0, 11, 6]])
# Add a scalar to one column (index 11) across all rows.
# The other arithmetic operations can be applied in the same way, as shown below.
# print(...) with a single argument works on both Python 2 and Python 3.
print(wines[:, 11])
print(wines[:, 11] + 1)
[6. 6. 6. ... 6. 7. 6.] [7. 7. 7. ... 7. 8. 7.]
# Multiply two columns element-wise.
# This example multiplies the 12th column (index 11) by itself, i.e. squares it.
wines[:, 11] * wines[: , 11]
array([36., 36., 36., ..., 36., 49., 36.])
# Sum a single column (here index 11) across all rows.
wines[:, 11].sum(axis=0)
28790.0
wines[:, 11].mean()  # std, min and max are among the other methods available for fast stats computation
5.87790935075541