In [1]:

import os
import numpy as np
import csv

# Record the library versions this notebook was run with.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(csv.__version__)
print(np.__version__)

1.0
1.14.2

In [2]:

# Read the semicolon-delimited white-wine CSV with the csv module.
# The parsed rows are collected into the list `wines`; its first row
# is the header line of the file.

DATA_DIR = '../data'

wine_csv_path = os.path.abspath(os.path.join(DATA_DIR, 'day7/winequality-white.csv'))
with open(wine_csv_path, 'r') as datafile:
    reader = csv.reader(datafile, delimiter=";")
    wines = list(reader)

In [3]:

# first parsed row of the CSV: the column headers
wines[0]

Out[3]:

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [4]:

# len(wines) also counts the header row, so subtract 1 to get the number of data records.
# print(...) with a single argument works on both Python 2 and Python 3.
print('Total records: {}'.format(len(wines)-1))

Total records: 4898

In [5]:

# Mean of the quality column (the last field of every row).
# The header row is skipped and the string values are converted to float first.
qualities = [float(item[-1]) for item in wines[1:]]
# print(...) with a single argument works on both Python 2 and Python 3.
print(sum(qualities)/len(qualities))

5.87790935076

In [6]:

# Convert the data rows of the Python list to a numpy array of floats.
# The header row is skipped with an inline slice instead of rebinding `wines`:
# rebinding (`wines = wines[1:]`) would drop one more row every time this
# cell is re-run, silently shrinking the data.
wines_np = np.array(wines[1:], dtype='float')
wines_np.shape

Out[6]:

(4898, 12)

In [7]:

# display the 2-D float array built from the csv rows
# (the notebook abbreviates large arrays with '...')
wines_np

Out[7]:

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]])

In [8]:

# Create a 2x2 numpy array filled with float zeros.
# print(...) with a single argument works on both Python 2 and Python 3.
print(np.zeros((2,2), dtype='float'))

[[0. 0.]
 [0. 0.]]

In [9]:

# Create a 2x2 numpy array of random numbers (values differ on every run,
# since no seed is set).
# print(...) with a single argument works on both Python 2 and Python 3.
print(np.random.rand(2,2))

[[0.79301112 0.95735914]
 [0.29987175 0.96456405]]

In [10]:

# Load the same CSV directly with numpy instead of the csv module:
# fields are separated by ';' and the single header line is skipped.
# Note that this rebinds `wines` to the resulting numpy array.
white_wine_csv = os.path.abspath(os.path.join(DATA_DIR, 'day7/winequality-white.csv'))
wines = np.genfromtxt(white_wine_csv, delimiter=";", skip_header=1)

In [11]:

# element-wise comparison of the genfromtxt array with the csv-built array;
# the result is a boolean array with one entry per cell
wines == wines_np

Out[11]:

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [12]:

# Access a single value: row index 2, column index 1 (3rd row, 2nd column).
# print(...) with a single argument works on both Python 2 and Python 3.
print(wines[2, 1])
# the same cell of the csv-built array is compared for equality
print(wines[2, 1] == wines_np[2, 1])

0.28
True

In [13]:

# select the first 4 rows of the 3rd column (column index 2)
# general form: wines[start:end, column_index]
# indexing starts at zero and the end of a slice is exclusive,
# so :4 selects rows 0, 1, 2 and 3
wines[:4,2]

Out[13]:

array([0.36, 0.34, 0.4 , 0.32])

In [14]:

# overwrite the 3rd column (column index 2) with 10.0 for all rows
# NOTE: this modifies `wines` in place; the original values of that column
# are lost until the data is loaded again
wines[:, 2] = 10.0
# show the first 4 rows of the overwritten column
wines[:4, 2]

Out[14]:

array([10., 10., 10., 10.])

In [15]:

# create a 1-D numpy array holding 5 random values
random_1d = np.random.rand(5)
random_1d

Out[15]:

array([0.28855438, 0.95129591, 0.80747318, 0.89765623, 0.98632739])

In [16]:

# creating 3-D numpy array
random_3d = np.random.rand(2,4,3)

In [17]:

# one way to picture this shape: 2 years x 4 quarters x 3 months per quarter,
# i.e. 2 * 4 * 3 = 24 monthly values
random_3d.shape

Out[17]:

(2, 4, 3)

In [18]:

# Data types in numpy
# astype returns a converted copy of wines with integer values;
# the fractional part is dropped (e.g. 8.8 becomes 8), it is not rounded
wines.astype('int')

Out[18]:

array([[ 7,  0, 10, ...,  0,  8,  6],
       [ 6,  0, 10, ...,  0,  9,  6],
       [ 8,  0, 10, ...,  0, 10,  6],
       ...,
       [ 6,  0, 10, ...,  0,  9,  6],
       [ 5,  0, 10, ...,  0, 12,  7],
       [ 6,  0, 10, ...,  0, 11,  6]])

In [19]:

# Adding a scalar to a column applies the addition to every row of that column.
# The other arithmetic operations can be applied in the same way.
# print(...) with a single argument works on both Python 2 and Python 3.
print(wines[:, 11])
print(wines[:, 11] + 1)

[6. 6. 6. ... 6. 7. 6.]
[7. 7. 7. ... 7. 8. 7.]

In [20]:

# multiplying two columns is done element by element
# here the 12th column (index 11) is multiplied by itself, giving its square
wines[:, 11] * wines[: , 11]

Out[20]:

array([36., 36., 36., ..., 36., 49., 36.])

In [21]:

# sum of one column (index 11) across all rows
wines[:, 11].sum(axis=0)

Out[21]:

28790.0

In [22]:

# mean of the column at index 11 across all rows
wines[:, 11].mean() # std, min and max are further methods for fast statistics

Out[22]:

5.87790935075541