NumPy Introduction

NumPy arrays

In [91]:
import numpy as np
arr = np.array([1,3,4,5,6])
arr
Out[91]:
array([1, 3, 4, 5, 6])
In [8]:
arr.shape
Out[8]:
(5,)
In [9]:
arr.dtype
Out[9]:
dtype('int32')
In [10]:
arr = np.array([1,'st','er',3])
arr.dtype
Out[10]:
dtype('<U11')
In [5]:
np.sum(arr)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-5-883ceacaba8a> in <module>()
----> 1 np.sum(arr)

C:\Users\sharmatu\AppData\Local\Continuum\Anaconda\envs\Python3.5\lib\site-packages\numpy\core\fromnumeric.py in sum(a, axis, dtype, out, keepdims)
   1812             return sum(axis=axis, dtype=dtype, out=out, **kwargs)
   1813     return _methods._sum(a, axis=axis, dtype=dtype,
-> 1814                          out=out, **kwargs)
   1815 
   1816 

C:\Users\sharmatu\AppData\Local\Continuum\Anaconda\envs\Python3.5\lib\site-packages\numpy\core\_methods.py in _sum(a, axis, dtype, out, keepdims)
     30 
     31 def _sum(a, axis=None, dtype=None, out=None, keepdims=False):
---> 32     return umr_sum(a, axis, dtype, out, keepdims)
     33 
     34 def _prod(a, axis=None, dtype=None, out=None, keepdims=False):

TypeError: cannot perform reduce with flexible type
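
The reduction fails because the mixed list was upcast to a flexible Unicode dtype ('<U11'), and NumPy cannot sum string data. One way to recover numeric behavior is to rebuild the array with a numeric dtype; a minimal sketch (the filtering step is just one possible choice, not part of the original notebook):

arr = np.array([1, 'st', 'er', 3])
# keep only the entries that look like integers, then cast to a numeric dtype
numeric = np.array([int(x) for x in arr if str(x).lstrip('-').isdigit()], dtype=np.int64)
numeric.sum()   # -> 4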

Creating arrays

In [11]:
arr = np.array([[1,2,3],[2,4,6],[8,8,8]])
arr.shape
Out[11]:
(3, 3)
In [12]:
arr
Out[12]:
array([[1, 2, 3],
       [2, 4, 6],
       [8, 8, 8]])
In [13]:
arr = np.zeros((2,4))
arr
Out[13]:
array([[ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.]])
In [14]:
arr = np.ones((2,4))
arr
Out[14]:
array([[ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.]])
In [15]:
arr = np.identity(3)
arr
Out[15]:
array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.]])
In [16]:
arr = np.random.randn(3,4)
arr
Out[16]:
array([[ 0.11069212, -1.3712359 , -0.35438971,  0.03397169],
       [ 0.35755146, -1.15864674,  0.49294546, -0.59452261],
       [ 0.85139437,  0.75329689, -0.57315488, -0.02419983]])
In [17]:
from io import BytesIO
b = BytesIO(b"2,23,33\n32,42,63.4\n35,77,12")
arr = np.genfromtxt(b, delimiter=",")
arr
Out[17]:
array([[  2. ,  23. ,  33. ],
       [ 32. ,  42. ,  63.4],
       [ 35. ,  77. ,  12. ]])
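
A few other common constructors are worth knowing alongside zeros, ones, identity, and genfromtxt; a short sketch:

np.arange(0, 10, 2)    # evenly spaced values with a step: array([0, 2, 4, 6, 8])
np.linspace(0, 1, 5)   # five evenly spaced points between 0 and 1
np.full((2, 3), 7)     # a constant-filled array of the given shape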

Accessing array elements

Simple indexing

In [18]:
arr[1]
Out[18]:
array([ 32. ,  42. ,  63.4])
In [19]:
arr = np.arange(12).reshape(2,2,3)
arr
Out[19]:
array([[[ 0,  1,  2],
        [ 3,  4,  5]],

       [[ 6,  7,  8],
        [ 9, 10, 11]]])
In [20]:
arr[0]
Out[20]:
array([[0, 1, 2],
       [3, 4, 5]])
In [21]:
arr = np.arange(10)
arr[5:]
Out[21]:
array([5, 6, 7, 8, 9])
In [22]:
arr[5:8]
Out[22]:
array([5, 6, 7])
In [23]:
arr[:-5]
Out[23]:
array([0, 1, 2, 3, 4])
In [24]:
arr = np.arange(12).reshape(2,2,3)
arr
Out[24]:
array([[[ 0,  1,  2],
        [ 3,  4,  5]],

       [[ 6,  7,  8],
        [ 9, 10, 11]]])
In [25]:
arr[1:2]
Out[25]:
array([[[ 6,  7,  8],
        [ 9, 10, 11]]])
In [26]:
arr = np.arange(27).reshape(3,3,3)
arr
Out[26]:
array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8]],

       [[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]],

       [[18, 19, 20],
        [21, 22, 23],
        [24, 25, 26]]])
In [27]:
arr[:,:,2]
Out[27]:
array([[ 2,  5,  8],
       [11, 14, 17],
       [20, 23, 26]])
In [28]:
arr[...,2]
Out[28]:
array([[ 2,  5,  8],
       [11, 14, 17],
       [20, 23, 26]])
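
Note that basic slicing returns a view on the original data, while the fancy and boolean indexing shown next return copies; a small sketch of the difference (illustrative values only):

a = np.arange(10)
view = a[2:5]          # basic slice: a view, no data copied
view[0] = 99
a                      # -> array([ 0,  1, 99,  3,  4,  5,  6,  7,  8,  9])

copy = a[[2, 3, 4]]    # fancy indexing: a copy
copy[0] = -1
a                      # unchanged by the second assignment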

Advanced indexing

In [29]:
arr = np.arange(9).reshape(3,3)
arr
Out[29]:
array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])
In [30]:
arr[[0,1,2],[1,0,0]]
Out[30]:
array([1, 3, 6])

Boolean indexing

In [31]:
cities = np.array(["delhi","banglaore","mumbai","chennai","bhopal"])
city_data = np.random.randn(5,3)
city_data
Out[31]:
array([[-0.04941315, -0.41476745, -0.60236098],
       [-1.75033842,  0.62559942, -0.58148095],
       [ 0.43502897, -0.06588454, -0.40865494],
       [-0.53978394, -0.7317352 , -0.66959325],
       [ 0.45550659, -0.53018559, -0.2241479 ]])
In [32]:
city_data[cities =="delhi"]
Out[32]:
array([[-0.04941315, -0.41476745, -0.60236098]])
In [33]:
city_data[city_data >0]
Out[33]:
array([ 0.62559942,  0.43502897,  0.45550659])
In [34]:
city_data[city_data >0] = 0
city_data
Out[34]:
array([[-0.04941315, -0.41476745, -0.60236098],
       [-1.75033842,  0.        , -0.58148095],
       [ 0.        , -0.06588454, -0.40865494],
       [-0.53978394, -0.7317352 , -0.66959325],
       [ 0.        , -0.53018559, -0.2241479 ]])
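
A boolean mask flattens the selection; when the original shape should be preserved, np.where offers a non-destructive alternative to the in-place assignment above, as in this sketch:

data = np.random.randn(3, 3)
clipped = np.where(data > 0, 0, data)   # same effect as data[data > 0] = 0, but returns a new array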

Operations on arrays

In [35]:
arr = np.arange(15).reshape(3,5)
arr
Out[35]:
array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])
In [36]:
arr + 5
Out[36]:
array([[ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]])
In [37]:
arr * 2
Out[37]:
array([[ 0,  2,  4,  6,  8],
       [10, 12, 14, 16, 18],
       [20, 22, 24, 26, 28]])
In [38]:
arr1 = np.arange(15).reshape(5,3)
arr2 = np.arange(5).reshape(5,1)
arr2 + arr1
Out[38]:
array([[ 0,  1,  2],
       [ 4,  5,  6],
       [ 8,  9, 10],
       [12, 13, 14],
       [16, 17, 18]])
In [39]:
arr1
Out[39]:
array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])
In [40]:
arr2
Out[40]:
array([[0],
       [1],
       [2],
       [3],
       [4]])
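
The addition works because the shapes (5, 3) and (5, 1) are broadcast-compatible: dimensions are compared from the right, and each pair must either be equal or contain a 1. A quick check of the rule:

np.broadcast(arr1, arr2).shape                               # -> (5, 3)
np.array_equal(arr1 + arr2, arr1 + np.tile(arr2, (1, 3)))    # broadcasting equals explicit repetition -> True
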
In [41]:
arr1 = np.random.randn(5,3)
arr1
Out[41]:
array([[-0.92631238, -0.75087049,  0.38818842],
       [ 1.34359452, -0.68896739, -0.58429706],
       [ 1.06638747, -0.40104143,  0.99089011],
       [ 0.26232893,  1.4349162 , -0.97503394],
       [ 0.35716111,  0.20198017,  0.08151897]])
In [42]:
np.modf(arr1)
Out[42]:
(array([[-0.92631238, -0.75087049,  0.38818842],
        [ 0.34359452, -0.68896739, -0.58429706],
        [ 0.06638747, -0.40104143,  0.99089011],
        [ 0.26232893,  0.4349162 , -0.97503394],
        [ 0.35716111,  0.20198017,  0.08151897]]), array([[-0., -0.,  0.],
        [ 1., -0., -0.],
        [ 1., -0.,  0.],
        [ 0.,  1., -0.],
        [ 0.,  0.,  0.]]))

Linear algebra using NumPy

In [43]:
A = np.array([[1,2,3],[4,5,6],[7,8,9]])
B = np.array([[9,8,7],[6,5,4],[1,2,3]])
A.dot(B)
Out[43]:
array([[ 24,  24,  24],
       [ 72,  69,  66],
       [120, 114, 108]])
In [44]:
A = np.arange(15).reshape(3,5)
A.T
Out[44]:
array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])
In [45]:
np.linalg.svd(A)
Out[45]:
(array([[-0.15425367,  0.89974393,  0.40824829],
        [-0.50248417,  0.28432901, -0.81649658],
        [-0.85071468, -0.3310859 ,  0.40824829]]),
 array([  3.17420265e+01,   2.72832424e+00,   4.58204637e-16]),
 array([[-0.34716018, -0.39465093, -0.44214167, -0.48963242, -0.53712316],
        [-0.69244481, -0.37980343, -0.06716206,  0.24547932,  0.55812069],
        [ 0.33717486, -0.77044776,  0.28661392,  0.38941603, -0.24275704],
        [-0.36583339,  0.32092943, -0.08854543,  0.67763613, -0.54418674],
        [-0.39048565,  0.05843412,  0.8426222 , -0.29860414, -0.21196653]]))
In [46]:
a = np.array([[7,5,-3], [3,-5,2],[5,3,-7]])
b = np.array([16,-8,0])
x = np.linalg.solve(a, b)
x
Out[46]:
array([ 1.,  3.,  2.])
In [47]:
np.allclose(np.dot(a, x), b)
Out[47]:
True
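
The same system could also be solved by explicitly inverting a, although np.linalg.solve is the numerically preferred route; a sketch of the equivalence:

x_inv = np.linalg.inv(a).dot(b)
np.allclose(x_inv, x)   # -> True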

Pandas

Data frames

In [2]:
import pandas as pd
d =  [{'city':'Delhi',"data":1000},
      {'city':'Banglaore',"data":2000},
      {'city':'Mumbai',"data":1000}]
pd.DataFrame(d)
Out[2]:
        city  data
0      Delhi  1000
1  Banglaore  2000
2     Mumbai  1000
In [3]:
df = pd.DataFrame(d)
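
The same frame can also be built column-wise from a dict of lists, which is often more natural when the data arrives as parallel columns; a small sketch:

pd.DataFrame({'city': ['Delhi', 'Banglaore', 'Mumbai'],
              'data': [1000, 2000, 1000]})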

Reading in data

In [4]:
city_data = pd.read_csv(filepath_or_buffer='simplemaps-worldcities-basic.csv')
In [5]:
city_data.head(n=10)
Out[5]:
city city_ascii lat lng pop country iso2 iso3 province
0 Qal eh-ye Now Qal eh-ye 34.983000 63.133300 2997.0 Afghanistan AF AFG Badghis
1 Chaghcharan Chaghcharan 34.516701 65.250001 15000.0 Afghanistan AF AFG Ghor
2 Lashkar Gah Lashkar Gah 31.582998 64.360000 201546.0 Afghanistan AF AFG Hilmand
3 Zaranj Zaranj 31.112001 61.886998 49851.0 Afghanistan AF AFG Nimroz
4 Tarin Kowt Tarin Kowt 32.633298 65.866699 10000.0 Afghanistan AF AFG Uruzgan
5 Zareh Sharan Zareh Sharan 32.850000 68.416705 13737.0 Afghanistan AF AFG Paktika
6 Asadabad Asadabad 34.866000 71.150005 48400.0 Afghanistan AF AFG Kunar
7 Taloqan Taloqan 36.729999 69.540004 64256.0 Afghanistan AF AFG Takhar
8 Mahmud-E Eraqi Mahmud-E Eraqi 35.016696 69.333301 7407.0 Afghanistan AF AFG Kapisa
9 Mehtar Lam Mehtar Lam 34.650000 70.166701 17345.0 Afghanistan AF AFG Laghman
In [55]:
city_data.tail()
Out[55]:
city city_ascii lat lng pop country iso2 iso3 province
7317 Mutare Mutare -18.970019 32.650038 216785.0 Zimbabwe ZW ZWE Manicaland
7318 Kadoma Kadoma -18.330006 29.909947 56400.0 Zimbabwe ZW ZWE Mashonaland West
7319 Chitungwiza Chitungwiza -18.000001 31.100003 331071.0 Zimbabwe ZW ZWE Harare
7320 Harare Harare -17.817790 31.044709 1557406.5 Zimbabwe ZW ZWE Harare
7321 Bulawayo Bulawayo -20.169998 28.580002 697096.0 Zimbabwe ZW ZWE Bulawayo
In [56]:
series_es = city_data.lat
In [57]:
type(series_es)
Out[57]:
pandas.core.series.Series
In [58]:
series_es[1:10:2]
Out[58]:
1    34.516701
3    31.112001
5    32.850000
7    36.729999
9    34.650000
Name: lat, dtype: float64
In [59]:
series_es[:7]
Out[59]:
0    34.983000
1    34.516701
2    31.582998
3    31.112001
4    32.633298
5    32.850000
6    34.866000
Name: lat, dtype: float64
In [60]:
series_es[:-7315]
Out[60]:
0    34.983000
1    34.516701
2    31.582998
3    31.112001
4    32.633298
5    32.850000
6    34.866000
Name: lat, dtype: float64
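
Plain bracket slicing on a Series uses positions here because the index is the default RangeIndex; .loc and .iloc make the intent explicit, as sketched below (note that label slices include the end label):

series_es.iloc[0:5]   # positional: rows 0 through 4
series_es.loc[0:4]    # label-based: labels 0 through 4, end label included
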
In [61]:
city_data[:7]
Out[61]:
city city_ascii lat lng pop country iso2 iso3 province
0 Qal eh-ye Now Qal eh-ye 34.983000 63.133300 2997.0 Afghanistan AF AFG Badghis
1 Chaghcharan Chaghcharan 34.516701 65.250001 15000.0 Afghanistan AF AFG Ghor
2 Lashkar Gah Lashkar Gah 31.582998 64.360000 201546.0 Afghanistan AF AFG Hilmand
3 Zaranj Zaranj 31.112001 61.886998 49851.0 Afghanistan AF AFG Nimroz
4 Tarin Kowt Tarin Kowt 32.633298 65.866699 10000.0 Afghanistan AF AFG Uruzgan
5 Zareh Sharan Zareh Sharan 32.850000 68.416705 13737.0 Afghanistan AF AFG Paktika
6 Asadabad Asadabad 34.866000 71.150005 48400.0 Afghanistan AF AFG Kunar
In [62]:
city_data.iloc[:5,:4]
Out[62]:
city city_ascii lat lng
0 Qal eh-ye Now Qal eh-ye 34.983000 63.133300
1 Chaghcharan Chaghcharan 34.516701 65.250001
2 Lashkar Gah Lashkar Gah 31.582998 64.360000
3 Zaranj Zaranj 31.112001 61.886998
4 Tarin Kowt Tarin Kowt 32.633298 65.866699
In [63]:
city_data[city_data['pop'] > 10000000][city_data.columns[pd.Series(city_data.columns).str.startswith('l')]]
Out[63]:
lat lng
360 -34.602502 -58.397531
1171 -23.558680 -46.625020
2068 31.216452 121.436505
3098 28.669993 77.230004
3110 19.016990 72.856989
3492 35.685017 139.751407
4074 19.442442 -99.130988
4513 24.869992 66.990009
5394 55.752164 37.615523
6124 41.104996 29.010002
7071 40.749979 -73.980017
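
The chained selection above can be written more directly with .loc, combining a row mask with a column mask; a sketch of an equivalent expression (mask is just an intermediate name for readability):

mask = city_data['pop'] > 10000000
city_data.loc[mask, city_data.columns.str.startswith('l')]
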
In [64]:
city_greater_10mil = city_data[city_data['pop'] > 10000000]
city_greater_10mil.rename(columns={'pop':'population'}, inplace=True)
city_greater_10mil.where(city_greater_10mil.population > 15000000)
C:\Users\sharmatu\AppData\Local\Continuum\Anaconda\envs\Python3.5\lib\site-packages\pandas\core\frame.py:2746: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)
Out[64]:
city city_ascii lat lng population country iso2 iso3 province
360 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1171 NaN NaN NaN NaN NaN NaN NaN NaN NaN
2068 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3098 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3110 Mumbai Mumbai 19.016990 72.856989 15834918.0 India IN IND Maharashtra
3492 Tokyo Tokyo 35.685017 139.751407 22006299.5 Japan JP JPN Tokyo
4074 NaN NaN NaN NaN NaN NaN NaN NaN NaN
4513 NaN NaN NaN NaN NaN NaN NaN NaN NaN
5394 NaN NaN NaN NaN NaN NaN NaN NaN NaN
6124 NaN NaN NaN NaN NaN NaN NaN NaN NaN
7071 NaN NaN NaN NaN NaN NaN NaN NaN NaN
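
The SettingWithCopyWarning appears because city_greater_10mil is a slice of city_data, so pandas cannot tell whether the rename should propagate back to the original frame. Taking an explicit copy removes the ambiguity; a sketch of the safer pattern:

city_greater_10mil = city_data[city_data['pop'] > 10000000].copy()
city_greater_10mil.rename(columns={'pop': 'population'}, inplace=True)
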
In [65]:
df = pd.DataFrame(np.random.randn(8, 3),
columns=['A', 'B', 'C'])

Operations on dataframes

In [66]:
nparray = df.values
type(nparray)
Out[66]:
numpy.ndarray
In [67]:
from numpy import nan
df.iloc[4,2] = nan
In [68]:
df
Out[68]:
A B C
0 -1.279701 -0.074395 -1.370447
1 1.536038 0.060453 0.856685
2 0.475407 1.029245 -0.420355
3 -1.636635 -0.385956 -0.261129
4 1.259545 1.916660 NaN
5 1.591468 0.813209 0.605695
6 -1.270361 0.200358 0.035595
7 -0.189060 -1.874718 -1.088224
In [69]:
df.fillna(0)
Out[69]:
A B C
0 -1.279701 -0.074395 -1.370447
1 1.536038 0.060453 0.856685
2 0.475407 1.029245 -0.420355
3 -1.636635 -0.385956 -0.261129
4 1.259545 1.916660 0.000000
5 1.591468 0.813209 0.605695
6 -1.270361 0.200358 0.035595
7 -0.189060 -1.874718 -1.088224
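
fillna is one option for missing values; dropping the affected rows or imputing with a column statistic are common alternatives, sketched here:

df.dropna()            # drop any row containing a NaN
df.fillna(df.mean())   # impute NaNs with the per-column mean
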
In [70]:
columns_numeric = ['lat','lng','pop']
In [71]:
city_data[columns_numeric].mean()
Out[71]:
lat        20.662876
lng        10.711914
pop    265463.071633
dtype: float64
In [72]:
city_data[columns_numeric].sum()
Out[72]:
lat    1.512936e+05
lng    7.843263e+04
pop    1.943721e+09
dtype: float64
In [73]:
city_data[columns_numeric].count()
Out[73]:
lat    7322
lng    7322
pop    7322
dtype: int64
In [74]:
city_data[columns_numeric].median()
Out[74]:
lat       26.792730
lng       18.617509
pop    61322.750000
dtype: float64
In [75]:
city_data[columns_numeric].quantile(0.8)
Out[75]:
lat        46.852480
lng        89.900018
pop    269210.000000
Name: 0.8, dtype: float64
In [76]:
city_data[columns_numeric].sum(axis = 1).head()
Out[76]:
0      3095.116300
1     15099.766702
2    201641.942998
3     49943.998999
4     10098.499997
dtype: float64
In [77]:
city_data[columns_numeric].describe()
Out[77]:
lat lng pop
count 7322.000000 7322.000000 7.322000e+03
mean 20.662876 10.711914 2.654631e+05
std 29.134818 79.044615 8.287622e+05
min -89.982894 -179.589979 -9.900000e+01
25% -0.324710 -64.788472 1.734425e+04
50% 26.792730 18.617509 6.132275e+04
75% 43.575448 73.103628 2.001726e+05
max 82.483323 179.383304 2.200630e+07
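
Several of these statistics can be computed in one call with .agg, which accepts a list of function names (assuming a reasonably recent pandas); a small sketch:

city_data[columns_numeric].agg(['mean', 'median', 'min', 'max'])
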
In [78]:
city_data1 = city_data.sample(3)

Concatenating data frames

In [79]:
city_data2 = city_data.sample(3)
city_data_combine = pd.concat([city_data1,city_data2])
city_data_combine
Out[79]:
city city_ascii lat lng pop country iso2 iso3 province
4857 Shebekino Shebekino 50.414350 36.894378 41301.5 Russia RU RUS Belgorod
1561 Bouar Bouar 5.950010 15.599967 31476.5 Central African Republic CF CAF Nana-Mambéré
6650 Scottsbluff Scottsbluff 41.867508 -103.660686 20172.0 United States of America US USA Nebraska
964 Janauba Janauba -15.799618 -43.309977 38641.0 Brazil BR BRA Minas Gerais
3896 Altata Altata 24.636045 -107.916215 750.0 Mexico MX MEX Sinaloa
7201 Tra Vinh Tra Vinh 9.934002 106.334002 131360.0 Vietnam VN VNM Trà Vinh
In [80]:
df1 = pd.DataFrame({'col1': ['col10', 'col11', 'col12', 'col13'],
                    'col2': ['col20', 'col21', 'col22', 'col23'],
                    'col3': ['col30', 'col31', 'col32', 'col33'],
                    'col4': ['col40', 'col41', 'col42', 'col43']},
                   index=[0, 1, 2, 3])
In [81]:
df1
Out[81]:
    col1   col2   col3   col4
0  col10  col20  col30  col40
1  col11  col21  col31  col41
2  col12  col22  col32  col42
3  col13  col23  col33  col43
In [82]:
df4 = pd.DataFrame({'col2': ['col22', 'col23', 'col26', 'col27'],
                    'Col4': ['Col42', 'Col43', 'Col46', 'Col47'],
                    'col6': ['col62', 'col63', 'col66', 'col67']},
                   index=[2, 3, 6, 7])

pd.concat([df1,df4], axis=1)
Out[82]:
    col1   col2   col3   col4   Col4   col2   col6
0  col10  col20  col30  col40    NaN    NaN    NaN
1  col11  col21  col31  col41    NaN    NaN    NaN
2  col12  col22  col32  col42  Col42  col22  col62
3  col13  col23  col33  col43  Col43  col23  col63
6    NaN    NaN    NaN    NaN  Col46  col26  col66
7    NaN    NaN    NaN    NaN  Col47  col27  col67
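
By default pd.concat with axis=1 performs an outer join on the index, which is why the NaN rows appear; passing join='inner' keeps only the shared index labels, as sketched:

pd.concat([df1, df4], axis=1, join='inner')   # only index labels 2 and 3 survive
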
In [83]:
country_data = city_data[['iso3','country']].drop_duplicates()
In [84]:
country_data.shape
Out[84]:
(223, 2)
In [85]:
country_data.head()
Out[85]:
     iso3         country
  0   AFG     Afghanistan
 33   ALD           Aland
 34   ALB         Albania
 60   DZA         Algeria
111   ASM  American Samoa
In [86]:
del(city_data['country'])
In [87]:
city_data.merge(country_data, 'inner').head()
Out[87]:
city city_ascii lat lng pop iso2 iso3 province country
0 Qal eh-ye Now Qal eh-ye 34.983000 63.133300 2997.0 AF AFG Badghis Afghanistan
1 Chaghcharan Chaghcharan 34.516701 65.250001 15000.0 AF AFG Ghor Afghanistan
2 Lashkar Gah Lashkar Gah 31.582998 64.360000 201546.0 AF AFG Hilmand Afghanistan
3 Zaranj Zaranj 31.112001 61.886998 49851.0 AF AFG Nimroz Afghanistan
4 Tarin Kowt Tarin Kowt 32.633298 65.866699 10000.0 AF AFG Uruzgan Afghanistan
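
The merge above relies on pandas detecting the common column automatically; spelling out the join key and join type makes the intent explicit, as in this sketch:

city_data.merge(country_data, how='inner', on='iso3').head()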

Scikit-learn

In [94]:
from sklearn import datasets
diabetes = datasets.load_diabetes()
X = diabetes.data[:10]
y = diabetes.target
In [95]:
X[:5]
Out[95]:
array([[ 0.03807591,  0.05068012,  0.06169621,  0.02187235, -0.0442235 ,
        -0.03482076, -0.04340085, -0.00259226,  0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, -0.02632783, -0.00844872,
        -0.01916334,  0.07441156, -0.03949338, -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, -0.00567061, -0.04559945,
        -0.03419447, -0.03235593, -0.00259226,  0.00286377, -0.02593034],
       [-0.08906294, -0.04464164, -0.01159501, -0.03665645,  0.01219057,
         0.02499059, -0.03603757,  0.03430886,  0.02269202, -0.00936191],
       [ 0.00538306, -0.04464164, -0.03638469,  0.02187235,  0.00393485,
         0.01559614,  0.00814208, -0.00259226, -0.03199144, -0.04664087]])
In [96]:
y[:10]
Out[96]:
array([ 151.,   75.,  141.,  206.,  135.,   97.,  138.,   63.,  110.,  310.])
In [97]:
feature_names=['age', 'sex', 'bmi', 'bp',
               's1', 's2', 's3', 's4', 's5', 's6']

Scikit-learn example: regression

In [98]:
from sklearn import datasets
from sklearn.linear_model import Lasso

from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV

diabetes = datasets.load_diabetes()
X_train = diabetes.data[:310]
y_train = diabetes.target[:310]

X_test = diabetes.data[310:]
y_test = diabetes.target[310:]

lasso = Lasso(random_state=0)
alphas = np.logspace(-4, -0.5, 30)

scores = list()
scores_std = list()

estimator = GridSearchCV(lasso,
                         param_grid = dict(alpha=alphas))

estimator.fit(X_train, y_train)
Out[98]:
GridSearchCV(cv=None, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=0,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e-04,   1.32035e-04,   1.74333e-04,   2.30181e-04,
         3.03920e-04,   4.01281e-04,   5.29832e-04,   6.99564e-04,
         9.23671e-04,   1.21957e-03,   1.61026e-03,   2.12611e-03,
         2.80722e-03,   3.70651e-03,   4.89390e-03,   6.46167e-03,
         8....    7.88046e-02,   1.04050e-01,   1.37382e-01,   1.81393e-01,
         2.39503e-01,   3.16228e-01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
In [99]:
estimator.best_score_
Out[99]:
0.46540637590235312
In [100]:
estimator.best_estimator_
Out[100]:
Lasso(alpha=0.025929437974046669, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=False, positive=False, precompute=False,
   random_state=0, selection='cyclic', tol=0.0001, warm_start=False)
In [101]:
estimator.predict(X_test)
Out[101]:
array([ 203.42104984,  177.6595529 ,  122.62188598,  212.81136958,
        173.61633075,  114.76145025,  202.36033584,  171.70767813,
        164.28694562,  191.29091477,  191.41279009,  288.2772433 ,
        296.47009002,  234.53378413,  210.61427168,  228.62812055,
        156.74489991,  225.08834492,  191.75874632,  102.81600989,
        172.373221  ,  111.20843429,  290.22242876,  178.64605207,
         78.13722832,   86.35832297,  256.41378529,  165.99622543,
        121.29260976,  153.48718848,  163.09835143,  180.0932902 ,
        161.4330553 ,  155.80211635,  143.70181085,  126.13753819,
        181.06471818,  105.03679977,  131.0479936 ,   90.50606427,
        252.66486639,   84.84786067,   59.41005358,  184.51368208,
        201.46598714,  129.96333913,   90.65641478,  200.10932516,
         55.2884802 ,  171.60459062,  195.40750666,  122.14139787,
        231.72783897,  159.49750022,  160.32104862,  165.53701866,
        260.73217736,  259.77213787,  204.69526082,  185.66480969,
         61.09821961,  209.9214333 ,  108.50410841,  141.18424239,
        126.10337002,  174.32819351,  214.4947322 ,  162.1789921 ,
        160.57776438,  134.11449594,  171.63076427,   71.71500885,
        263.46782314,  113.73653782,  112.76227977,  134.37721414,
        110.67874472,   98.67153573,  157.2591359 ,   78.32019218,
        265.97090212,   57.85502185,  100.38532691,  101.91670102,
        277.13032245,  168.6443445 ,   64.75637937,  184.37359745,
        174.74927914,  188.78215433,  181.56001383,   92.74463449,
        145.41037529,  257.78620944,  196.57335354,  276.1920927 ,
         50.66776115,  179.12879963,  200.29366671,  167.29501922,
        158.93206689,  156.08070427,  233.38241229,  125.30241353,
        167.05404644,  171.66748431,  223.17843095,  156.7055944 ,
        103.29063169,   84.08205647,  139.87060658,  189.99648341,
        200.20182211,  143.61906164,  170.00220231,  112.05886847,
        160.76337573,  130.06232976,  261.83022688,  102.24589129,
        115.12771477,  119.14505163,  225.96991263,   63.51874043,
        134.88829709,  120.01764214,   55.32147904,  189.95346987,
        105.8037979 ,  120.46197038,  211.35568232,   56.78368048])
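
To quantify how well the tuned model generalizes, the held-out predictions can be scored against y_test; a sketch (the exact numbers depend on the run):

from sklearn.metrics import mean_squared_error, r2_score

preds = estimator.predict(X_test)
print('Test R^2:', r2_score(y_test, preds))
print('Test MSE:', mean_squared_error(y_test, preds))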

Deep Learning Frameworks

Theano example

In [1]:
import numpy
import theano.tensor as T
from theano import function
x = T.dscalar('x')
y = T.dscalar('y')
z = x + y
In [2]:
f = function([x, y], z)
f(8, 2)
Out[2]:
array(10.0)
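
The same symbolic pattern extends beyond scalars; a small sketch with matrix inputs, reusing the imports above:

a = T.dmatrix('a')
b = T.dmatrix('b')
add = function([a, b], a + b)
add([[1, 2]], [[3, 4]])   # -> array([[ 4.,  6.]])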

TensorFlow example

In [102]:
import tensorflow as tf
hello = tf.constant('Hello, TensorFlow!')
sess = tf.Session()
print(sess.run(hello))
b'Hello, TensorFlow!'
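
The same TF 1.x session pattern runs arbitrary computation graphs; a minimal sketch using the already-open session:

a = tf.constant(5.0)
b = tf.constant(3.0)
print(sess.run(a * b))   # 15.0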

Building a neural network model with Keras

In [103]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

X_train = cancer.data[:340]
y_train = cancer.target[:340]

X_test = cancer.data[340:]
y_test = cancer.target[340:]

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
Using TensorFlow backend.
In [150]:
model = Sequential()
model.add(Dense(15, input_dim=30, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
In [151]:
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
In [152]:
model.fit(X_train, y_train,
          epochs=20,
          batch_size=50)
Epoch 1/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 2/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 3/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 4/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 5/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 6/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 7/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 8/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 9/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 10/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 11/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 12/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 13/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 14/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 15/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 16/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 17/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 18/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 19/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Epoch 20/20
340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382     
Out[152]:
<keras.callbacks.History at 0x1d49ea58be0>
In [153]:
predictions = model.predict_classes(X_test)
 32/229 [===>..........................] - ETA: 1s
In [154]:
from sklearn import metrics

print('Accuracy:', metrics.accuracy_score(y_true=y_test, y_pred=predictions))
print(metrics.classification_report(y_true=y_test, y_pred=predictions))
Accuracy: 0.759825327511
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        55
          1       0.76      1.00      0.86       174

avg / total       0.58      0.76      0.66       229

C:\Program Files\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1113: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
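
The UndefinedMetricWarning signals that class 0 was never predicted: the shallow network collapsed to always predicting the majority class. Standardizing the raw features, which span very different ranges in this dataset, usually helps training converge; a sketch of that preprocessing step, not part of the original run:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)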

The power of deep learning models

In [155]:
model = Sequential()
model.add(Dense(15, input_dim=30, activation='relu'))
model.add(Dense(15, activation='relu'))
model.add(Dense(15, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          epochs=20,
          batch_size=50)
Epoch 1/20
340/340 [==============================] - 0s - loss: 3.3799 - acc: 0.3941     
Epoch 2/20
340/340 [==============================] - 0s - loss: 1.3740 - acc: 0.6059     
Epoch 3/20
340/340 [==============================] - 0s - loss: 0.4258 - acc: 0.8471     
Epoch 4/20
340/340 [==============================] - 0s - loss: 0.2859 - acc: 0.8912     
Epoch 5/20
340/340 [==============================] - 0s - loss: 0.2061 - acc: 0.9206     
Epoch 6/20
340/340 [==============================] - 0s - loss: 0.2407 - acc: 0.8941     
Epoch 7/20
340/340 [==============================] - 0s - loss: 0.2725 - acc: 0.9118     
Epoch 8/20
340/340 [==============================] - 0s - loss: 0.5237 - acc: 0.8676     
Epoch 9/20
340/340 [==============================] - 0s - loss: 0.2165 - acc: 0.9324     
Epoch 10/20
340/340 [==============================] - 0s - loss: 0.2502 - acc: 0.9029     
Epoch 11/20
340/340 [==============================] - 0s - loss: 0.3235 - acc: 0.8853     
Epoch 12/20
340/340 [==============================] - 0s - loss: 0.3115 - acc: 0.8912     
Epoch 13/20
340/340 [==============================] - 0s - loss: 0.2975 - acc: 0.9059     
Epoch 14/20
340/340 [==============================] - 0s - loss: 0.3426 - acc: 0.9118     
Epoch 15/20
340/340 [==============================] - 0s - loss: 0.3763 - acc: 0.9176     
Epoch 16/20
340/340 [==============================] - 0s - loss: 0.2420 - acc: 0.9088     
Epoch 17/20
340/340 [==============================] - 0s - loss: 0.4274 - acc: 0.8618     
Epoch 18/20
340/340 [==============================] - 0s - loss: 0.1885 - acc: 0.9353     
Epoch 19/20
340/340 [==============================] - 0s - loss: 0.2361 - acc: 0.9235     
Epoch 20/20
340/340 [==============================] - 0s - loss: 0.3154 - acc: 0.9000     
Out[155]:
<keras.callbacks.History at 0x1d49ee45908>
In [156]:
predictions = model.predict_classes(X_test)
 32/229 [===>..........................] - ETA: 1s
In [157]:
print('Accuracy:', metrics.accuracy_score(y_true=y_test, y_pred=predictions))
print(metrics.classification_report(y_true=y_test, y_pred=predictions))
Accuracy: 0.912663755459
             precision    recall  f1-score   support

          0       0.78      0.89      0.83        55
          1       0.96      0.92      0.94       174

avg / total       0.92      0.91      0.91       229
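
The same comparison can also be made directly from Keras with model.evaluate, which returns the loss and any compiled metrics on held-out data; a sketch:

loss, acc = model.evaluate(X_test, y_test, batch_size=50)
print('Test accuracy:', acc)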