In [1]:
cd C:\Users\tk\Desktop
C:\Users\tk\Desktop
In [2]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [3]:
import brewer2mpl
from matplotlib import rcParams

# ColorBrewer "Dark2" qualitative palette (7 requested) and its mpl_colors list
dark2_cmap = brewer2mpl.get_map('Dark2', 'Qualitative', 7)
dark2_colors = dark2_cmap.mpl_colors

# figure-level defaults
rcParams['figure.dpi'] = 150
rcParams['figure.figsize'] = (10, 6)

# axes and line defaults
# NOTE(review): newer matplotlib releases use 'axes.prop_cycle' instead of
# 'axes.color_cycle' -- confirm against the matplotlib version in use.
rcParams['axes.facecolor'] = 'white'
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2

# patch (bars, histogram rectangles) defaults
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['patch.edgecolor'] = 'white'

# text defaults
rcParams['font.family'] = 'StixGeneral'
rcParams['font.size'] = 14


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks.

    Parameters
    ----------
    axes : matplotlib Axes, optional
        Axes to modify. When None, the current axes (``plt.gca()``) is used.
    top, right, left, bottom : bool
        Toggle whether the corresponding plot border (spine) is drawn. Tick
        marks are kept only on the sides whose border is drawn.
    """
    ax = axes if axes is not None else plt.gca()

    borders = (('top', top), ('right', right), ('left', left), ('bottom', bottom))
    for side, visible in borders:
        ax.spines[side].set_visible(visible)

    # turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # now re-enable ticks on the visible sides.
    # tick_top()/tick_bottom() (and tick_left()/tick_right()) each move the
    # ticks to a single side, so calling two of them in a row keeps only the
    # last one; when both sides are requested, ask for 'both' explicitly.
    if top and bottom:
        ax.xaxis.set_ticks_position('both')
    elif top:
        ax.xaxis.tick_top()
    elif bottom:
        ax.xaxis.tick_bottom()

    if left and right:
        ax.yaxis.set_ticks_position('both')
    elif left:
        ax.yaxis.tick_left()
    elif right:
        ax.yaxis.tick_right()

"The Olive Oils data has eight explanatory variables (levels of fatty acids in the oils) and nine classes (areas of Italy). The content of the oils is a subject of study in its own right: Olive oil has high nutritional value, and some of its constituent fatty acids are considered to be more beneficial than others."

In [32]:
# Raise pandas' display limits (line width and number of columns shown) so wide frames print in full.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
In [5]:
# Read 'olive.csv' (path is relative to the current working directory) and preview the first five rows.
olive_oil = pd.read_csv('olive.csv') 
olive_oil.head(5)
Out[5]:
Unnamed: 0 region area palmitic palmitoleic stearic oleic linoleic linolenic arachidic eicosenoic
0 1.North-Apulia 1 1 1075 75 226 7823 672 36 60 29
1 2.North-Apulia 1 1 1088 73 224 7709 781 31 61 29
2 3.North-Apulia 1 1 911 54 246 8113 549 31 63 29
3 4.North-Apulia 1 1 966 57 240 7952 619 50 78 35
4 5.North-Apulia 1 1 1051 67 259 7771 672 50 80 46
In [6]:
# (number of rows, number of columns) of the loaded frame
olive_oil.shape
Out[6]:
(572, 11)
In [7]:
# The first column has no header in the CSV (pandas shows it as 'Unnamed: 0'); give it an explicit name.
olive_oil = olive_oil.rename(columns={olive_oil.columns[0]: 'area_Idili'})
olive_oil.head(5)
Out[7]:
area_Idili region area palmitic palmitoleic stearic oleic linoleic linolenic arachidic eicosenoic
0 1.North-Apulia 1 1 1075 75 226 7823 672 36 60 29
1 2.North-Apulia 1 1 1088 73 224 7709 781 31 61 29
2 3.North-Apulia 1 1 911 54 246 8113 549 31 63 29
3 4.North-Apulia 1 1 966 57 240 7952 619 50 78 35
4 5.North-Apulia 1 1 1051 67 259 7771 672 50 80 46
In [14]:
# Show the column names one per row, together with their positional index.
pd.DataFrame(olive_oil.columns)
Out[14]:
0
0 area_Idili
1 region
2 area
3 palmitic
4 palmitoleic
5 stearic
6 oleic
7 linoleic
8 linolenic
9 arachidic
10 eicosenoic
In [11]:
# Collect the distinct codes that occur in the 'region' and 'area' columns.
unique_in_region = olive_oil.region.unique()
unique_in_area = olive_oil.area.unique()
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
print(unique_in_region)
print(unique_in_area)
[1 2 3]
[1 2 3 4 5 6 9 7 8]
In [13]:
# Count samples for every (area, region) pair: rows are areas, columns are regions.
pd.crosstab(olive_oil.area, olive_oil.region) 
Out[13]:
region 1 2 3
area
1 25 0 0
2 56 0 0
3 206 0 0
4 36 0 0
5 0 65 0
6 0 33 0
7 0 0 50
8 0 0 50
9 0 0 51
In [14]:
# Preview the first five rows again.
olive_oil.head(5)
Out[14]:
area_Idili region area palmitic palmitoleic stearic oleic linoleic linolenic arachidic eicosenoic
0 1.North-Apulia 1 1 1075 75 226 7823 672 36 60 29
1 2.North-Apulia 1 1 1088 73 224 7709 781 31 61 29
2 3.North-Apulia 1 1 911 54 246 8113 549 31 63 29
3 4.North-Apulia 1 1 966 57 240 7952 619 50 78 35
4 5.North-Apulia 1 1 1051 67 259 7771 672 50 80 46
In [15]:
# Keep only the text after the last '.' in each label (e.g. '1.North-Apulia' -> 'North-Apulia');
# a label without a '.' is left unchanged.
olive_oil.area_Idili = olive_oil['area_Idili'].map(lambda label: label.rpartition('.')[2])
olive_oil.head()
Out[15]:
area_Idili region area palmitic palmitoleic stearic oleic linoleic linolenic arachidic eicosenoic
0 North-Apulia 1 1 1075 75 226 7823 672 36 60 29
1 North-Apulia 1 1 1088 73 224 7709 781 31 61 29
2 North-Apulia 1 1 911 54 246 8113 549 31 63 29
3 North-Apulia 1 1 966 57 240 7952 619 50 78 35
4 North-Apulia 1 1 1051 67 259 7771 672 50 80 46
In [16]:
# How the split function works 
x = '1.northapulia'
y = x.split('.')
print y
z = x.split('.')[-1] #-1 returns the last element of the list
z
['1', 'northapulia']
Out[16]:
'northapulia'
In [17]:
olive_oil[['palmitic', 'palmitoleic']].head(5) # indexing with a list of column names selects that subset of columns as a DataFrame (http://bit.ly/1sPHf1u)
Out[17]:
palmitic palmitoleic
0 1075 75
1 1088 73
2 911 54
3 966 57
4 1051 67
In [18]:
# Indexing with a single column name (no inner list) returns that one column.
olive_oil['palmitic']
Out[18]:
0     1075
1     1088
2      911
3      966
4     1051
5      911
6      922
7     1100
8     1082
9     1037
10    1051
11    1036
12    1074
13     875
14     952
...
557    1010
558    1020
559    1120
560    1090
561    1100
562    1090
563    1150
564    1110
565    1010
566    1070
567    1280
568    1060
569    1010
570     990
571     960
Name: palmitic, Length: 572, dtype: int64
In [19]:
# Compare what double-bracket and single-bracket column selection return.
# The label and the type are joined into one string so that print(...) produces the same
# line on Python 2 and Python 3 (the original print statement is a SyntaxError on Python 3).
print(" the type of olive_oil[['palmitic']]: \t" + str(type(olive_oil[['palmitic']])))
print(" the type of olive_oil['palmitic']: \t" + str(type(olive_oil['palmitic'])))
 the type of olive_oil[['palmitic']]: 	<class 'pandas.core.frame.DataFrame'>
 the type of olive_oil['palmitic']: 	<class 'pandas.core.series.Series'>
In [20]:
olive_oil.palmitic # attribute access is a convenient shorthand for selecting a single column
Out[20]:
0     1075
1     1088
2      911
3      966
4     1051
5      911
6      922
7     1100
8     1082
9     1037
10    1051
11    1036
12    1074
13     875
14     952
...
557    1010
558    1020
559    1120
560    1090
561    1100
562    1090
563    1150
564    1110
565    1010
566    1070
567    1280
568    1060
569    1010
570     990
571     960
Name: palmitic, Length: 572, dtype: int64

`map` takes a pandas Series (essentially a list of values) and maps each value in it to something new. Here we want to transform a DataFrame (a set of such lists, one per column) instead; for a DataFrame you should use `apply`.

In [15]:
# Names of the eight fatty-acid measurement columns.
list_of_acids = [
    'palmitic', 'palmitoleic', 'stearic', 'oleic',
    'linoleic', 'linolenic', 'arachidic', 'eicosenoic',
]
# apply() hands each selected column to the function in turn; every column is divided by 100.
df = olive_oil[list_of_acids].apply(lambda column: column / 100.0)
df.head(5)
Out[15]:
palmitic palmitoleic stearic oleic linoleic linolenic arachidic eicosenoic
0 10.75 0.75 2.26 78.23 6.72 0.36 0.60 0.29
1 10.88 0.73 2.24 77.09 7.81 0.31 0.61 0.29
2 9.11 0.54 2.46 81.13 5.49 0.31 0.63 0.29
3 9.66 0.57 2.40 79.52 6.19 0.50 0.78 0.35
4 10.51 0.67 2.59 77.71 6.72 0.50 0.80 0.46
In [22]:
# Overwrite the acid columns of olive_oil with the scaled values held in df.
# NOTE: df was computed from these same columns, so re-running the scaling cell
# after this one would divide the values by 100 a second time.
olive_oil[list_of_acids] =df # replace the acid columns in olive_oil with the scaled values
olive_oil.head(5)
Out[22]:
area_Idili region area palmitic palmitoleic stearic oleic linoleic linolenic arachidic eicosenoic
0 North-Apulia 1 1 10.75 0.75 2.26 78.23 6.72 0.36 0.60 0.29
1 North-Apulia 1 1 10.88 0.73 2.24 77.09 7.81 0.31 0.61 0.29
2 North-Apulia 1 1 9.11 0.54 2.46 81.13 5.49 0.31 0.63 0.29
3 North-Apulia 1 1 9.66 0.57 2.40 79.52 6.19 0.50 0.78 0.35
4 North-Apulia 1 1 10.51 0.67 2.59 77.71 6.72 0.50 0.80 0.46
In [19]:
# Histogram of the palmitic column using matplotlib's default binning.
plt.hist(olive_oil.palmitic)
Out[19]:
(array([   1.,    0.,   11.,   71.,  188.,   79.,  131.,   73.,    9.,    9.]),
 array([  6.1  ,   7.243,   8.386,   9.529,  10.672,  11.815,  12.958,
         14.101,  15.244,  16.387,  17.53 ]),
 <a list of 10 Patch objects>)
In [23]:
# 2x2 grid showing the same two columns four different ways.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
axes[0, 0].plot(olive_oil['palmitic'], olive_oil['linolenic'])       # default line plot
axes[0, 1].plot(olive_oil['palmitic'], olive_oil['linolenic'], '.')  # same data drawn as point markers
axes[1, 0].scatter(olive_oil['palmitic'], olive_oil['linolenic'])    # scatter plot
axes[1, 1].hist(olive_oil['palmitic'])                               # distribution of palmitic alone
fig.tight_layout()
In [20]:
# Group the samples by region, compute summary statistics per group,
# and show the first 20 rows of the resulting table.
region_groupby = olive_oil.groupby('region')
grp_reg=region_groupby.describe()
grp_reg.head(20)
Out[20]:
arachidic area eicosenoic linoleic linolenic oleic palmitic palmitoleic stearic
region
1 count 323.000000 323.000000 323.000000 323.000000 323.000000 323.000000 323.000000 323.000000 323.000000
mean 0.631176 2.783282 0.273220 10.334985 0.380650 71.000093 13.322879 1.548019 2.287740
std 0.111644 0.741054 0.083915 2.106730 0.079727 3.451431 1.529349 0.507237 0.398709
min 0.320000 1.000000 0.100000 4.480000 0.200000 63.000000 8.750000 0.350000 1.520000
25% 0.560000 2.500000 0.220000 8.555000 0.320000 68.830000 12.680000 1.215000 2.015000
50% 0.620000 3.000000 0.270000 10.900000 0.370000 70.300000 13.460000 1.630000 2.230000
75% 0.690000 3.000000 0.320000 12.025000 0.440000 72.835000 14.190000 1.850000 2.495000
max 1.020000 4.000000 0.580000 14.620000 0.740000 81.130000 17.530000 2.800000 3.750000
2 count 98.000000 98.000000 98.000000 98.000000 98.000000 98.000000 98.000000 98.000000 98.000000
mean 0.731735 5.336735 0.019388 11.965306 0.270918 72.680204 11.113469 0.967449 2.261837
std 0.118826 0.475023 0.007436 1.072336 0.053844 1.418783 0.404111 0.138514 0.176363
min 0.450000 5.000000 0.010000 10.570000 0.150000 68.820000 10.300000 0.350000 1.990000
25% 0.660000 5.000000 0.010000 11.122500 0.230000 71.372500 10.852500 0.882500 2.120000
50% 0.720000 5.000000 0.020000 11.465000 0.270000 73.255000 11.075000 0.960000 2.220000
75% 0.810000 6.000000 0.020000 13.065000 0.300000 73.810000 11.372500 1.040000 2.395000
max 1.050000 6.000000 0.030000 14.700000 0.430000 74.390000 12.130000 1.350000 2.720000
3 count 151.000000 151.000000 151.000000 151.000000 151.000000 151.000000 151.000000 151.000000 151.000000
mean 0.375762 8.006623 0.019735 7.270331 0.217881 77.930530 10.948013 0.837351 2.308013
std 0.293586 0.820542 0.007298 1.431226 0.168865 1.648155 0.825635 0.264388 0.389560
min 0.000000 7.000000 0.010000 5.100000 0.000000 73.400000 6.100000 0.150000 1.700000
In [21]:
# Per-region standard deviation of each column.
olstd = olive_oil.groupby('region').std()
olstd
Out[21]:
area palmitic palmitoleic stearic oleic linoleic linolenic arachidic eicosenoic
region
1 0.741054 1.529349 0.507237 0.398709 3.451431 2.106730 0.079727 0.111644 0.083915
2 0.475023 0.404111 0.138514 0.176363 1.418783 1.072336 0.053844 0.118826 0.007436
3 0.820542 0.825635 0.264388 0.389560 1.648155 1.431226 0.168865 0.293586 0.007298
In [22]:
# Per-region mean of each column. The aggregation is requested by name ('mean')
# rather than by passing the np.mean callable, which recent pandas versions deprecate
# for groupby aggregation; the computed values are the same.
olmean = region_groupby.aggregate('mean')
olmean.head()
Out[22]:
area palmitic palmitoleic stearic oleic linoleic linolenic arachidic eicosenoic
region
1 2.783282 13.322879 1.548019 2.287740 71.000093 10.334985 0.380650 0.631176 0.273220
2 5.336735 11.113469 0.967449 2.261837 72.680204 11.965306 0.270918 0.731735 0.019388
3 8.006623 10.948013 0.837351 2.308013 77.930530 7.270331 0.217881 0.375762 0.019735
In [23]:
# Map every acid column name to a suffixed name so the two summary tables
# can later be combined without column-name clashes.
renamedict_std = dict((acid, acid + "_std") for acid in list_of_acids)
renamedict_mean = dict((acid, acid + "_mean") for acid in list_of_acids)
olstd = olstd.rename(columns=renamedict_std)
olmean = olmean.rename(columns=renamedict_mean)
olstd.head()
Out[23]:
area palmitic_std palmitoleic_std stearic_std oleic_std linoleic_std linolenic_std arachidic_std eicosenoic_std
region
1 0.741054 1.529349 0.507237 0.398709 3.451431 2.106730 0.079727 0.111644 0.083915
2 0.475023 0.404111 0.138514 0.176363 1.418783 1.072336 0.053844 0.118826 0.007436
3 0.820542 0.825635 0.264388 0.389560 1.648155 1.431226 0.168865 0.293586 0.007298
In [24]:
# Double brackets keep each selection as a one-column DataFrame, which is what join() needs.
olpalmiticmean = olmean[['palmitic_mean']]
olpalmiticstd = olstd[['palmitic_std']]
# Both frames are indexed by region, so the index-based left join lines them up row by row.
newolbyregion = olpalmiticmean.join(olpalmiticstd, how='left')
newolbyregion
Out[24]:
palmitic_mean palmitic_std
region
1 13.322879 1.529349
2 11.113469 0.404111
3 10.948013 0.825635
In [30]:
# Boolean Series: True for every sample whose eicosenoic value is below 0.05.
eico = olive_oil['eicosenoic'] < 0.05
eico
Out[30]:
0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
...
557    True
558    True
559    True
560    True
561    True
562    True
563    True
564    True
565    True
566    True
567    True
568    True
569    True
570    True
571    True
Name: eicosenoic, Length: 572, dtype: bool
In [21]:
# Small demo frame in which two of the columns each contain one missing entry.
new_data = pd.DataFrame({
    'Bigdata': [12, 34, 99, 45, 13],
    'Examiner': [0.9, 0.8, 0.7, 0.6, None],
    'Data science': ['L', 'M', None, 'c', 'a'],
})
new_data
Out[21]:
Bigdata Data science Examiner
0 12 L 0.9
1 34 M 0.8
2 99 None 0.7
3 45 c 0.6
4 13 a NaN
In [22]:
# Show new_data without the rows that contain a missing value; the result is not assigned,
# so new_data itself keeps all of its rows.
new_data.dropna()
Out[22]:
Bigdata Data science Examiner
0 12 L 0.9
1 34 M 0.8
3 45 c 0.6
In [30]:
# One-column demo frame; the two None entries show up as NaN.
data = pd.DataFrame(
    [1.0, None, 3.5, None, 7]
)
data
Out[30]:
0
0 1.0
1 NaN
2 3.5
3 NaN
4 7.0
In [31]:
# Column mean computed over the non-missing values only ((1 + 3.5 + 7) / 3 here).
mean = data.mean()
# Display a copy of data with each NaN replaced by that mean; data itself is not reassigned.
data.fillna(mean)
Out[31]:
0
0 1.000000
1 3.833333
2 3.500000
3 3.833333
4 7.000000