import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and newer releases no longer ship the plain 'seaborn' name, so prefer the
# new name when it is available and fall back to the legacy one otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
import time
# Load the dataset from a CSV file expected in the working directory.
data=pd.read_csv("data2.csv")
# Notebook display: preview the first rows (has no effect when run as a script).
data.head()
id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | ... | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | ... | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | ... | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | ... | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | ... | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
5 rows × 32 columns
# Keep the column labels in `col` and print them for reference.
col=data.columns
print(col)
Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst'], dtype='object')
# Target: the diagnosis label column.
y = data["diagnosis"]
# Features: every column except the record id and the label itself.
x = data.drop(columns=["id", "diagnosis"])
# Notebook display: preview the feature frame.
x.head()
radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | ... | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | ... | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | 0.05999 | ... | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | 0.09744 | ... | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | 0.05883 | ... | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
5 rows × 30 columns
# Bar chart of class frequencies. The series is passed by keyword because
# newer seaborn releases only accept the plotted variable as `x=` / `y=`.
ax = sns.countplot(x=y, label="Count")
# Look the counts up by label rather than unpacking by position:
# value_counts() orders by frequency, so positional unpacking would silently
# swap the two numbers if malignant cases ever outnumbered benign ones.
class_counts = y.value_counts()
B = class_counts.get("B", 0)
M = class_counts.get("M", 0)
print("No. of Benign Tumors", B)
print("No. of Malignant Tumors", M)
No. of Benign Tumors 357 No. of Malignant Tumors 212
# Notebook display: per-feature summary statistics (count, mean, std, min, quartiles, max).
x.describe()
radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | ... | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 |
mean | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.096360 | 0.104341 | 0.088799 | 0.048919 | 0.181162 | 0.062798 | ... | 16.269190 | 25.677223 | 107.261213 | 880.583128 | 0.132369 | 0.254265 | 0.272188 | 0.114606 | 0.290076 | 0.083946 |
std | 3.524049 | 4.301036 | 24.298981 | 351.914129 | 0.014064 | 0.052813 | 0.079720 | 0.038803 | 0.027414 | 0.007060 | ... | 4.833242 | 6.146258 | 33.602542 | 569.356993 | 0.022832 | 0.157336 | 0.208624 | 0.065732 | 0.061867 | 0.018061 |
min | 6.981000 | 9.710000 | 43.790000 | 143.500000 | 0.052630 | 0.019380 | 0.000000 | 0.000000 | 0.106000 | 0.049960 | ... | 7.930000 | 12.020000 | 50.410000 | 185.200000 | 0.071170 | 0.027290 | 0.000000 | 0.000000 | 0.156500 | 0.055040 |
25% | 11.700000 | 16.170000 | 75.170000 | 420.300000 | 0.086370 | 0.064920 | 0.029560 | 0.020310 | 0.161900 | 0.057700 | ... | 13.010000 | 21.080000 | 84.110000 | 515.300000 | 0.116600 | 0.147200 | 0.114500 | 0.064930 | 0.250400 | 0.071460 |
50% | 13.370000 | 18.840000 | 86.240000 | 551.100000 | 0.095870 | 0.092630 | 0.061540 | 0.033500 | 0.179200 | 0.061540 | ... | 14.970000 | 25.410000 | 97.660000 | 686.500000 | 0.131300 | 0.211900 | 0.226700 | 0.099930 | 0.282200 | 0.080040 |
75% | 15.780000 | 21.800000 | 104.100000 | 782.700000 | 0.105300 | 0.130400 | 0.130700 | 0.074000 | 0.195700 | 0.066120 | ... | 18.790000 | 29.720000 | 125.400000 | 1084.000000 | 0.146000 | 0.339100 | 0.382900 | 0.161400 | 0.317900 | 0.092080 |
max | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 0.345400 | 0.426800 | 0.201200 | 0.304000 | 0.097440 | ... | 36.040000 | 49.540000 | 251.200000 | 4254.000000 | 0.222600 | 1.058000 | 1.252000 | 0.291000 | 0.663800 | 0.207500 |
8 rows × 30 columns
# Print the raw frame, id and diagnosis columns included.
print(data)
id diagnosis radius_mean texture_mean perimeter_mean area_mean \ 0 842302 M 17.99 10.38 122.80 1001.0 1 842517 M 20.57 17.77 132.90 1326.0 2 84300903 M 19.69 21.25 130.00 1203.0 3 84348301 M 11.42 20.38 77.58 386.1 4 84358402 M 20.29 14.34 135.10 1297.0 .. ... ... ... ... ... ... 564 926424 M 21.56 22.39 142.00 1479.0 565 926682 M 20.13 28.25 131.20 1261.0 566 926954 M 16.60 28.08 108.30 858.1 567 927241 M 20.60 29.33 140.10 1265.0 568 92751 B 7.76 24.54 47.92 181.0 smoothness_mean compactness_mean concavity_mean concave points_mean \ 0 0.11840 0.27760 0.30010 0.14710 1 0.08474 0.07864 0.08690 0.07017 2 0.10960 0.15990 0.19740 0.12790 3 0.14250 0.28390 0.24140 0.10520 4 0.10030 0.13280 0.19800 0.10430 .. ... ... ... ... 564 0.11100 0.11590 0.24390 0.13890 565 0.09780 0.10340 0.14400 0.09791 566 0.08455 0.10230 0.09251 0.05302 567 0.11780 0.27700 0.35140 0.15200 568 0.05263 0.04362 0.00000 0.00000 ... radius_worst texture_worst perimeter_worst area_worst \ 0 ... 25.380 17.33 184.60 2019.0 1 ... 24.990 23.41 158.80 1956.0 2 ... 23.570 25.53 152.50 1709.0 3 ... 14.910 26.50 98.87 567.7 4 ... 22.540 16.67 152.20 1575.0 .. ... ... ... ... ... 564 ... 25.450 26.40 166.10 2027.0 565 ... 23.690 38.25 155.00 1731.0 566 ... 18.980 34.12 126.70 1124.0 567 ... 25.740 39.42 184.60 1821.0 568 ... 9.456 30.37 59.16 268.6 smoothness_worst compactness_worst concavity_worst \ 0 0.16220 0.66560 0.7119 1 0.12380 0.18660 0.2416 2 0.14440 0.42450 0.4504 3 0.20980 0.86630 0.6869 4 0.13740 0.20500 0.4000 .. ... ... ... 564 0.14100 0.21130 0.4107 565 0.11660 0.19220 0.3215 566 0.11390 0.30940 0.3403 567 0.16500 0.86810 0.9387 568 0.08996 0.06444 0.0000 concave points_worst symmetry_worst fractal_dimension_worst 0 0.2654 0.4601 0.11890 1 0.1860 0.2750 0.08902 2 0.2430 0.3613 0.08758 3 0.2575 0.6638 0.17300 4 0.1625 0.2364 0.07678 .. ... ... ... 
564 0.2216 0.2060 0.07115 565 0.1628 0.2572 0.06637 566 0.1418 0.2218 0.07820 567 0.2650 0.4087 0.12400 568 0.0000 0.2871 0.07039 [569 rows x 32 columns]
data_dia = y
data = x
# Standardise every feature (subtract the mean, divide by the standard
# deviation) so all features can share one value axis.
data_n_2 = (data - data.mean()) / data.std()
# Take the first ten feature columns and reshape to long form:
# one row per (diagnosis, feature name, standardised value).
data = pd.concat([y, data_n_2.iloc[:, :10]], axis=1).melt(
    id_vars="diagnosis",
    var_name="features",
    value_name="value",
)
plt.figure(figsize=(5, 5))
# Split violins put both diagnosis classes on one violin per feature.
# `inner` options: "box", "quartile", "point", "stick" or None.
sns.violinplot(
    x="features",
    y="value",
    hue="diagnosis",
    data=data,
    split=True,
    inner="points",
)
plt.xticks(rotation=45);
# Feature columns 10-19 of the standardised frame, reshaped to long form:
# one row per (diagnosis, feature name, standardised value).
data = pd.concat([y, data_n_2.iloc[:, 10:20]], axis=1).melt(
    id_vars="diagnosis",
    var_name="features",
    value_name="value",
)
plt.figure(figsize=(10, 10))
# Side-by-side (unsplit) violins per diagnosis, with quartile lines inside.
sns.violinplot(
    x="features",
    y="value",
    hue="diagnosis",
    data=data,
    split=False,
    inner="quartile",
)
plt.xticks(rotation=45);
# Feature columns 20-29 of the standardised frame. The feature frame has 30
# columns, so the slice stops at 30; the earlier upper bound of 31 pointed
# past the last column and only worked because pandas clips slice bounds.
data = pd.concat([y, data_n_2.iloc[:, 20:30]], axis=1)
# Long form: one row per (diagnosis, feature name, standardised value).
data = pd.melt(
    data,
    id_vars="diagnosis",
    var_name="features",
    value_name="value",
)
plt.figure(figsize=(10, 10))
# `fliersize` is a marker size and must be numeric (it was the string "5").
sns.boxplot(
    x="features",
    y="value",
    hue="diagnosis",
    data=data,
    saturation=1,
    fliersize=5,
    linewidth=3,
)
plt.xticks(rotation=45);
# Rebuild the wide frame: diagnosis plus the standardised columns from
# position 20 onward.
# NOTE(review): the upper bound 31 is past the last of the 30 feature columns;
# pandas clips it, so this selects columns 20-29 — consider writing 20:30.
data = pd.concat([y,data_n_2.iloc[:,20:31]],axis=1)
# Notebook display of the wide frame.
data
diagnosis | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | M | 1.885031 | -1.358098 | 2.301575 | 1.999478 | 1.306537 | 2.614365 | 2.107672 | 2.294058 | 2.748204 | 1.935312 |
1 | M | 1.804340 | -0.368879 | 1.533776 | 1.888827 | -0.375282 | -0.430066 | -0.146620 | 1.086129 | -0.243675 | 0.280943 |
2 | M | 1.510541 | -0.023953 | 1.346291 | 1.455004 | 0.526944 | 1.081980 | 0.854222 | 1.953282 | 1.151242 | 0.201214 |
3 | M | -0.281217 | 0.133866 | -0.249720 | -0.549538 | 3.391291 | 3.889975 | 1.987839 | 2.173873 | 6.040726 | 4.930672 |
4 | M | 1.297434 | -1.465481 | 1.337363 | 1.219651 | 0.220362 | -0.313119 | 0.612640 | 0.728618 | -0.867590 | -0.396751 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
564 | M | 1.899514 | 0.117596 | 1.751022 | 2.013529 | 0.378033 | -0.273077 | 0.663928 | 1.627719 | -1.358963 | -0.708467 |
565 | M | 1.535369 | 2.045599 | 1.420690 | 1.493644 | -0.690623 | -0.394473 | 0.236365 | 0.733182 | -0.531387 | -0.973122 |
566 | M | 0.560868 | 1.373645 | 0.578492 | 0.427529 | -0.808876 | 0.350427 | 0.326479 | 0.413705 | -1.103578 | -0.318129 |
567 | M | 1.959515 | 2.235958 | 2.301575 | 1.651717 | 1.429169 | 3.901415 | 3.194794 | 2.287972 | 1.917396 | 2.217684 |
568 | B | -1.409652 | 0.763518 | -1.431475 | -1.074867 | -1.857384 | -1.206491 | -1.304683 | -1.743529 | -0.048096 | -0.750546 |
569 rows × 11 columns
# Wide -> long: one row per (diagnosis, feature name, value).
data = data.melt(id_vars="diagnosis", var_name="features", value_name="value")
# Notebook display of the long frame.
data
diagnosis | features | value | |
---|---|---|---|
0 | M | radius_worst | 1.885031 |
1 | M | radius_worst | 1.804340 |
2 | M | radius_worst | 1.510541 |
3 | M | radius_worst | -0.281217 |
4 | M | radius_worst | 1.297434 |
... | ... | ... | ... |
5685 | M | fractal_dimension_worst | -0.708467 |
5686 | M | fractal_dimension_worst | -0.973122 |
5687 | M | fractal_dimension_worst | -0.318129 |
5688 | M | fractal_dimension_worst | 2.217684 |
5689 | B | fractal_dimension_worst | -0.750546 |
5690 rows × 3 columns
# Notebook display: the distinct feature names present after melting.
data['features'].unique()
array(['radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst'], dtype=object)
# Notebook display: number of rows per feature name in the long frame.
data['features'].value_counts()
fractal_dimension_worst 569 area_worst 569 texture_worst 569 smoothness_worst 569 compactness_worst 569 concave points_worst 569 symmetry_worst 569 perimeter_worst 569 concavity_worst 569 radius_worst 569 Name: features, dtype: int64
# Hex-binned joint distribution of two features from the raw feature frame.
# The two series are passed as `x=` / `y=` because newer seaborn releases do
# not accept them positionally. The semicolon stays on the call line (it
# only suppresses the notebook's echo of the return value); a lone ';' on its
# own line is a syntax error.
# `kind` options: "scatter", "reg", "resid", "kde", "hex".
sns.jointplot(
    x=x.loc[:, 'concavity_worst'],
    y=x.loc[:, 'concave points_worst'],
    kind="hex",
    color="g",
);
#sns.set(style="whitegrid", palette="muted")
data_dia = y
data = x
# Standardise every feature (subtract the mean, divide by the standard
# deviation) so all features can share one value axis.
data_n_2 = (data - data.mean()) / data.std()
# First ten feature columns, reshaped to long form:
# one row per (diagnosis, feature name, standardised value).
data = pd.concat([y, data_n_2.iloc[:, :10]], axis=1).melt(
    id_vars="diagnosis",
    var_name="features",
    value_name="value",
)
plt.figure(figsize=(10, 10))
# Swarm plot: every observation drawn as its own marker, coloured by diagnosis.
sns.swarmplot(
    x="features",
    y="value",
    hue="diagnosis",
    data=data,
    size=8,
)
plt.xticks(rotation=45);
# Correlation map: pairwise correlations between all feature columns,
# annotated to one decimal place on a large canvas.
f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(
    x.corr(),
    annot=True,
    linewidths=0.5,
    fmt='.1f',
    ax=ax,
);
Note: If you are starting the notebook from this task, you can run the cells from all previous tasks by going to the top menu and selecting Kernel > Restart and Run All.
# Bare string literal: it is evaluated and discarded, so this list of feature
# names has no effect at run time.
# NOTE(review): presumably a shortlist of highly correlated features kept as a
# reminder for later feature selection — confirm with the author.
'''['perimeter_mean','radius_mean','compactness_mean',
'concave points_mean','radius_se','perimeter_se',
'radius_worst','perimeter_worst','compactness_worst',
'concave points_worst','compactness_se','concave points_se',
'texture_worst','area_worst']'''
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.metrics import accuracy_score
sns.set(style="white")
# Pairwise view of three "worst" size measurements from the raw feature frame.
df = x[['radius_worst', 'perimeter_worst', 'area_worst']]
g = sns.PairGrid(df, diag_sharey=False)
# Lower triangle: KDE plots; upper triangle: scatter plots; diagonal: KDE
# curves drawn with a thicker line.
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_upper(plt.scatter)
g.map_diag(sns.kdeplot, lw=3);
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2