Superconductivty Data Set: Predict the critical temperature based on 81 material features.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
filename = "https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/data/train_critical_temp.csv"
df = pd.read_csv(filename, engine='python')
df.head()
number_of_elements | mean_atomic_mass | wtd_mean_atomic_mass | gmean_atomic_mass | wtd_gmean_atomic_mass | entropy_atomic_mass | wtd_entropy_atomic_mass | range_atomic_mass | wtd_range_atomic_mass | std_atomic_mass | ... | wtd_mean_Valence | gmean_Valence | wtd_gmean_Valence | entropy_Valence | wtd_entropy_Valence | range_Valence | wtd_range_Valence | std_Valence | wtd_std_Valence | critical_temp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4 | 88.944468 | 57.862692 | 66.361592 | 36.116612 | 1.181795 | 1.062396 | 122.90607 | 31.794921 | 51.968828 | ... | 2.257143 | 2.213364 | 2.219783 | 1.368922 | 1.066221 | 1 | 1.085714 | 0.433013 | 0.437059 | 29.0 |
1 | 5 | 92.729214 | 58.518416 | 73.132787 | 36.396602 | 1.449309 | 1.057755 | 122.90607 | 36.161939 | 47.094633 | ... | 2.257143 | 1.888175 | 2.210679 | 1.557113 | 1.047221 | 2 | 1.128571 | 0.632456 | 0.468606 | 26.0 |
2 | 4 | 88.944468 | 57.885242 | 66.361592 | 36.122509 | 1.181795 | 0.975980 | 122.90607 | 35.741099 | 51.968828 | ... | 2.271429 | 2.213364 | 2.232679 | 1.368922 | 1.029175 | 1 | 1.114286 | 0.433013 | 0.444697 | 19.0 |
3 | 4 | 88.944468 | 57.873967 | 66.361592 | 36.119560 | 1.181795 | 1.022291 | 122.90607 | 33.768010 | 51.968828 | ... | 2.264286 | 2.213364 | 2.226222 | 1.368922 | 1.048834 | 1 | 1.100000 | 0.433013 | 0.440952 | 22.0 |
4 | 4 | 88.944468 | 57.840143 | 66.361592 | 36.110716 | 1.181795 | 1.129224 | 122.90607 | 27.848743 | 51.968828 | ... | 2.242857 | 2.213364 | 2.206963 | 1.368922 | 1.096052 | 1 | 1.057143 | 0.433013 | 0.428809 | 23.0 |
5 rows × 82 columns
y = df['critical_temp'].values
X = df[[col for col in df.columns if col!="critical_temp"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)
import xgboost as xgb
import time
# XGBreg = xgb.sklearn.XGBRegressor(nthread=-1, seed=1, n_estimators=1000)
XGBreg = xgb.sklearn.XGBRegressor()
start_time = time.time()
XGBreg.fit(X_train, y_train)
run_time = time.time() - start_time
print(run_time)
16.105452060699463
y_pred = XGBreg.predict(X_test)
plt.scatter(y_test, y_pred, s=2)
plt.xlabel("true critical temperature (K)", fontsize=14)
plt.ylabel("predicted critical temperature (K)", fontsize=14)
plt.savefig("critical_temperature.pdf")
rms = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"root mean square error {rms:.2f}")
root mean square error 9.68
# compare with other regressors
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor()
from sklearn.neural_network import MLPRegressor
mlpr = MLPRegressor(hidden_layer_sizes=(50,50), activation='relu', random_state=1, max_iter=5000)
regressors = [rfr, gbr, mlpr]
for reg in regressors:
start_time = time.time()
reg.fit(X_train, y_train)
run_time = time.time() - start_time
y_pred = reg.predict(X_test)
rms = np.sqrt(mean_squared_error(y_test, y_pred))
print(run_time)
print(f"root mean square error {rms:.2f}\n")
83.4200599193573 root mean square error 9.54 28.77874779701233 root mean square error 12.56 9.2709481716156 root mean square error 18.58