This notebook implements model training and cross-validation as run by the gpseer command line. It generates all of the same plots and CSV output. The best way to understand what is going on in this notebook is to follow the command-line tutorial.
# Set up the environment
%matplotlib inline
from gpseer import utils, maximum_likelihood, cross_validate, plot
# Fit settings. These mirror the options seen on the gpseer command line; the
# values quoted as "best model" are the ones to use for pfcrt-data.

# Best model for pfcrt-data: 5
threshold = None
# Best model for pfcrt-data: 2
spline_order = None
# Best model for pfcrt-data: 100000
spline_smoothness = None

# These two usually do not need to change.
epistasis_order, alpha = 1, 1

# Prefix shared by every output file name written by this notebook.
output_root = "linear"
# Read the example data into a genotype-phenotype map. The file is fetched
# straight from the gpseer repository; for a local copy of
# pfcrt-raw-data.csv, run `gpseer fetch-example` on the command line.
data_source = (
    "https://github.com/harmslab/gpseer/raw/master/examples/"
    "pfcrt-raw-data.csv"
)
gpm = utils.read_file_to_gpmap(data_source)
gpm
# Build the fitting model from the parameters chosen at the top of the
# notebook.
model_settings = dict(
    threshold=threshold,
    spline_order=spline_order,
    spline_smoothness=spline_smoothness,
    epistasis_order=epistasis_order,
    alpha=alpha,
)
ml_model = utils.construct_model(**model_settings)

# Attach the genotype-phenotype map to the model, then fit it.
ml_model.add_gpm(gpm)
ml_model.fit()
# Generate predictions from the fitted model and save them as a spreadsheet.
prediction_df = maximum_likelihood.predict_to_dataframe(ml_model)
predictions_csv = f"{output_root}_predictions.csv"
prediction_df.to_csv(predictions_csv)
prediction_df
# Summarize the fit. create_stats_output hands back two tables: one with
# fit statistics and one with convergence information.
fit_outputs = maximum_likelihood.create_stats_output(ml_model)
stats_df, convergence_df = fit_outputs

# Save and show the fit-information spreadsheet.
fit_information_csv = f"{output_root}_fit-information.csv"
stats_df.to_csv(fit_information_csv)
stats_df

# Save and show the convergence spreadsheet.
convergence_csv = f"{output_root}_convergence.csv"
convergence_df.to_csv(convergence_csv)
convergence_df
# Draw the spline fit and save it as a PDF.
fig, ax = plot.plot_spline(ml_model, prediction_df)

# plot_spline may give back None in place of a figure (presumably when the
# model has no spline -- confirm against gpseer), so only save a real figure.
if fig is not None:
    spline_pdf = f"{output_root}_spline-fit.pdf"
    fig.savefig(spline_pdf)

# Trailing None keeps the notebook cell from echoing a value.
None
# Draw measured versus predicted values and save the figure as a PDF.
fig, ax = plot.plot_correlation(ml_model, prediction_df)
correlation_pdf = f"{output_root}_correlation-plot.pdf"
fig.savefig(correlation_pdf)

# Trailing None keeps the notebook cell from echoing a value.
None
# Draw the phenotype histograms and save the figure as a PDF.
fig, ax = plot.plot_histograms(ml_model, prediction_df)
histograms_pdf = f"{output_root}_phenotype-histograms.pdf"
fig.savefig(histograms_pdf)

# Trailing None keeps the notebook cell from echoing a value.
None
# Build a separate model for cross-validation, using the same parameters as
# the model fitted above.
cv_settings = dict(
    threshold=threshold,
    spline_order=spline_order,
    spline_smoothness=spline_smoothness,
    epistasis_order=epistasis_order,
    alpha=alpha,
)
cv_model = utils.construct_model(**cv_settings)
# Run the cross-validation on the genotype-phenotype map.
# NOTE(review): presumably n_samples is the number of train/test splits and
# train_fraction the share of data used for training -- confirm in gpseer.
cv_df = cross_validate.cross_validate_to_dataframe(
    cv_model,
    gpm,
    n_samples=1000,
    train_fraction=0.8,
)

# Save and show the cross-validation spreadsheet.
cv_scores_csv = f"{output_root}_cross-validation-scores.csv"
cv_df.to_csv(cv_scores_csv)
cv_df
# Draw the cross-validation results and save the figure as a PDF.
fig, ax = plot.plot_test_train(cv_df)
cv_plot_pdf = f"{output_root}_cross-validation-plot.pdf"
fig.savefig(cv_plot_pdf)

# Trailing None keeps the notebook cell from echoing a value.
None