#!/usr/bin/env python # coding: utf-8 # # # # Save Chain to Log File # Author(s): Paul Miles | Date Created: June 18, 2018 # # Many models are time consuming to evaluate. As MCMC simulations required many model evaluations, it can be useful to periodically save the chain elements to a file. This can be useful for a variety of reasons: # # - Chain visualization while simulation continues to run. # - Chain is saved in the event that simulation ends prematurely. # # This is important when working on remote systems where you may have limited computation time. This tutorial demonstrates the following: # # - How to specify a log file directory # - Format to save log files in (binary or text) # - How to read in log files for analysis # # Run Simulation & Export to Log Files # Import required paths. # In[6]: import numpy as np from pymcmcstat.MCMC import MCMC from datetime import datetime # Define a simple model and sum-of-squares function. # In[7]: # define test model function def test_modelfun(xdata, theta): m = theta[0] b = theta[1] nrow, ncol = xdata.shape y = np.zeros([nrow,1]) y[:,0] = m*xdata.reshape(nrow,) + b return y def test_ssfun(theta, data): xdata = data.xdata[0] ydata = data.ydata[0] # eval model ymodel = test_modelfun(xdata, theta) # calc sos ss = sum((ymodel[:, 0] - ydata[:, 0])**2) return ss # Initialize MCMC object: # - Add data # - Define model settings # - Define model parameters # In[15]: # Initialize MCMC object mcset = MCMC() # Add data nds = 100 x = np.linspace(2, 3, num=nds) y = 2.*x + 3. + 0.1*np.random.standard_normal(x.shape) mcset.data.add_data_set(x, y) # update model settings mcset.model_settings.define_model_settings(sos_function=test_ssfun) mcset.parameters.add_model_parameter( name='m', theta0=2., minimum=-10, maximum=np.inf, sample=True) mcset.parameters.add_model_parameter( name='b', theta0=-5., minimum=-10, maximum=100, sample=True) # ## Define log file directory and turn on flags in simulations options # The following keyword arguments of the simulation options allow you to setup the log files. # # - `savedir`: Directory in which to store log files. If not specified, but log files turned on, then saves to directory with naming convention 'YYYYMMDD_hhmmss_chain_log'. # - `save_to_bin`: Save log files in binary format. Uses `h5py` package for binary read/write. # - `save_to_txt`: Save log files in text format. Uses `numpy` package for text read/write. # # By default the feature is set to `False`. You can save to either format or to both. Regardless of what format is used to save the chain, a text log file will be included which appends a date/time stamp with corresponding chain indices. This will be explained in more detail later. # In[18]: datestr = datetime.now().strftime('%Y%m%d_%H%M%S') savedir = str('{}_{}'.format(datestr, 'serial_chain')) mcset.simulation_options.define_simulation_options( nsimu=int(5e4), updatesigma=1, method='dram', savedir=savedir, savesize=1000, save_to_json=True, verbosity=0, waitbar=False, save_to_txt=True, save_to_bin=True) # ## Run Simulation # In[19]: mcset.run_simulation() # # Process Simulation # At this point, the simulation is either running, or has completed running. You will observe a folder in the working directory that matches the input argument for `savedir`. # In[40]: ls # We observe that the folder `20190517_073038_serial_chain` matches what was specified for `savedir` and display its contents # In[23]: print('savedir={}'.format(savedir)) # In[25]: ls 20190517_073038_serial_chain/ # As expected, there are log files saved in both binary (h5) and text (txt) format. # ## Processing the log files # We start by importing several modules from the [pymcmcstat](https://prmiles.wordpress.ncsu.edu/codes/python-packages/pymcmcstat/) package. We note that this operation should be done from a separate script file, and possibly from a separate computer. For example, if running a long simulation on a remote server, you can periodically copy the log files from the remote server and analyze the chains on your local machine. # In[26]: from pymcmcstat.chain import ChainProcessing as CP from pymcmcstat.chain.ChainStatistics import chainstats from pymcmcstat.plotting.MCMCPlotting import Plot import time # We initialize the plotting class and define the directory in which to find the log files. # In[27]: # Initialize classes MCP = Plot() # define directory where log files are saved savedir = '20190517_073038_serial_chain' # For testing purposes we can repeatedly read in the data to see how binary versus text is processed. ns = 10 # Read in binary data files and print amount of time it takes to process. # In[30]: start = time.time() for ii in range(ns): results = CP.read_in_savedir_files(savedir, extension='h5') end = time.time() binary_time = end - start print('Binary: {} sec\n'.format(binary_time/ns)) # In[31]: start = time.time() for ii in range(ns): results = CP.read_in_savedir_files(savedir, extension='txt') end = time.time() text_time = end - start print('Text: {} sec\n'.format(text_time/ns)) # It is clearly seen that the binary files are more quickly processed. In either case, the results extracted from the log files are identical, and we can proceed with the analysis. # ## Analysis # We extract the following from the results dictionary: # - `chain`: Sampling chain for model parameters # - `s2chain`: Observation error chain # - `sschain`: Sum-of-squares error corresponding to each row of `chain`. # In[32]: chain = results['chain'] s2chain = results['s2chain'] sschain = results['sschain'] # We define the burn-in period for the chain as half the simulation run time. Display statistics for burned-in portion of chain. # In[33]: # define burnin nsimu = chain.shape[0] burnin = int(nsimu/2) # display chain statistics stats = chainstats(chain[burnin:,:], returnstats=True) # ## Plot Chain # - Chain panel # - pairwise correlation # In[38]: MCP.plot_chain_panel(chain[burnin:, :], figsizeinches=(3, 3)) MCP.plot_pairwise_correlation_panel(chain[burnin:, :], figsizeinches=(3, 3)); # ## Print log files # Log files display a date/time stamp associated with when chain segments were appended to the correponding log file. # # | Date | Time | Start | End | # |--------------|---------|---------|---------| # |2019-05-17| 07:30:40| 0| 999| # |2019-05-17| 07:30:40| 1000| 1999| # |2019-05-17| 07:30:41| 2000| 2999| # |2019-05-17| 07:30:41| 3000| 3999| # |2019-05-17| 07:30:41| 4000| 4999| # |2019-05-17| 07:30:41| 5000| 5999| # # # In[39]: CP.print_log_files(savedir)