#!/usr/bin/env python # coding: utf-8 # # [PyBroMo](http://tritemio.github.io/PyBroMo/) - A1. Reference - Data format and internals # # This notebook is part of PyBroMo a # python-based single-molecule Brownian motion diffusion simulator # that simulates confocal smFRET # experiments. # # > **Summary.** *This notebook describes the HDF5 files used to save diffusion trajectories and raw timestamps.* # > *PyBroMo can also generate complete smFRET data files in [Photon-HDF5](http://photon-hdf5.org) format,* # > *but for this format the reader can refer to official Photon-HDF5 specifications.* # In[1]: import numpy as np import tables import pybromo as pbm print('Numpy version:', np.__version__) print('PyTables version:', tables.__version__) print('PyBroMo version:', pbm.__version__) # In[2]: S = pbm.ParticlesSimulation.from_datafile('0168') # # File format for trajectories # # The simulation is saved in a HDF5 file, one for each running engine. The file has the following content: # # - **/parameters** # # * Numeric parameters (storead as scalar arrays) # - `D` # - `t_step` # - `t_max` # - `EID` # - `ID` # - `chunksize`: used for `emission` and `position` arrays # - `np`: number of particles # - `pMol`: particles concentration in pico-Molar # # * Non-numeric parameters (stored as group attributes) # - `box`: the `Box()` object (stores boundaries and size) # - `particles`: the `Particles()` object, a list of `Particle()` # (containing the initial position positions) and seed. # # # - **/psf** # * Arrays of PSF used in the simulation. This is the raw array as saved from PSFLab. The name of the array is the same as the origina file name. The PSF array has the following attributes: # - 'fname', 'dir_', 'x_step', 'z_step' # * The array and its attributes allow to recreate the `NumericPSF()` object on a simulation reload. # - **TODO**: 'wavelength', 'NA', and other PSFLab parameters # * `default_psf`: hard link to the PSF used in the simualation, used as persistent name # # # - **/trajectories** # * `emission`: 2D array of emission traces: one row per particle. Shape: (num_particles, time) # * `emission_tot`: 1D array of emission trace: total emission from all the particles: Shape: (time) # * `position`: 3D array of positions. Shape (num_particles, 3, time) # # # - **/timestamps** # * Arrays of timestamps for different `rate_max`, `bg_rate` and `seed`. # # ## How to access # # The HDF5 file handle is in `S.store.data_file` after you run `S.open_store()`: # In[3]: S.compact_name_core() # In[4]: S.compact_name_core(t_max=True) # In[5]: S.compact_name() # ## HDF5 File inspection # In[6]: print ('Main groups:\n') for node in S.store.h5file.root: print (node) for n in node: print ('\t%s' % n.name) print ('\t %s' % n.title) # In[7]: h5file = S.store.h5file # ### Group: /parameters # In[8]: group = '/parameters' print ('Numeric attributes (nodes) in %s:\n' % group) print (S.store.h5file.get_node(group)) for node in S.store.h5file.get_node(group)._f_list_nodes(): print ('\t%s (%s)' % (node.name, str(node.read()))) print ('\t %s' % node.title) # In[9]: group = '/parameters' print ('Attributes in %s:\n' % group) print (S.store.h5file.get_node(group)) for attr in S.store.h5file.get_node(group)._v_attrs._f_list(): print ('\t%s' % attr) print ('\t %s' % type(S.store.h5file.get_node(group)._v_attrs[attr])) # In[10]: particles = S.store.h5file.root.parameters._f_getattr('particles') # In[11]: len(particles), particles.rs_hash # ### Group /psf # In[12]: group = '/psf' print ('Nodes in in %s:\n' % group) print (S.store.h5file.get_node(group)) for node in S.store.h5file.get_node(group)._f_list_nodes(): print ('\t%s %s' % (node.name, node.shape)) print ('\t %s' % node.title) # ### `default_psf` attributes # In[13]: node_name = '/psf/default_psf' node = S.store.h5file.get_node(node_name) print ("\n%s %s: '%s'" % (node.name, node.shape, node.title)) print ('\n List of attributes:') for attr in node.attrs._f_list(): print ('\t%s' % attr) print ("\t %s" % repr(node._v_attrs[attr])) # ### Group /trajectories # In[14]: group = '/trajectories' print ('Nodes in in %s:\n' % group) print (S.store.h5file.get_node(group)) for node in S.store.h5file.get_node(group)._f_list_nodes(): print ('\t%s %s' % (node.name, node.shape)) print ('\t %s' % node.title) # In[15]: group = '/trajectories' print ('Attributes in %s:\n' % group) print (S.store.h5file.get_node(group)) for attr in S.store.h5file.get_node(group)._v_attrs._f_list(): print ('\t%s' % attr) print ('\t %s' % type(S.store.h5file.get_node(group)._v_attrs[attr])) # ### `emission` attributes # In[16]: node_name = '/trajectories/emission' node = S.store.h5file.get_node(node_name) print ("\n%s %s: '%s'" % (node.name, node.shape, node.title)) print ('\n List of attributes:') for attr in node.attrs._f_list(): print ('\t%s' % attr) attr_data = repr(node._v_attrs[attr]) if len(attr_data) > 300: attr_data = hash_(node._v_attrs[attr]) print ("\t %s" % attr_data) # ### `position` attributes # In[17]: node_name = '/trajectories/position' if node_name in S.store.h5file: node = S.store.h5file.get_node(node_name) print ("\n%s %s: '%s'" % (node.name, node.shape, node.title)) print ('\n List of attributes:') for attr in node.attrs._f_list(): print ('\t%s' % attr) print ("\t %s" % repr(node._v_attrs[attr])) else: print ('%s not present.') # # File format for timestamps # In[18]: group = '/timestamps' print ('Nodes in in %s:\n' % group) print (S.ts_store.h5file.get_node(group)) for node in S.ts_store.h5file.get_node(group)._f_list_nodes(): print ('\t%s' % node.name) print ('\t %s' % node.title) # In[19]: group = '/timestamps' print ('Attributes in %s:\n' % group) print (S.ts_store.h5file.get_node(group)) for attr in S.ts_store.h5file.get_node(group)._v_attrs._f_list(): print ('\t%s' % attr) print ('\t %s' % type(S.ts_store.h5file.get_node(group)._v_attrs[attr])) # In[20]: group = '/timestamps' print ('>> Nodes in %s' % S.ts_store.h5file.get_node(group)) for node in S.ts_store.h5file.get_node(group)._f_list_nodes(): #print '\t%s' % node.name #print '\t %s' % node.title print ("\n%s %s: '%s'" % (node.name, node.shape, node.title)) print ('\n List of attributes:') for attr in node.attrs._f_list(): print ('\t%s' % attr) attr_data = repr(node._v_attrs[attr]) if len(attr_data) > 300: attr_data = pbm.hash_(node._v_attrs[attr]) print ("\t %s" % attr_data) print ('\n>> Attributes in %s:\n' % group) print (S.ts_store.h5file.get_node(group)) for attr in S.ts_store.h5file.get_node(group)._v_attrs._f_list(): print ('\t%s' % attr) print ('\t %s' % type(S.ts_store.h5file.get_node(group)._v_attrs[attr])) # In[21]: S.store.close() S.ts_store.close() # # Random State: reprodicibility and high-quality pseudo-random numbers # PyBromo uses Numpy's [`RandomState`](http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.html) # object to track the current state of the random number generator at different # simulation stages. # # Tracking the random state has a dual purpose. First, it allows to reproduce # any simulation step. # # Second, it allows to mantain an high-quality pseudo-random number stream when # the simulation is resumed. This point is more subtle. Simulation can be performed # in different steps. When resuming a simulation to proceed to the nex step it is important # to use the last saved random state. In fact, by resetting the random state using an arbitrary # seed we may easily introduce correlation between the previous and current stream of random numbers. # For example streams generated with seeds 1 and 2 will be correlated # (up to 1e6 samples!) because many bits in the seed are the same. This is a property of the # [Mersenne twister](http://en.wikipedia.org/wiki/Mersenne_twister) # algorithm (see also [this paper](http://www.iro.umontreal.ca/~lecuyer/myftp/papers/wellrng.pdf)). # To avoid this well-known problem we need to use a single stream by freezing (saving) and restoring # the random state at each step. # # Notice that there are 3 steps that require random numbers: # # 1. Generating the initial **particles position** # 2. **Brownian motion** trajectories simulation (3-D trajectories + emission rates) # 3. Generating **timestamps** based on the emission rates # # The random state is tracked as follows: # # 1. When generating the particles with `gen_particles` the new `Particles` object # will contain the `.init_random_state` attribute (and, as a convenience, a 3 digit # hash in `.rs_hash`) # 2. Whem performing the Brownian motion simulation with `.sim_brownian_motion`, # the '/trajectories' group (shortcut `S.traj_group`) will store the initial and # the final state as group attributes: `.init_random_state` and `.last_random_state`. # The assumption is that we simulate only 1 Browian motion diffusion per object # so makes sense to store these values as group attributes. # 3. When generating timestamps with `S.sim_timestamps_em_store`, we will generate # timestamps several times for different emission or background rates. Therefore # the '/timestamps' group (shortcut`S.ts_group`) contains the `last_random_state` # attribute and each timestamp array contains the `init_random_state` attribute. # # > **NOTE:** Since the random-state data structure ([a tuple of a string, an array, and 3 numbers](http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.get_state.html)) # > is numpy-specific we save it without any conversion. In fact, using the same random state # > in another programming language would require a deep understanding of the Mersenne twister # > implementation. Not to mention that a proprietary language may not provide such level of details # > to the user. Anyway, if you are motivated to use the random state in another language, an # > additional conversion from numpy format would be the least of your problems.