#!/usr/bin/env python
# coding: utf-8

# # Data Sets Tutorial
# This tutorial demonstrates how to create and use `DataSet` objects.  At its core Gate Set
# Tomography finds a gate set which best fits some experimental data, and in pyGSTi a `DataSet`
# is used to hold that data.  When a `DataSet` is used to hold time-independent data, it
# essentially looks like a nested dictionary which associates gate strings with dictionaries of
# (outcome-label, count) pairs so that `dataset[gateString][outcomeLabel]` can be used to read &
# write the number of `outcomeLabel` outcomes of the experiment given by the sequence
# `gateString`.
#
# There are a few important differences between a `DataSet` and a dictionary-of-dictionaries:
# - `DataSet` objects can be in one of two modes: *static* or *non-static*.  When in
#   *non-static* mode, data can be freely modified within the set, making this the mode to use
#   during data entry.  In the *static* mode, data cannot be modified and the `DataSet` is
#   essentially read-only.  The `done_adding_data` method of a `DataSet` switches from
#   non-static to static mode, and should be called, as the name implies, once all desired data
#   has been added (or modified).  Once a `DataSet` is static, it is read-only for the rest of
#   its life; to modify its data the best one can do is make a non-static *copy* via the
#   `copy_nonstatic` member and modify the copy.
#
# - Because `DataSet`s may contain time-dependent data, the dictionary-access syntax for a
#   single outcome label (i.e. `dataset[gateString][outcomeLabel]`) *cannot* be used to write
#   counts for new `gateString` keys; one should instead use the `add_`*xxx* methods of the
#   `DataSet` object.
#
# Once a `DataSet` is constructed, filled with data, and made *static*, it is typically passed
# as a parameter to one of pyGSTi's algorithm or driver routines to find a `GateSet` estimate
# based on the data.  This tutorial focuses on how to construct a `DataSet` and modify its data.
# Later tutorials will demonstrate the different GST algorithms.

# In[1]:

from __future__ import print_function
import pygsti

# ## Creating a `DataSet`
# There are three basic ways to create `DataSet` objects in `pygsti`:
# * By creating an empty `DataSet` object and manually adding counts corresponding to gate
#   strings.  Remember that the `add_`*xxx* methods must be used to add data for gate strings
#   not yet in the `DataSet`.  Once the data is added, be sure to call `done_adding_data`, as
#   this restructures the internal storage of the `DataSet` to optimize the access operations
#   used by algorithms.
# * By loading from a text-format dataset file via `pygsti.io.load_dataset`.  The result is a
#   ready-to-use-in-algorithms *static* `DataSet`, so there's no need to call
#   `done_adding_data` this time.
# * By using a `GateSet` to generate "fake" data via `generate_fake_data`.  This can be useful
#   for doing simulations of GST, and comparing to your experimental results.
#
# We do each of these in turn in the cells below.

# In[2]:

#1) Creating a data set from scratch
#   Note that tuples may be used in lieu of GateString objects
ds1 = pygsti.objects.DataSet(outcomeLabels=['0','1'])
ds1.add_count_dict( ('Gx',), {'0': 10, '1': 90} )
ds1.add_count_dict( ('Gx','Gy'), {'0': 40, '1': 60} )
ds1[('Gy',)] = {'0': 10, '1': 90} # dictionary assignment

#Modify existing data using dictionary-like access
ds1[('Gx',)]['0'] = 15
ds1[('Gx',)]['1'] = 85

#GateString objects can be used.
gs = pygsti.objects.GateString( ('Gx','Gy') )
ds1[gs]['0'] = 45
ds1[gs]['1'] = 55

ds1.done_adding_data()

# In[3]:

#2) By creating and loading a text-format dataset file.  The first
#   row is a directive which specifies what the columns (after the
#   first one) hold.  Other allowed values are "0 frequency",
#   "1 count", etc.  Note that "0" and "1" are the
#   SPAM labels and must match those of any GateSet used in
#   conjunction with this DataSet.
#The column directive uses the "0"/"1" outcome labels so the loaded data
# matches the labels used everywhere else in this tutorial.
dataset_txt = \
"""## Columns = 0 count, count total
{} 0 100
Gx 10 90
GxGy 40 60
Gx^4 20 90
"""
with open("tutorial_files/Example_TinyDataset.txt","w") as tinydataset:
    tinydataset.write(dataset_txt)
ds2 = pygsti.io.load_dataset("tutorial_files/Example_TinyDataset.txt")

# In[4]:

#3) By generating fake data (using the std1Q_XYI standard gate set module)
from pygsti.construction import std1Q_XYI

#Depolarize the perfect X,Y,I gate set
depol_gateset = std1Q_XYI.gs_target.depolarize(gate_noise=0.1)

#Compute the sequences needed to perform Long Sequence GST on
# this GateSet with sequences up to length 512
gatestring_list = pygsti.construction.make_lsgst_experiment_list(
    std1Q_XYI.gs_target, std1Q_XYI.prepStrs, std1Q_XYI.effectStrs,
    std1Q_XYI.germs, [1,2,4,8,16,32,64,128,256,512])

#Generate fake data (Tutorial 00)
ds3 = pygsti.construction.generate_fake_data(depol_gateset, gatestring_list, nSamples=1000,
                                             sampleError='binomial', seed=100)
ds3b = pygsti.construction.generate_fake_data(depol_gateset, gatestring_list, nSamples=50,
                                              sampleError='binomial', seed=100)

#Write the ds3 and ds3b datasets to a file for later tutorials
pygsti.io.write_dataset("tutorial_files/Example_Dataset.txt", ds3,
                        outcomeLabelOrder=['0','1'])
pygsti.io.write_dataset("tutorial_files/Example_Dataset_LowCnts.txt", ds3b)

# ## Viewing `DataSets`

# In[5]:

#It's easy to just print them:
print("Dataset1:\n",ds1)
print("Dataset2:\n",ds2)
print("Dataset3 is too big to print, so here it is truncated to Dataset2's strings\n",
      ds3.truncate(ds2.keys()))

# **Note that the outcome labels `'0'` and `'1'` appear as `('0',)` and `('1',)`**.  This is
# because outcome labels in pyGSTi are tuples of time-ordered instrument element (allowing for
# intermediate measurements) and POVM effect labels.  In the special but common case when
# there are no intermediate measurements, the outcome label is a 1-tuple of just the final
# POVM effect label.  In this case, one may use the effect label itself (e.g.
# `'0'` or `'1'`) in place of the 1-tuple in almost all contexts, as it is automatically
# converted to the 1-tuple (e.g. `('0',)` or `('1',)`) internally.  When printing, however,
# the 1-tuple is still displayed to remind the user of the more general structure contained
# in the `DataSet`.

# ## Iteration over data sets

# In[6]:

# A DataSet's keys() method returns a list of GateString objects
ds1.keys()

# In[7]:

# There are many ways to iterate over a DataSet.  Here's one:
for gatestring in ds1.keys():
    dsRow = ds1[gatestring]
    for spamlabel in dsRow.counts:  # iterating the counts dict yields its keys
        print("Gatestring = %s, SPAM label = %s, count = %d" %
              (str(gatestring).ljust(5), str(spamlabel).ljust(6), dsRow[spamlabel]))

# ## Advanced features of data sets
#
# ### `collisionAction` argument
# When creating a `DataSet` one may specify the `collisionAction` argument as either
# `"aggregate"` (the default) or `"keepseparate"`.  The former instructs the `DataSet` to
# simply add the counts of like outcomes when counts are added for an already existing gate
# sequence.  `"keepseparate"`, on the other hand, causes the `DataSet` to tag added count data
# by appending a fictitious `"#<n>"` gate label to a gate sequence that already exists, where
# `<n>` is an integer.  When retrieving the keys of a `keepseparate` data set, the
# `stripOccurrenceTags` argument to `keys()` determines whether the `"#<n>"` labels are
# included in the output (if they're not - the default - duplicate keys may be returned).
# Access to different occurrences of the same data are provided via the `occurrence` argument
# of the `get_row` and `set_row` functions, which should be used instead of the usual bracket
# indexing.
# In[8]:

#"aggregate" (the default): repeated additions for the same sequence are summed.
ds_agg = pygsti.objects.DataSet(outcomeLabels=['0','1'], collisionAction="aggregate")
ds_agg.add_count_dict( ('Gx','Gy'), {'0': 10, '1': 90} )
ds_agg.add_count_dict( ('Gx','Gy'), {'0': 40, '1': 60} )
print("Aggregate-mode Dataset:\n",ds_agg)

#"keepseparate": repeated additions are stored under occurrence-tagged keys.
ds_sep = pygsti.objects.DataSet(outcomeLabels=['0','1'], collisionAction="keepseparate")
ds_sep.add_count_dict( ('Gx','Gy'), {'0': 10, '1': 90} )
ds_sep.add_count_dict( ('Gx','Gy'), {'0': 40, '1': 60} )
print("Keepseparate-mode Dataset:\n",ds_sep)

# ### Time-dependent data
# When your data is time-stamped, either for each individual count or by groups of counts,
# there are additional (richer) options for analysis.  The `DataSet` class is also capable of
# storing time-dependent data by holding *series* of count data rather than binned
# numbers-of-counts, which are added via its `add_series_data` method.  Outcome counts are
# input by giving at least two parallel arrays of 1) outcome labels and 2) time stamps.
# Optionally, one can provide a third array of repetitions, specifying how many times the
# corresponding outcome occurred at the time stamp.  While in reality no two outcomes are
# taken at exactly the same time, a `DataSet` allows for arbitrarily *coarse-grained*
# time-dependent data in which multiple outcomes are all tagged with the *same* time stamp.
# In fact, the "time-independent" case considered in this tutorial so far is actually a
# special case in which all the data is stamped at *time=0*.
#
# Below we demonstrate how to create and initialize a `DataSet` using time series data.
# In[9]:

#Create an empty dataset
tdds = pygsti.objects.DataSet(outcomeLabels=['0','1'])

#Add a "single-shot" series of outcomes, where each outcome label has a separate time stamp
tdds.add_raw_series_data( ('Gx',), #gate sequence
            ['0','0','1','0','1','0','1','1','1','0'], #outcome labels
            [0.0, 0.2, 0.5, 0.6, 0.7, 0.9, 1.1, 1.3, 1.35, 1.5]) #time stamps

#When adding outcome-counts in "chunks" where the counts of each
# chunk occur at nominally the same time, use 'add_series_data' to
# add a list of count dictionaries with a timestamp given for each dict:
tdds.add_series_data( ('Gx','Gx'), #gate sequence
                      [{'0':10, '1':90}, {'0':30, '1':70}], #count dicts
                      [0.0, 1.0]) #time stamps - one per dictionary

#For even more control, you can specify the timestamp of each count
# event or group of identical outcomes that occur at the same time:

#Add 3 '0' outcomes at time 0.0, followed by 2 '1' outcomes at time 1.0
tdds.add_raw_series_data( ('Gy',), #gate sequence
            ['0','1'], #outcome labels
            [0.0, 1.0], #time stamps
            [3,2]) #repeats

#The above coarse-grained addition is logically identical to:
# tdds.add_raw_series_data( ('Gy',), #gate sequence
#             ['0','0','0','1','1'], #outcome labels
#             [0.0, 0.0, 0.0, 1.0, 1.0]) #time stamps
# (However, the DataSet will store the coarse-grained addition more efficiently.)
# When one is done populating the `DataSet` with data, one should still call
# `done_adding_data`:

# In[10]:

tdds.done_adding_data()

# Access to the underlying time series data is done by indexing on the gate sequence (to get
# a `DataSetRow` object, just as in the time-independent case) which has various methods for
# retrieving its underlying data:

# In[11]:

def _print_row_info(row, seq_name):
    #Dump a DataSetRow's raw time-series arrays followed by its aggregated,
    # time-independent-like views.  Factored out so both sequences below are
    # reported identically.
    print("INFO for %s string:\n" % seq_name)
    print( row )
    print( "Raw outcome label indices:", row.oli )
    print( "Raw time stamps:", row.time )
    print( "Raw repetitions:", row.reps )
    print( "Number of entries in raw arrays:", len(row) )
    print( "Outcome Labels:", row.outcomes )
    print( "Repetition-expanded outcome labels:", row.get_expanded_ol() )
    print( "Repetition-expanded outcome label indices:", row.get_expanded_oli() )
    print( "Repetition-expanded time stamps:", row.get_expanded_times() )
    print( "Time-independent-like counts per spam label:", row.counts )
    print( "Time-independent-like total counts:", row.total )
    print( "Time-independent-like spam label fraction:", row.fractions )

tdds_row = tdds[('Gx',)]
_print_row_info(tdds_row, 'Gx')
print("\n")

tdds_row = tdds[('Gy',)]
_print_row_info(tdds_row, 'Gy')

# Finally, it is possible to read text-formatted time-dependent data in the special case when
# 1.
#    the outcomes are all single-shot
# 2. the time stamps of the outcomes are the integers (starting at zero) for *all* of the
#    gate sequences.
#
# This corresponds to the case when each sequence is performed and measured simultaneously at
# equally spaced intervals.  We realize this is a bit fictitious and more text-format input
# options will be created in the future.

# ## The `MultiDataSet` object: a dictionary of `DataSet`s
# Sometimes it is useful to deal with several sets of data all of which hold counts for the
# *same* set of gate sequences.  For example, collecting data to perform GST on Monday and
# then again on Tuesday, or making an adjustment to an experimental system and re-taking
# data, could create two separate data sets with the same sequences.  PyGSTi has a separate
# data type, `pygsti.objects.MultiDataSet`, for this purpose.  A `MultiDataSet` looks and
# acts like a simple dictionary of `DataSet` objects, but underneath implements certain
# optimizations that reduce the amount of space and memory required to store the data.
# Primarily, it holds just a *single* list of the gate sequences - as opposed to an actual
# dictionary of `DataSet`s in which each `DataSet` contains its own copy of the gate
# sequences.  In addition to being more space efficient, a `MultiDataSet` is able to
# aggregate all of its data into a single "summed" `DataSet` via
# `get_datasets_aggregate(...)`, which can be useful for combining several "passes" of
# experimental data.
#
# Several remarks regarding a `MultiDataSet` are worth mentioning:
# - you add `DataSets` to a `MultiDataSet` using the `add_dataset` method.  However only
#   *static* `DataSet` objects can be added.  This is because the MultiDataSet must keep all
#   of its `DataSet`s locked to the same set of sequences, and a non-static `DataSet` allows
#   the addition or removal of sequences.  (If the `DataSet` you want to add isn't in
#   static-mode, call its `done_adding_data` method.)
# - square-bracket indexing accesses the `MultiDataSet` as if it were a dictionary of
#   `DataSets`.
# - `MultiDataSets` can be loaded and saved from a single text-format file with columns for
#   each contained `DataSet` - see `pygsti.io.load_multidataset`.
#
# Here's a brief example of using a `MultiDataSet`:

# In[12]:

from __future__ import print_function
import pygsti

multiDS = pygsti.objects.MultiDataSet()

#Create some datasets.  Both hold counts for the SAME gate sequences, as a
# MultiDataSet requires; we build each from a small table of (sequence, counts)
# pairs and add it under its label once it has been made static.
dataset_contents = [
    ('myDS',  [ ( (),                    {'0': 10, '1': 90} ),
                ( ('Gx',),               {'0': 10, '1': 90} ),
                ( ('Gx','Gy'),           {'0': 20, '1': 80} ),
                ( ('Gx','Gx','Gx','Gx'), {'0': 20, '1': 80} ) ]),
    ('myDS2', [ ( (),                    {'0': 15, '1': 85} ),
                ( ('Gx',),               {'0': 5,  '1': 95} ),
                ( ('Gx','Gy'),           {'0': 30, '1': 70} ),
                ( ('Gx','Gx','Gx','Gx'), {'0': 40, '1': 60} ) ]),
]
for label, rows in dataset_contents:
    dset = pygsti.objects.DataSet(outcomeLabels=['0','1'])
    for gatestring, counts in rows:
        dset.add_count_dict(gatestring, counts)
    dset.done_adding_data()          # must be static before insertion
    multiDS[label] = dset            # dictionary-style insertion

nStrs = len(multiDS)
dslabels = list(multiDS.keys())
print("MultiDataSet has %d gate strings and DataSet labels %s" % (nStrs, dslabels))

#Three equivalent ways to iterate: over labels, over values, over (label, value) pairs.
for dslabel in multiDS:
    ds = multiDS[dslabel]
    print("Empty string data for %s = " % dslabel, ds[()])

for ds in multiDS.values():
    print("Gx string data (no label) =", ds[('Gx',)])

for dslabel,ds in multiDS.items():
    print("GxGy string data for %s =" % dslabel, ds[('Gx','Gy')])

#Aggregate the contained datasets into one "summed" DataSet.
dsSum = multiDS.get_datasets_aggregate('myDS','myDS2')
print("\nSummed data:")
print(dsSum)

# In[13]:

#A MultiDataSet can also be loaded from a single text file; each contained
# DataSet gets its own group of columns.
multi_dataset_txt = \
"""## Columns = DS0 0 count, DS0 1 count, DS1 0 frequency, DS1 count total
{} 0 100 0 100
Gx 10 90 0.1 100
GxGy 40 60 0.4 100
Gx^4 20 80 0.2 100
"""

with open("tutorial_files/TinyMultiDataset.txt","w") as output:
    output.write(multi_dataset_txt)
multiDS_fromFile = pygsti.io.load_multidataset("tutorial_files/TinyMultiDataset.txt",
                                               cache=False)

print("\nLoaded from file:\n")
print(multiDS_fromFile)

# In[ ]: