#!/usr/bin/env python
# coding: utf-8

# # Accessing Three Channel (FLORT) Fluorescence and Backscatter Data from the OOI Raw Data Server

# The example code provided below shows a pathway for downloading and converting the raw FLORT data (recorded in
# ASCII format) into a usable form for further processing and analysis. The data is accessible from the
# [OOI Raw Data Server](https://rawdata.oceanobservatories.org/files/). For this demonstration we are using data
# from the Spring 2016 deployment of the
# [Oregon Shelf Surface Mooring (CE02SHSM)](https://rawdata.oceanobservatories.org/files/CE02SHSM/D00003/cg_data/dcl27/flort/).
# 
# Before proceeding, you need to obtain a copy of the cgsn_parsers modules used below. Using the Anaconda python
# distribution and the conda-forge channel, you can install these modules via:
# 
# ```bash
# # Via conda
# conda install -c conda-forge cgsn-parsers
# 
# # Or via pip if not using Anaconda
# pip install git+https://bitbucket.org/ooicgsn/cgsn-parsers
# ```
# 
# See the [README](https://bitbucket.org/cwingard/ooiea-data-examples) in this repo for further information.

# In[1]:

# Load the required python modules
import requests
import numpy as np
import pandas as pd

from bokeh.plotting import figure, show
from bokeh.palettes import Colorblind as palette
from bokeh.io import output_notebook

# In[2]:

# Load the parser for the FLORT data. It reads in the DCL-logged raw data file and converts that data to a
# Bunch class data object.
from cgsn_parsers.parsers.parse_flort import Parser

# In[3]:

# Coastal Endurance Oregon Shelf Surface Mooring NSIF (7 meters) FLORT data from June 1, 2016
baseurl = "https://rawdata.oceanobservatories.org/files/CE02SHSM/D00003/cg_data/dcl27/flort/"
fname = "20160601.flort.log"

# initialize the Parser object for the FLORT
flort = Parser(baseurl + fname)
r = requests.get(flort.infile, verify=True)

# In[4]:

# The raw data is available in the raw data object of the parser class.
flort.raw = r.content.decode('utf-8').splitlines()
flort.raw[:5]

# In[5]:

# The parser class method parse_data converts the raw data into a parsed Bunch class data object
flort.parse_data()
flort.data.keys()

# In[6]:

# Almost every dataset will include multiple sources of timing data. We always use the data logger date/time string
# (dcl_date_time_string), converted to an epoch time stamp (seconds since 1970-01-01 UTC), as this time source is
# directly tied to the GPS-provided time. This converted value is called 'time' in all of the datasets created by
# the cgsn_parsers. The source date/time string and any other time sources included in the dataset are also provided.
flort.data.time[:5]

# From here, you can save the data to disk as a JSON formatted data file if you so desire. We use this method to
# store the parsed data files locally for all further processing.
# 
# ```python
# # write the resulting Bunch object via the toJSON method to a JSON
# # formatted data file (note, no pretty-printing, keeping things compact)
# with open(outfile, 'w') as f:
#     f.write(flort.data.toJSON())
# ```
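# If you later want to read one of those saved files back in, a minimal sketch (assuming `outfile` points at a
# JSON file written as shown above) would be along these lines:
# 
# ```python
# import json
# 
# # reload the saved JSON data as a dictionary and rebuild a dataframe from it
# with open(outfile) as f:
#     data = json.load(f)
# df = pd.DataFrame(data)
# ```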
# We are going to proceed, instead, by converting the data into a [Pandas](https://github.com/pandas-dev/pandas)
# dataframe.

# In[7]:

# Convert the data into a Pandas dataframe for further analysis.
df = pd.DataFrame(flort.data)
df['dt_utc'] = pd.to_datetime(df.time, unit='s')

# use the time variable to set the index
df.set_index('dt_utc', drop=False, inplace=True)

# In[8]:

# Show the contents of the dataframe
df.shape, df.columns

# In[9]:

df.tail(5)

# In[10]:

# Provide a simple plot of a day's worth of data
output_notebook()

# make a list of our columns
cols = ['raw_signal_cdom', 'raw_signal_chl', 'raw_signal_beta']
colors = palette[3]

# make the figure
p = figure(x_axis_type="datetime", title="Normalized Raw FLORT Data -- Bursts",
           width=800, height=500)
p.xaxis.axis_label = 'Date and Time'
p.yaxis.axis_label = 'Counts'

# loop through our columns and colours
for c, cname in zip(colors, cols):
    p.line(df.index, ((df[cname] - df[cname].mean()) / df[cname].std()), color=c, legend_label=cname)

show(p)

# In[11]:

# The FLORT data is collected in a burst mode (~1 Hz data sampled for 3 minutes every 15 minutes). We're going to
# take a median average of each burst to clean up variability in the data created by the movement of the NSIF
# relative to the water column, and to make the final data files smaller and easier to work with.
df['burst'] = (df['time'].diff() > 300).cumsum()   # find when the timedelta is greater than 300 s and increment the group
bursts = df.groupby('burst').aggregate(np.median)  # aggregate each group via median averaging
bursts['dt_utc'] = pd.to_datetime(bursts.time, unit='s')

# use the time variable to set the index
bursts.set_index('dt_utc', drop=False, inplace=True)
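# The trick in the cell above is the combination of `diff` and `cumsum`: every time the gap between consecutive
# timestamps exceeds 300 seconds, the boolean comparison flips to True and the running sum increments, giving each
# burst its own integer label. A small, self-contained sketch (using made-up sample times, not FLORT data)
# illustrates the idea:
# 
# ```python
# # two short "bursts" of 1 Hz samples separated by a ~15 minute gap
# t = pd.Series([0.0, 1.0, 2.0, 900.0, 901.0, 902.0])
# burst_id = (t.diff() > 300).cumsum()
# # burst_id holds 0, 0, 0, 1, 1, 1 -- the first (NaN) difference compares as False,
# # so the first burst is labeled 0 and the label increments at each long gap
# ```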
# In[12]:

# make the figure
p = figure(x_axis_type="datetime", title="Normalized Raw FLORT Data -- Averaged",
           width=800, height=500)
p.xaxis.axis_label = 'Date and Time'
p.yaxis.axis_label = 'Counts'

# loop through our columns and colours
for c, cname in zip(colors, cols):
    p.line(bursts.index, ((bursts[cname] - bursts[cname].mean()) / bursts[cname].std()), color=c, legend_label=cname)
    #p.line(bursts.index, bursts[cname], color=c, legend_label=cname)  # or plot it regularly

show(p)

# The following two functions, and the implementation below them, take the work from the examples above and combine
# it into a simple routine we can use to access, download and initially process the FLORT data for the month of
# June (change the example regex to get whatever data it is you are after).

# In[13]:

# Add some additional modules
from bs4 import BeautifulSoup
import re

# Function to create a list of the data files of interest on the raw data server
def list_files(url, tag=''):
    page = requests.get(url).text
    soup = BeautifulSoup(page, 'html.parser')
    pattern = re.compile(str(tag))
    return [node.get('href') for node in soup.find_all('a', text=pattern)]

# Function to download a file, parse it, apply median-averaging to the bursts and create a final dataframe.
def process_file(file):
    # Initialize the parser, then download and parse the data file
    flort = Parser(baseurl + file)
    r = requests.get(flort.infile, verify=True)
    flort.raw = r.content.decode('utf-8').splitlines()
    flort.parse_data()

    # Convert the parsed data to a dataframe
    df = pd.DataFrame(flort.data)
    if df.empty:
        return None

    # Set the burst index and create the median-averaged burst dataframe
    df['burst'] = (df['time'].diff() > 300).cumsum()   # use a boolean counter to find when the timedelta is greater than 300 s
    bursts = df.groupby('burst').aggregate(np.median)  # aggregate each group via median averaging
    bursts['dt_utc'] = pd.to_datetime(bursts.time, unit='s')

    # use the time variable to set the index
    bursts.set_index('dt_utc', drop=False, inplace=True)

    # Clean up the dataframe and return the results
    bursts = bursts.drop(columns=['measurement_wavelength_beta', 'measurement_wavelength_cdom',
                                  'measurement_wavelength_chl'])
    return bursts

# In[14]:

# Create a list of the files from June using a simple regex as the tag to discriminate the files
files = list_files(baseurl, '201606[0-9]{2}.flort.log')

# Process the data files for June and concatenate them into a single dataframe
frames = [process_file(f) for f in files]
june = pd.concat(frames)

# In[15]:

# Plot the burst-averaged data for the month of June 2016.
# make the figure
p = figure(x_axis_type="datetime", title="Normalized Raw FLORT Data -- June 2016",
           width=800, height=500)
p.xaxis.axis_label = 'Date and Time'
p.yaxis.axis_label = 'Counts'

# loop through our columns and colours
for c, cname in zip(colors, cols):
    p.line(june.index, ((june[cname] - june[cname].mean()) / june[cname].std()), color=c, legend_label=cname)
    #p.line(june.index, june[cname], color=c, legend_label=cname)  # or plot it regularly

show(p)

# At this point, you have the option to save the data, or to apply the processing routines available in pyseas and
# cgsn_processing to convert the data from raw engineering units to scientific units using the calibration
# coefficients that are available online. An example of how those steps work is available [here](insert link).

# In[ ]:
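# Purely as an illustrative sketch (this is not the pyseas/cgsn_processing implementation), ECO-series sensors
# like the FLORT are commonly converted with value = scale_factor * (counts - dark_counts). The dark counts and
# scale factors below are made-up placeholder numbers, not the actual CE02SHSM calibration coefficients, and the
# output column names are simply examples.
chl_dark, chl_scale = 45, 0.0120       # placeholder dark counts and scale factor (ug/L per count)
cdom_dark, cdom_scale = 48, 0.0900     # placeholder dark counts and scale factor (ppb per count)
beta_dark, beta_scale = 47, 1.9e-06    # placeholder dark counts and scale factor (m-1 sr-1 per count)

# apply the linear conversion to the burst-averaged June data created above
june['example_chl'] = chl_scale * (june['raw_signal_chl'] - chl_dark)
june['example_cdom'] = cdom_scale * (june['raw_signal_cdom'] - cdom_dark)
june['example_beta'] = beta_scale * (june['raw_signal_beta'] - beta_dark)
june[['example_chl', 'example_cdom', 'example_beta']].describe()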