#!/usr/bin/env python
# coding: utf-8

# # Accessing Three Channel (FLORT) Fluorescence and Backscatter Data from the OOI Raw Data Server

# The example code provided below shows a pathway for downloading and converting the raw FLORT
# data (recorded in ASCII format) into a usable form for further processing and analysis. The data
# is accessible from the [OOI Raw Data Server](https://rawdata.oceanobservatories.org/files/). For
# this demonstration we are using data from the Spring 2016 deployment of the
# [Oregon Shelf Surface Mooring (CE02SHSM)](https://rawdata.oceanobservatories.org/files/CE02SHSM/D00003/cg_data/dcl27/flort/).
#
# Before proceeding, you need to obtain a copy of the cgsn_parsers modules used below. Using the
# Anaconda python distribution and the conda-forge channel, you can install these modules via:
#
# ```bash
# # Via conda
# conda install -c conda-forge cgsn_parsers
#
# # Or via pip if not using Anaconda
# pip install git+https://bitbucket.org/ooicgsn/cgsn-parsers
# ```
#
# See the [README](https://bitbucket.org/cwingard/ooiea-data-examples) in this repo for further
# information on how to set up an environment for working with the OOI data.

# In[1]:

# Load the required python modules
import requests
import pandas as pd
import xarray as xr

from bokeh.models import Range1d, LinearAxis
from bokeh.plotting import figure, show
from bokeh.palettes import Colorblind as palette
from bokeh.io import output_notebook

import warnings
warnings.filterwarnings('ignore')

# In[2]:

# Load the parser for the FLORT data. Reads in the DCL logged raw data file and converts that
# data to a Bunch class data object.
from cgsn_parsers.parsers.parse_flort import Parser

# In[3]:

# Coastal Endurance Oregon Shelf Surface Mooring NSIF (7 meters) FLORT data from June 1, 2016
baseurl = "https://rawdata.oceanobservatories.org/files/CE02SHSM/D00003/cg_data/dcl27/flort/"
fname = "20160601.flort.log"

# initialize the Parser object for the FLORT
flort = Parser(baseurl + fname)
r = requests.get(flort.infile, verify=True)

# In[4]:

# Assign the downloaded data to the raw attribute of the parser object.
flort.raw = r.content.decode('utf-8').splitlines()
flort.raw[:5]  # print a snippet of the raw data

# In[5]:

# The parser class method parse_data converts the raw data into a parsed Bunch class data object
flort.parse_data()
flort.data.keys()  # print the resulting dictionary keys in the data object

# Almost every EA dataset will include multiple sources of timing data. We always use the data
# logger date/time string (dcl_date_time_string), converted to an Epoch time stamp (seconds since
# 1970-01-01 UTC), as this time source is directly tied to GPS time. This converted Epoch time
# stamp is called 'time' in all of the datasets created by the cgsn_parsers. The source date/time
# string, and any other time sources included in the dataset, are also provided in the raw format
# recorded in the data file.

# With the data parsed, you can save the data to disk as a JSON formatted data file if you so
# desire. We use this method to store the parsed data files locally for all further processing.
#
# ```python
# # write the resulting Bunch object via the toJSON method to a JSON
# # formatted data file (note, no pretty-printing, keeping things compact)
# with open(outfile, 'w') as f:
#     f.write(flort.data.toJSON())
# ```
#
# We are going to proceed, instead, by converting the data into a
# [pandas](https://github.com/pandas-dev/pandas) dataframe and then an
# [xarray](http://xarray.pydata.org/en/stable/index.html) dataset for the following steps.
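# (For reference, a JSON file written as above can be read back in later with the standard json
# module. This is a minimal sketch, not part of the original workflow; `outfile` is the same
# hypothetical path used in the snippet above.)
#
# ```python
# # reload a previously saved, parsed data file into a pandas dataframe
# # (outfile is a hypothetical path to a file written via flort.data.toJSON())
# import json
# with open(outfile) as f:
#     df = pd.DataFrame(json.load(f))
# ```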
# In[6]:

# Convert the data into a pandas dataframe and then an xarray dataset for further analysis.
df = pd.DataFrame(flort.data)
df['time'] = pd.to_datetime(df.time, unit='s')

# use the time variable to set the index
df.set_index('time', drop=False, inplace=True)
ds = df.to_xarray()

# In[7]:

ds  # print a summary of the dataset

# In[8]:

# Provide a simple plot of a day's worth of data
output_notebook()

# make a list of our columns
cols = ['raw_signal_cdom', 'raw_signal_chl', 'raw_signal_beta']
colors = palette[3]

# make the figure
p = figure(x_axis_type="datetime", title="Raw FLORT Data -- Bursts 2016-06-01",
           width=850, height=500)
p.xaxis.axis_label = 'Date and Time'
p.yaxis.axis_label = 'Raw CDOM [counts]'
p.y_range = Range1d(start=50, end=100)
p.extra_y_ranges['cb'] = Range1d(start=0, end=2000)
p.add_layout(LinearAxis(y_range_name='cb', axis_label='Raw Chlorophyll and Backscatter [counts]'), 'right')
p.line(ds.time.values, ds[cols[0]].values, color=colors[0], legend_label=cols[0])
p.line(ds.time.values, ds[cols[1]].values, color=colors[1], legend_label=cols[1], y_range_name='cb')
p.line(ds.time.values, ds[cols[2]].values, color=colors[2], legend_label=cols[2], y_range_name='cb')
p.toolbar_location = 'above'
show(p)

# In[9]:

# The FLORT data is collected in a burst mode (~1 Hz data sampled for 3 minutes every 15 minutes).
# We're going to take a median average of each burst to clean up variability in the data created
# by the movement of the NSIF relative to the water column, and to make the final data files
# smaller and easier to work with.
bursts = ds.resample(time='15Min').median()

# In[10]:

# make the figure
p = figure(x_axis_type="datetime", title="Raw FLORT Data -- Averaged 2016-06-01",
           width=850, height=500)
p.xaxis.axis_label = 'Date and Time'
p.yaxis.axis_label = 'Raw CDOM [counts]'
p.y_range = Range1d(start=50, end=100)
p.extra_y_ranges['cb'] = Range1d(start=0, end=2000)
p.add_layout(LinearAxis(y_range_name='cb', axis_label='Raw Chlorophyll and Backscatter [counts]'), 'right')
p.line(bursts.time.values, bursts[cols[0]].values, color=colors[0], legend_label=cols[0])
p.line(bursts.time.values, bursts[cols[1]].values, color=colors[1], legend_label=cols[1], y_range_name='cb')
p.line(bursts.time.values, bursts[cols[2]].values, color=colors[2], legend_label=cols[2], y_range_name='cb')
p.toolbar_location = 'above'
show(p)

# The following two functions, and the implementation below, take the work from the examples above
# and combine it into a simple routine we can use to access, download, and initially process the
# FLORT data for the month of June.

# In[11]:

# Add some additional modules
from bs4 import BeautifulSoup
import re

# Function to create a list of the data files of interest on the raw data server
def list_files(url, tag=''):
    page = requests.get(url).text
    soup = BeautifulSoup(page, 'html.parser')
    pattern = re.compile(str(tag))
    return [node.get('href') for node in soup.find_all('a', text=pattern)]
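# A note on the median averaging used above and in `process_file` below: `resample` creates a bin
# for every 15 minute interval in the record, so any missed bursts show up as all-NaN rows in the
# result. If those gaps are a problem for later steps, one option (an assumption about your needs,
# not part of the original routine) is to drop the empty bins:
#
# ```python
# # drop any 15 minute bins where every variable is NaN (i.e., no burst was recorded)
# bursts = bursts.dropna('time', how='all')
# ```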
# Function to download a file, parse it, apply median-averaging to the bursts, and create a
# final dataset.
def process_file(baseurl, file):
    # Initialize the parser, then download and parse the data file
    flort = Parser(baseurl + file)
    r = requests.get(flort.infile, verify=True)
    flort.raw = r.content.decode('utf-8').splitlines()
    flort.parse_data()

    # Convert the parsed data to a dataframe and then a dataset
    df = pd.DataFrame(flort.data)
    df['time'] = pd.to_datetime(df.time, unit='s')

    # use the time variable to set the index
    df.set_index('time', drop=False, inplace=True)
    ds = df.to_xarray()

    # Create the median-averaged burst dataset
    bursts = ds.resample(time='15Min').median()

    # Return the results
    return bursts

# In[12]:

# Create a list of the files from June using a simple regex as a tag to discriminate the files
files = list_files(baseurl, '201606[0-9]{2}.flort.log')

# Process the data files for June and concatenate them into a single dataset
frames = [process_file(baseurl, f) for f in files]
june = xr.concat(frames, 'time')

# In[13]:

# Plot the burst averaged data for the month of June 2016.

# make the figure
p = figure(x_axis_type="datetime", title="Raw FLORT Data -- June 2016",
           width=850, height=500)
p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = 'Raw CDOM [counts]'
p.y_range = Range1d(start=50, end=100)
p.extra_y_ranges['cb'] = Range1d(start=0, end=3500)
p.add_layout(LinearAxis(y_range_name='cb', axis_label='Raw Chlorophyll and Backscatter [counts]'), 'right')
p.line(june.time.values, june[cols[0]].values, color=colors[0], legend_label=cols[0])
p.line(june.time.values, june[cols[1]].values, color=colors[1], legend_label=cols[1], y_range_name='cb')
p.line(june.time.values, june[cols[2]].values, color=colors[2], legend_label=cols[2], y_range_name='cb')
p.toolbar_location = 'above'
show(p)

# At this point, you have the option to save the data, or to apply the processing routines
# available in pyseas and cgsn_processing to convert the data from raw engineering units to
# scientific units using the calibration coefficients that are available online.

# In[14]:

# Convert the time from a datetime object (in nanoseconds) to seconds since 1970-01-01 and save
# the dataset to a netCDF file
june['time'] = june.time.values.astype(float) / 10.0**9
june.to_netcdf('C:\\ooi\\ce02shsm_june2016_raw_flort.nc')
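# To pick the data back up in a later session, the saved file can be reopened with xarray and the
# Epoch time stamps converted back to datetimes. A minimal sketch, assuming the file path used
# above:
#
# ```python
# # reopen the saved netCDF file and restore a datetime-based time coordinate
# june = xr.open_dataset('C:\\ooi\\ce02shsm_june2016_raw_flort.nc')
# june['time'] = pd.to_datetime(june.time.values, unit='s')
# ```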