#!/usr/bin/env python # coding: utf-8 # # NAIP Segmentation using Scikit image Quickshift, SLIC algorithms # https://opensourceoptions.com/blog/python-geographic-object-based-image-analysis-geobia/ # # This notebook needs at least 16GB RAM to run # In[1]: import numpy as np from osgeo import gdal from skimage import exposure from skimage.segmentation import quickshift, slic import time import scipy import fsspec # #### Download data so we have a local copy # In[2]: naip_fn = 'https://mghp.osn.xsede.org/rsignellbucket1/obia/m_4107027_se_19_060_20210904.tif' # In[3]: fs_https = fsspec.filesystem('https') # In[4]: fs_https.info(naip_fn) # In[5]: get_ipython().run_cell_magic('time', '', "fs_https.download(naip_fn, 'naip.tif')\n") # #### Load the data # In[6]: naip_fn = 'naip.tif' # In[7]: get_ipython().run_cell_magic('time', '', "driverTiff = gdal.GetDriverByName('GTiff')\nnaip_ds = gdal.Open(naip_fn)\nnbands = naip_ds.RasterCount\nband_data = []\nprint('bands', naip_ds.RasterCount, 'rows', naip_ds.RasterYSize, 'columns',\n naip_ds.RasterXSize)\nfor i in range(1, nbands+1):\n band = naip_ds.GetRasterBand(i).ReadAsArray()\n band_data.append(band)\nband_data = np.dstack(band_data)\n") # #### Scale image values from 0.0 - 1.0 # In[8]: img = exposure.rescale_intensity(band_data) # #### Run the segmentation using SLIC # In[9]: get_ipython().run_cell_magic('time', '', 'segments = slic(img, n_segments=500000, compactness=0.1)\n') # #### Spectral Properties of Segments # # Now we need to describe each segment based on it’s spectral properties because the spectral properties are the variables that will classify each segment as a land cover type. # # First of all, write a function that, given an array of pixel values, will calculate the min, max, mean, variance, skewness, and kurtosis for each band. The code below takes all the pixels in a segment and calculates statistics for each band, saving them in the features variable, which is returned. Next, get the pixel data and save the returned features. I describe this process below. # In[10]: def segment_features(segment_pixels): features = [] npixels, nbands = segment_pixels.shape for b in range(nbands): stats = scipy.stats.describe(segment_pixels[:, b]) band_stats = list(stats.minmax) + list(stats)[2:] if npixels == 1: # in this case the variance = nan, change it 0.0 band_stats[3] = 0.0 features += band_stats return features # In object-based image analysis each segment represents an object. Objects represent buildings, roads, trees, fields or pieces of those features, depending on how the segmentation is done. # # The code below gets a list of the segment ID numbers. Then sets up a list for the statistics describing each object (i.e. segment ID) returned from the segment_features function (above). The pixels for each segment are identified and passed to segment_features, which returns the statistics describing the spectral properties of the segment/object. # # Statistics are saved to the objects list and the object_id is stored in a separate list. # In[11]: get_ipython().run_cell_magic('time', '', 'segment_ids = np.unique(segments)\nprint(len(segment_ids))\n') # The following cell loops over each segment. It seems to only require 2.5GB RAM # In[12]: get_ipython().run_cell_magic('time', '', 'objects = []\nobject_ids = []\nfor id in segment_ids[:80]: # test with a few segments\n segment_pixels = img[segments == id]\n object_features = segment_features(segment_pixels)\n objects.append(object_features)\n object_ids.append(id)\n') # Okay, let's parallelize! 
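# A quick sanity check on the test results (a sketch added here, not part of the original
# notebook): `scipy.stats.describe` yields six statistics per band (min, max, mean, variance,
# skewness, kurtosis), so each object should have `nbands * 6` features, i.e. 24 for 4-band
# NAIP imagery. The name `objects_arr` is hypothetical.

# In[ ]:


objects_arr = np.array(objects)  # one row of features per test segment
print(objects_arr.shape)         # expect (80, nbands * 6)
assert objects_arr.shape == (len(object_ids), nbands * 6)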
# Okay, let's parallelize!
# 
# First we define a function that takes the segment ID as input and returns the stats for that segment:

# In[13]:


def get_features(id):
    segment_pixels = img[segments == id]
    return segment_features(segment_pixels)


# Verify that it takes the same amount of time and gives the same results:

# In[14]:


get_ipython().run_cell_magic('time', '', 'objects2 = [get_features(id) for id in segment_ids[:80]]\n')


# In[15]:


assert objects == objects2


# #### Use a Dask Bag to parallelize the loop

# In[16]:


from dask.distributed import Client


# In[17]:


client = Client(n_workers=4, threads_per_worker=1)


# In[18]:


client


# In[19]:


client.cluster.workers


# In[20]:


import dask.bag as db


# In[21]:


get_ipython().run_cell_magic('time', '', 'b = db.from_sequence(segment_ids[:80], npartitions=4)\nb1 = b.map(get_features).compute()\n')


# #### Try scattering the image and segment data to the workers

# In[ ]:


get_ipython().run_cell_magic('time', '', 'scattered_img = client.scatter(img, broadcast=True)\nscattered_segments = client.scatter(segments, broadcast=True)\n')


# In[ ]:


def get_features(id, img=scattered_img, segments=scattered_segments):
    segment_pixels = img[segments == id]
    return segment_features(segment_pixels)


# In[ ]:


get_ipython().run_cell_magic('time', '', 'b1 = b.map(get_features).compute()\n')
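# The redefinition of `get_features` above captures the scattered Futures as default
# arguments. As an alternative sketch (an assumption added here, not part of the original
# notebook), the Futures can be passed to `map` explicitly, so that the scheduler substitutes
# the broadcast arrays on each worker. The names `get_features_scattered`, `b2`, and
# `objects3` are hypothetical.

# In[ ]:


def get_features_scattered(id, img, segments):
    # img and segments arrive as the full NumPy arrays already resident on each worker
    segment_pixels = img[segments == id]
    return segment_features(segment_pixels)


b2 = db.from_sequence(segment_ids[:80], npartitions=4)
objects3 = b2.map(get_features_scattered,
                  img=scattered_img, segments=scattered_segments).compute()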