#!/usr/bin/env python # coding: utf-8 # # NAIP Segmentation using Scikit image Quickshift, SLIC algorithms # https://opensourceoptions.com/blog/python-geographic-object-based-image-analysis-geobia/ # # This notebook needs at least 16GB RAM to run # In[1]: import numpy as np from osgeo import gdal from skimage import exposure from skimage.segmentation import quickshift, slic import time import scipy import fsspec # #### Download data so we have a local copy # In[2]: naip_fn = 'https://mghp.osn.xsede.org/rsignellbucket1/obia/m_4107027_se_19_060_20210904.tif' # In[3]: fs_https = fsspec.filesystem('https') # In[4]: fs_https.info(naip_fn) # In[5]: get_ipython().run_cell_magic('time', '', "fs_https.download(naip_fn, 'naip.tif')\n") # #### Load the data # In[6]: naip_fn = 'naip.tif' # In[7]: get_ipython().run_cell_magic('time', '', "driverTiff = gdal.GetDriverByName('GTiff')\nnaip_ds = gdal.Open(naip_fn)\nnbands = naip_ds.RasterCount\nband_data = []\nprint('bands', naip_ds.RasterCount, 'rows', naip_ds.RasterYSize, 'columns',\n naip_ds.RasterXSize)\nfor i in range(1, nbands+1):\n band = naip_ds.GetRasterBand(i).ReadAsArray()\n band_data.append(band)\nband_data = np.dstack(band_data)\n") # #### Scale image values from 0.0 - 1.0 # In[8]: img = exposure.rescale_intensity(band_data) # #### Run the segmentation using SLIC # In[9]: get_ipython().run_cell_magic('time', '', 'segments = slic(img, n_segments=500000, compactness=0.1)\n') # #### Spectral Properties of Segments # # Now we need to describe each segment based on it’s spectral properties because the spectral properties are the variables that will classify each segment as a land cover type. # # First of all, write a function that, given an array of pixel values, will calculate the min, max, mean, variance, skewness, and kurtosis for each band. The code below takes all the pixels in a segment and calculates statistics for each band, saving them in the features variable, which is returned. Next, get the pixel data and save the returned features. I describe this process below. # In[10]: def segment_features(segment_pixels): features = [] npixels, nbands = segment_pixels.shape for b in range(nbands): stats = scipy.stats.describe(segment_pixels[:, b]) band_stats = list(stats.minmax) + list(stats)[2:] if npixels == 1: # in this case the variance = nan, change it 0.0 band_stats[3] = 0.0 features += band_stats return features # In object-based image analysis each segment represents an object. Objects represent buildings, roads, trees, fields or pieces of those features, depending on how the segmentation is done. # # The code below gets a list of the segment ID numbers. Then sets up a list for the statistics describing each object (i.e. segment ID) returned from the segment_features function (above). The pixels for each segment are identified and passed to segment_features, which returns the statistics describing the spectral properties of the segment/object. # # Statistics are saved to the objects list and the object_id is stored in a separate list. # In[11]: get_ipython().run_cell_magic('time', '', 'segment_ids = np.unique(segments)\nprint(len(segment_ids))\n') # The following cell loops over each segment. It seems to only require 2.5GB RAM # In[12]: get_ipython().run_cell_magic('time', '', 'objects = []\nobject_ids = []\nfor id in segment_ids[:80]: # test with a few segments\n segment_pixels = img[segments == id]\n object_features = segment_features(segment_pixels)\n objects.append(object_features)\n object_ids.append(id)\n') # Okay, let's parallelize! 
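# A quick sanity check on the test results (a sketch added here, not part of the original
# notebook): `scipy.stats.describe` yields six statistics per band (min, max, mean, variance,
# skewness, kurtosis), so each object should have `nbands * 6` features, i.e. 24 for 4-band
# NAIP imagery. The name `objects_arr` is hypothetical.

# In[ ]:


objects_arr = np.array(objects)  # one row of features per test segment
print(objects_arr.shape)         # expect (80, nbands * 6)
assert objects_arr.shape == (len(object_ids), nbands * 6)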
# Okay, let's parallelize!
# 
# First we define a function that takes the segment ID as input and returns the stats for that segment:

# In[13]:


def get_features(id):
    segment_pixels = img[segments == id]
    return segment_features(segment_pixels)


# Verify that it takes the same amount of time and gives the same results:

# In[14]:


get_ipython().run_cell_magic('time', '', 'objects2 = [get_features(id) for id in segment_ids[:80]]\n')


# In[15]:


assert objects == objects2


# #### Use a Dask Bag to parallelize the loop

# In[16]:


from dask.distributed import Client


# In[17]:


client = Client(n_workers=4, threads_per_worker=1)


# In[18]:


client


# In[19]:


client.cluster.workers


# In[20]:


import dask.bag as db


# In[21]:


get_ipython().run_cell_magic('time', '', 'b = db.from_sequence(segment_ids[:80], npartitions=4)\nb1 = b.map(get_features).compute()\n')


# #### Try scattering the image and segment data to the workers

# In[ ]:


get_ipython().run_cell_magic('time', '', 'scattered_img = client.scatter(img, broadcast=True)\nscattered_segments = client.scatter(segments, broadcast=True)\n')


# In[ ]:


def get_features(id, img=scattered_img, segments=scattered_segments):
    segment_pixels = img[segments == id]
    return segment_features(segment_pixels)


# In[ ]:


get_ipython().run_cell_magic('time', '', 'b1 = b.map(get_features).compute()\n')
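# The redefinition of `get_features` above captures the scattered Futures as default
# arguments. As an alternative sketch (an assumption added here, not part of the original
# notebook), the Futures can be passed to `map` explicitly, so that the scheduler substitutes
# the broadcast arrays on each worker. The names `get_features_scattered`, `b2`, and
# `objects3` are hypothetical.

# In[ ]:


def get_features_scattered(id, img, segments):
    # img and segments arrive as the full NumPy arrays already resident on each worker
    segment_pixels = img[segments == id]
    return segment_features(segment_pixels)


b2 = db.from_sequence(segment_ids[:80], npartitions=4)
objects3 = b2.map(get_features_scattered,
                  img=scattered_img, segments=scattered_segments).compute()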