!apt-get install python3-rtree
!pip install pystac
!pip install geopandas
!pip install rasterio
import os
os.environ["CURL_CA_BUNDLE"] = "/etc/ssl/certs/ca-certificates.crt"
from matplotlib import pyplot as plt
import numpy as np
from pprint import pprint
import rasterio
from rasterio.windows import Window
import geopandas as gpd
from pyproj import CRS
from pystac import (Catalog, CatalogType, Item, Asset, LabelItem, Collection)
# overwriting STAC_IO read method to handle http/s as per https://pystac.readthedocs.io/en/latest/concepts.html#using-stac-io
from urllib.parse import urlparse
import requests
from pystac import STAC_IO
def my_read_method(uri):
parsed = urlparse(uri)
if parsed.scheme.startswith('http'):
return requests.get(uri).text
else:
return STAC_IO.default_read_text_method(uri)
STAC_IO.read_text_method = my_read_method
# load our training and test catalogs
train1_cat = Catalog.from_file('https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_1/catalog.json')
train2_cat = Catalog.from_file('https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_2/catalog.json')
test_cat = Catalog.from_file('https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/test/catalog.json')
# look at what's in train1_cat (aka train_tier_1)
train1_cat.describe()
* <Catalog id=train_tier_1> * <Collection id=acc> * <Item id=665946> * <LabelItem id=665946-labels> * <Item id=a42435> * <LabelItem id=a42435-labels> * <Item id=ca041a> * <LabelItem id=ca041a-labels> * <Item id=d41d81> * <LabelItem id=d41d81-labels> * <Collection id=mon> * <Item id=401175> * <LabelItem id=401175-labels> * <Item id=493701> * <LabelItem id=493701-labels> * <Item id=207cc7> * <LabelItem id=207cc7-labels> * <Item id=f15272> * <LabelItem id=f15272-labels> * <Collection id=ptn> * <Item id=abe1a3> * <LabelItem id=abe1a3-labels> * <Item id=f49f31> * <LabelItem id=f49f31-labels> * <Collection id=kam> * <Item id=4e7c7f> * <LabelItem id=4e7c7f-labels> * <Collection id=dar> * <Item id=a017f9> * <LabelItem id=a017f9-labels> * <Item id=b15fce> * <LabelItem id=b15fce-labels> * <Item id=353093> * <LabelItem id=353093-labels> * <Item id=f883a0> * <LabelItem id=f883a0-labels> * <Item id=42f235> * <LabelItem id=42f235-labels> * <Item id=0a4c40> * <LabelItem id=0a4c40-labels> * <Collection id=znz> * <Item id=33cae6> * <LabelItem id=33cae6-labels> * <Item id=3b20d4> * <LabelItem id=3b20d4-labels> * <Item id=076995> * <LabelItem id=076995-labels> * <Item id=75cdfa> * <LabelItem id=75cdfa-labels> * <Item id=9b8638> * <LabelItem id=9b8638-labels> * <Item id=06f252> * <LabelItem id=06f252-labels> * <Item id=c7415c> * <LabelItem id=c7415c-labels> * <Item id=aee7fd> * <LabelItem id=aee7fd-labels> * <Item id=3f8360> * <LabelItem id=3f8360-labels> * <Item id=425403> * <LabelItem id=425403-labels> * <Item id=bd5c14> * <LabelItem id=bd5c14-labels> * <Item id=e52478> * <LabelItem id=e52478-labels> * <Item id=bc32f1> * <LabelItem id=bc32f1-labels> * <Collection id=nia> * <Item id=825a50> * <LabelItem id=825a50-labels>
# make a dict of all collections witinin train1 catalog
cols = {cols.id:cols for cols in train1_cat.get_children()}
cols
{'acc': <Collection id=acc>, 'dar': <Collection id=dar>, 'kam': <Collection id=kam>, 'mon': <Collection id=mon>, 'nia': <Collection id=nia>, 'ptn': <Collection id=ptn>, 'znz': <Collection id=znz>}
# look at the acc collection, as a dict
cols['acc'].to_dict()
{'description': 'Tier 1 training data from acc', 'extent': {'spatial': {'bbox': [[-0.20863145179911316, 5.573262528211078, -0.18948660187120017, 5.593203677296213]]}, 'temporal': {'interval': [['2019-10-29T00:00:00Z', None]]}}, 'id': 'acc', 'license': 'various', 'links': [{'href': './665946/665946.json', 'rel': 'item', 'type': 'application/json'}, {'href': './665946-labels/665946-labels.json', 'rel': 'item', 'type': 'application/json'}, {'href': './a42435/a42435.json', 'rel': 'item', 'type': 'application/json'}, {'href': './a42435-labels/a42435-labels.json', 'rel': 'item', 'type': 'application/json'}, {'href': './ca041a/ca041a.json', 'rel': 'item', 'type': 'application/json'}, {'href': './ca041a-labels/ca041a-labels.json', 'rel': 'item', 'type': 'application/json'}, {'href': './d41d81/d41d81.json', 'rel': 'item', 'type': 'application/json'}, {'href': './d41d81-labels/d41d81-labels.json', 'rel': 'item', 'type': 'application/json'}, {'href': 'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_1/acc/collection.json', 'rel': 'self', 'type': 'application/json'}, {'href': '../catalog.json', 'rel': 'root', 'type': 'application/json'}, {'href': '../catalog.json', 'rel': 'parent', 'type': 'application/json'}], 'stac_extensions': ['label'], 'stac_version': '0.8.1'}
# iterate through all the items within acc collection and print their ids
for i in cols['acc'].get_all_items():
print(i.id)
665946 665946-labels a42435 a42435-labels ca041a ca041a-labels d41d81 d41d81-labels
# for all items within acc col, either load and display label geojson with geopandas or raster metadata with rasterio
for i in cols['acc'].get_all_items():
print(i.id,'\n----------------------------')
pprint(i.properties)
if 'label' in i.id:
gdf = gpd.read_file(i.make_asset_hrefs_absolute().assets['labels'].href)
gdf.plot()
plt.show()
else:
print('raster metadata:')
pprint(rasterio.open(i.make_asset_hrefs_absolute().assets['image'].href).meta)
print('\n----------------------------')
665946 ---------------------------- {'area': 'acc', 'datetime': '2018-08-05 00:00:00Z', 'license': 'CC BY 4.0'} raster metadata: {'count': 4, 'crs': CRS.from_epsg(32630), 'driver': 'GTiff', 'dtype': 'uint8', 'height': 150147, 'nodata': None, 'transform': Affine(0.02001518707102818, 0.0, 805429.9166958937, 0.0, -0.02001518707102818, 624939.1898949385), 'width': 84466} ---------------------------- 665946-labels ---------------------------- {'area': 'acc', 'datetime': '2018-08-05 00:00:00Z', 'label:description': 'Geojson building labels for scene 665946', 'label:overviews': [{'counts': [{'count': 7308, 'name': 'yes'}], 'property_key': ['building']}], 'label:properties': ['building'], 'label:type': 'vector', 'license': 'ODbL-1.0'}
---------------------------- a42435 ---------------------------- {'area': 'acc', 'datetime': '2018-10-06 00:00:00Z', 'license': 'CC BY 4.0'} raster metadata: {'count': 4, 'crs': CRS.from_epsg(32630), 'driver': 'GTiff', 'dtype': 'uint8', 'height': 39162, 'nodata': None, 'transform': Affine(0.032029411960186015, 0.0, 804676.2688712641, 0.0, -0.03202926727370731, 621829.9693785439), 'width': 57540} ---------------------------- a42435-labels ---------------------------- {'area': 'acc', 'datetime': '2018-10-06 00:00:00Z', 'label:description': 'Geojson building labels for scene a42435', 'label:overviews': [{'counts': [{'count': 6647, 'name': 'yes'}], 'property_key': ['building']}], 'label:properties': ['building'], 'label:type': 'vector', 'license': 'ODbL-1.0'}
---------------------------- ca041a ---------------------------- {'area': 'acc', 'datetime': '2018-11-12 00:00:00Z', 'license': 'CC BY 4.0'} raster metadata: {'count': 4, 'crs': CRS.from_epsg(32630), 'driver': 'GTiff', 'dtype': 'uint8', 'height': 77778, 'nodata': None, 'transform': Affine(0.035820209694930036, 0.0, 807207.717115721, 0.0, -0.035820613560994544, 620903.4357975163), 'width': 65882} ---------------------------- ca041a-labels ---------------------------- {'area': 'acc', 'datetime': '2018-11-12 00:00:00Z', 'label:description': 'Geojson building labels for scene ca041a', 'label:overviews': [{'counts': [{'count': 10194, 'name': 'yes'}], 'property_key': ['building']}], 'label:properties': ['building'], 'label:type': 'vector', 'license': 'ODbL-1.0'}
---------------------------- d41d81 ---------------------------- {'area': 'acc', 'datetime': '2019-07-07 00:00:00Z', 'license': 'CC BY 4.0'} raster metadata: {'count': 4, 'crs': CRS.from_epsg(32630), 'driver': 'GTiff', 'dtype': 'uint8', 'height': 42719, 'nodata': None, 'transform': Affine(0.05179965064903244, 0.0, 809267.1099320438, 0.0, -0.05180039261972288, 618981.5459276063), 'width': 40868} ---------------------------- d41d81-labels ---------------------------- {'area': 'acc', 'datetime': '2019-07-07 00:00:00Z', 'label:description': 'Geojson building labels for scene d41d81', 'label:overviews': [{'counts': [{'count': 9436, 'name': 'yes'}], 'property_key': ['building']}], 'label:properties': ['building'], 'label:type': 'vector', 'license': 'ODbL-1.0'}
----------------------------
# open one image item
SCENE_ID = 'ca041a'
one_item = cols['acc'].get_item(id=SCENE_ID)
one_item.to_dict()
{'assets': {'image': {'href': 'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_1/acc/ca041a/ca041a.tif', 'title': 'GeoTIFF', 'type': 'image/tiff; application=geotiff; profile=cloud-optimized'}}, 'bbox': [-0.22707525357332697, 5.585527399115482, -0.20581415249279408, 5.610742610987594], 'collection': 'acc', 'geometry': {'coordinates': [[[-0.2260939759101167, 5.607821019807083], [-0.22707525357332697, 5.609567361411101], [-0.2257626190986551, 5.610742610987594], [-0.2209214783972656, 5.60396659440964], [-0.2209297943096631, 5.603475955578037], [-0.21938590601191368, 5.601711342600872], [-0.21863322644066166, 5.601370284670147], [-0.2171984310079642, 5.60126443910544], [-0.2150344772406196, 5.602172946869177], [-0.21221192884842804, 5.603687126475405], [-0.2071666235973881, 5.60628622311988], [-0.20581415249279408, 5.604666197948947], [-0.2074253572000044, 5.603584221065274], [-0.2083544460457665, 5.6019965375946645], [-0.20906008314381597, 5.600996885039098], [-0.21070656970592522, 5.599526807751497], [-0.2114122068039735, 5.597897962116837], [-0.212416616071534, 5.595984932725826], [-0.2138692108714978, 5.593295072530488], [-0.21395275670010877, 5.593002662130355], [-0.21397782044869207, 5.590391854986293], [-0.2141449121059085, 5.590032607923272], [-0.21550670911225214, 5.588150738133833], [-0.21721104401589492, 5.585527399115482], [-0.21795907856051402, 5.590627248068187], [-0.21855973661963968, 5.59108215821591], [-0.2203440443835102, 5.593219794249854], [-0.22225201704190803, 5.59588742268891], [-0.2234356667466541, 5.598325387752418], [-0.22523764092403067, 5.601536258406711], [-0.22571463408862905, 5.6031880680693025], [-0.2259010426101557, 5.606526622234949], [-0.2260939759101167, 5.607821019807083]]], 'type': 'Polygon'}, 'id': 'ca041a', 'links': [{'href': '../collection.json', 'rel': 'collection', 'type': 'application/json'}, {'href': 'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_1/acc/ca041a/ca041a.json', 'rel': 'self', 'type': 'application/json'}, {'href': '../../catalog.json', 'rel': 'root', 'type': 'application/json'}, {'href': '../collection.json', 'rel': 'parent', 'type': 'application/json'}], 'properties': {'area': 'acc', 'datetime': '2018-11-12T00:00:00Z', 'license': 'CC BY 4.0'}, 'stac_version': '0.8.1', 'type': 'Feature'}
# load raster for this item
rst = rasterio.open(one_item.assets['image'].href)
rst.meta
{'count': 4, 'crs': CRS.from_epsg(32630), 'driver': 'GTiff', 'dtype': 'uint8', 'height': 77778, 'nodata': None, 'transform': Affine(0.035820209694930036, 0.0, 807207.717115721, 0.0, -0.035820613560994544, 620903.4357975163), 'width': 65882}
# check raster resolution
rst.res
(0.035820209694930036, 0.035820613560994544)
# make a windowed read of this raster and reshape into a displayable 4-d array (RGB+alpha channel)
# more on windowed reads with rasterio: https://rasterio.readthedocs.io/en/stable/topics/windowed-rw.html#windows
win_sz = 1024
window = Window(rst.meta['width']//2,rst.meta['height']//2,win_sz,win_sz) # 1024x1024 window starting at center of raster
win_arr = rst.read(window=window)
win_arr = np.moveaxis(win_arr,0,2)
plt.figure(figsize=(10,10))
plt.imshow(win_arr)
<matplotlib.image.AxesImage at 0x7f11336ace80>
# create a Polygon box to chip into training image and label
from shapely.geometry import box
win_box = box(*rasterio.windows.bounds(window, rst.meta['transform']))
CRS.from_epsg(4326)
<Geographic 2D CRS: EPSG:4326> Name: WGS 84 Axis Info [ellipsoidal]: - Lat[north]: Geodetic latitude (degree) - Lon[east]: Geodetic longitude (degree) Area of Use: - name: World - bounds: (-180.0, -90.0, 180.0, 90.0) Datum: World Geodetic System 1984 - Ellipsoid: WGS 84 - Prime Meridian: Greenwich
rst.meta['crs']
CRS.from_epsg(32630)
# make box into geodataframe for reprojection and later intersect operation
win_box_gdf = gpd.GeoDataFrame(geometry=[win_box], crs=rst.meta['crs'])
win_box_gdf = win_box_gdf.to_crs(CRS.from_epsg(4326))
print(win_box_gdf.crs)
win_box_gdf.plot()
epsg:4326
<matplotlib.axes._subplots.AxesSubplot at 0x7f11331c3748>
one_item_label = cols['acc'].get_item(id=SCENE_ID+'-labels')
one_item_label.to_dict()
{'assets': {'labels': {'href': 'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_1/acc/ca041a-labels/ca041a.geojson', 'type': 'application/geo+json'}}, 'bbox': [-0.22707525357332697, 5.585527399115482, -0.20581415249279408, 5.610742610987594], 'geometry': {'coordinates': [[[-0.2260939759101167, 5.607821019807083], [-0.22707525357332697, 5.609567361411101], [-0.2257626190986551, 5.610742610987594], [-0.2209214783972656, 5.60396659440964], [-0.2209297943096631, 5.603475955578037], [-0.21938590601191368, 5.601711342600872], [-0.21863322644066166, 5.601370284670147], [-0.2171984310079642, 5.60126443910544], [-0.2150344772406196, 5.602172946869177], [-0.21221192884842804, 5.603687126475405], [-0.2071666235973881, 5.60628622311988], [-0.20581415249279408, 5.604666197948947], [-0.2074253572000044, 5.603584221065274], [-0.2083544460457665, 5.6019965375946645], [-0.20906008314381597, 5.600996885039098], [-0.21070656970592522, 5.599526807751497], [-0.2114122068039735, 5.597897962116837], [-0.212416616071534, 5.595984932725826], [-0.2138692108714978, 5.593295072530488], [-0.21395275670010877, 5.593002662130355], [-0.21397782044869207, 5.590391854986293], [-0.2141449121059085, 5.590032607923272], [-0.21550670911225214, 5.588150738133833], [-0.21721104401589492, 5.585527399115482], [-0.21795907856051402, 5.590627248068187], [-0.21855973661963968, 5.59108215821591], [-0.2203440443835102, 5.593219794249854], [-0.22225201704190803, 5.59588742268891], [-0.2234356667466541, 5.598325387752418], [-0.22523764092403067, 5.601536258406711], [-0.22571463408862905, 5.6031880680693025], [-0.2259010426101557, 5.606526622234949], [-0.2260939759101167, 5.607821019807083]]], 'type': 'Polygon'}, 'id': 'ca041a-labels', 'links': [{'href': '../collection.json', 'rel': 'collection', 'type': 'application/json'}, {'href': 'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_1/acc/ca041a-labels/ca041a-labels.json', 'rel': 'self', 'type': 'application/json'}, {'href': '../../catalog.json', 'rel': 'root', 'type': 'application/json'}, {'href': '../collection.json', 'rel': 'parent', 'type': 'application/json'}], 'properties': {'area': 'acc', 'datetime': '2018-11-12T00:00:00Z', 'label:description': 'Geojson building labels for scene ca041a', 'label:overviews': [{'counts': [{'count': 10194, 'name': 'yes'}], 'property_key': ['building']}], 'label:properties': ['building'], 'label:type': 'vector', 'license': 'ODbL-1.0'}, 'stac_extensions': ['label'], 'stac_version': '0.8.1', 'type': 'Feature'}
scene_labels_gdf = gpd.read_file(one_item_label.assets['labels'].href)
# plot both gdf to visualize where win_box is within the whole image scene
fig, ax = plt.subplots(1,1,figsize=(15,15))
scene_labels_gdf.plot(ax=ax)
win_box_gdf.plot(ax=ax, color='red')
<matplotlib.axes._subplots.AxesSubplot at 0x7f113356ec88>
# get the label geoms that intersect with our win_box
gdf_chip = gpd.sjoin(scene_labels_gdf, win_box_gdf, how='inner', op='intersects')
gdf_chip.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f11331be208>
# create a list of (geom, burn_val) to pass into rasterio's rasterize function, see: https://rasterio.readthedocs.io/en/latest/api/rasterio.features.html#rasterio.features.rasterize
burn_val = 255
shapes = [(geom, burn_val) for geom in gdf_chip.geometry]
shapes[0][0]
# get the affine transform object for our win_box to pass into rasterize()
print(win_box_gdf.bounds.values[0])
chip_tfm = rasterio.transform.from_bounds(*win_box_gdf.bounds.values[0], win_sz, win_sz)
print(chip_tfm)
[-0.21648885 5.59777824 -0.21615647 5.59811124] | 0.00, 0.00,-0.22| | 0.00,-0.00, 5.60| | 0.00, 0.00, 1.00|
from rasterio.features import rasterize
label_arr = rasterize(shapes, (win_sz, win_sz), transform=chip_tfm, dtype='uint8')
plt.imshow(label_arr)
<matplotlib.image.AxesImage at 0x7f11335f8518>
fig, (ax1, ax2) = plt.subplots(1,2,figsize=(20,10))
ax1.imshow(win_arr)
ax2.imshow(win_arr)
ax2.imshow(label_arr, alpha=0.5)
<matplotlib.image.AxesImage at 0x7f11334fc240>
plt.imshow(win_arr)
<matplotlib.image.AxesImage at 0x7f11335fee80>
chip_tfm
Affine(3.24587556346464e-07, 0.0, -0.21648885023580522, 0.0, -3.251996054240078e-07, 5.598111242040538)
def save_geochip(arr, chip_tfm, save_fn='test', crs='EPSG:4326', dtype='uint8'):
im = (arr).astype(dtype)
# check im shape, number of channels and expand into (H,W,C) if needed
if len(im.shape) == 3: num_ch = im.shape[-1]
else:
num_ch = 1
im = np.expand_dims(im, -1)
with rasterio.open(f'{save_fn}.tif', 'w', driver='GTiff',
height=im.shape[0], width=im.shape[1],
count=num_ch, dtype=im.dtype, crs=crs, transform=chip_tfm, compress='LZW') as dst: #TODO: look into issue using pyproj CRS object here
for ch in range(num_ch):
dst.write(im[:,:,ch], indexes=ch+1) #indexes start at 1
save_geochip(win_arr, chip_tfm)
save_geochip(label_arr, chip_tfm, 'label')
Check in QGIS that our saved chips are properly georeferenced against OSM baselayer:
Looks good!
https://rasterio.readthedocs.io/en/latest/quickstart.html#saving-raster-data
https://rasterio.readthedocs.io/en/latest/topics/windowed-rw.html#writing
# affine transform matrix for the 1024x1024 chip created above is used to convert from display to geo coords
chip_tfm
Affine(3.24587556346464e-07, 0.0, -0.21648885023580522, 0.0, -3.251996054240078e-07, 5.598111242040538)
# the inverse affine tfm converts from geo coords to display
~chip_tfm
Affine(3080832.8306110487, 0.0, 666965.9572677072, 0.0, -3075034.4813492666, 17214385.099703625)
# grab a sample geometry from within the chip area
sample_geom = shapes[0][0]
sample_geom
# get geo coords of the exterior vertices of this geometry
xs,ys = sample_geom.exterior.coords.xy
xs,ys
(array('d', [-0.2164429, -0.2164399, -0.2163469, -0.2163474, -0.2163521, -0.2163547, -0.2164429]), array('d', [5.5979852, 5.598095, 5.5980925, 5.598077, 5.5980771, 5.5979828, 5.5979852]))
# use the inverse affine tfm to convert our geo coords to display x,y values in 2 respective lists
display_xs = []
display_ys = []
for x,y in zip(xs,ys):
display_coord = ~chip_tfm * (x,y)
display_xs.append(display_coord[0])
display_ys.append(display_coord[1])
display_xs, display_ys
([141.56499504309613, 150.80749353487045, 437.3249467817368, 435.7845303664217, 421.30461606255267, 413.2944507029606, 141.56499504309613], [387.58362075313926, 49.9448347017169, 57.632420904934406, 105.29545536637306, 104.98795191943645, 394.96370350942016, 387.58362075313926])
# scatter plot to check
# axes start from bottom left and display coords can be negative values (meaning it's outside the visible area in the chip)
plt.scatter(display_xs, display_ys)
<matplotlib.collections.PathCollection at 0x7f1133666320>
# plot on top of our earlier image and label chips to doublecheck in a more obvious way
fig, (ax1, ax2) = plt.subplots(1,2,figsize=(20,10))
ax1.imshow(win_arr)
ax2.imshow(win_arr)
ax2.scatter(display_xs, display_ys, color='red')
ax2.imshow(label_arr, alpha=0.5)
<matplotlib.image.AxesImage at 0x7f11323a6fd0>