The Western Antarctic Peninsula is a rapidly changing ecosystem. To quantify these changes and prepare for their global biogeochemical and ecological costs, we must look at long-term trends. Here is an analysis of 15 years of inorganic nutrient data taken from Palmer LTER January cruises. Each cruise follows a regional grid along the Western Antarctic Peninsula (wAP) as follows:
Image(filename='palmerGrid.jpg')
The software in this notebook is intended to standardize the data and expedite analysis and high-quality research. Specifically, this IPython notebook loads the Palmer inorganic nutrient dataset into a Pandas dataframe, cleans and standardizes measurements against a simplified grid, provides annual depth profiles by station, and interpolates station data between and across years to get a glimpse of regional inorganic nutrient patterns. CTD data are also used to estimate Mixed Layer Depth (MLD) with a gradient and a threshold method. The combined NO2+NO3 is integrated from the surface to the MLD for each station annually to quantify the nutrients removed from the water column by phytoplankton, including diatoms. From there, we can employ the stoichiometric ratio of marine biota -- the Redfield Ratio -- to calculate how much CO2 is taken from the water column by this process, called Net Community Production (NCP).
import os
import json
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from mpl_toolkits.basemap import Basemap
import seaborn
import colorsys
from scipy.interpolate import griddata
from scipy.integrate import simps
from numpy import trapz
from IPython.display import Image
%matplotlib inline
# Global variables
seaborn.set(font_scale=1.5)
plt.rcParams['figure.figsize'] = (15,9)
skip = -999 # skip value in the dataset
mark = ['o','p','d','v','s'] # markers for scatterplots
surfDepth = 10 #(m) depth of surface
maxDepth = 250
cMap = 'RdBu_r'
outOfGrid = ['Trap','Rothera','B','E','notFound','outGrid']
dsPath = 'data_sets/' # data set directory
Let's read the grid into a Pandas dataframe so we can work with it, and define a few functions so we can keep doing so.
def read_grd(fileIn,col):
if(os.path.isfile(fileIn)):
grid = pd.read_csv(fileIn,sep=',',skiprows=1,names=col,na_filter=True)
grid = grid[grid.stationID.isin(outOfGrid)==False]
return grid
else:
print (fileIn,"is either unreadable or DNE.")
raise SystemExit(0)
def draw_wAP(zoom=False,lat=-66.90,lon=-67.05):
if(zoom!=1):
map = Basemap(width=1800000,height=1000000,
resolution='l',projection='stere',\
lat_ts=50,lat_0=lat,lon_0=lon)
elif(zoom==1):
map = Basemap(width=320000,height=180000,
resolution='l',projection='stere',\
lat_ts=50,lat_0=-64.90,lon_0=-66.05)
map.drawparallels(np.arange(-90.,90.,1.),labels=[False,False,False,True],
color='w',dashes=[100,.0001],zorder=0)
map.drawmeridians(np.arange(-180.,181.,1.),labels=[True,False,False,False],
color='w',dashes=[100,.0001],zorder=0)
map.drawcoastlines()
map.fillcontinents()
map.drawmapboundary()
return map
def plt_grd(grid_df):
map = draw_wAP(zoom=False,lat=-65.40,lon=-67.05)
x1 = grid_df['lon'].values
y1 = grid_df['lat'].values
labels = grid_df['stationID']
xS, yS = map(x1, y1)
map.scatter(xS, yS, marker='o',s=26,color='k')
for label, xpt, ypt in zip(labels, xS, yS):
plt.text(xpt-70000, ypt-5000, label,size=8)
plt.title('Palmer LTER Regional Grid')
plt.show()
# files
gridInfile = os.path.join(dsPath+'Basic Western Antarctic Peninsula Survey Grid.csv')
gridCol = ["studyName","gridLine","gridStation","stationID","lat","lon","gridCode"]
# first read in the grid standard
grid = read_grd(gridInfile,gridCol)
plt_grd(grid)
Something is not right with the station names, so let's fix it!
newGrid = read_grd(gridInfile,gridCol)
newGrid.gridStation = newGrid.gridStation.astype(int).apply(lambda x: '{0:0>3}'.format(x))
newGrid.gridLine = newGrid.gridLine.astype(int).apply(lambda x: '{0:0>3}'.format(x))
newGrid['stationID'] = newGrid.gridLine.map(str) +"."+ newGrid.gridStation.map(str)
plt_grd(newGrid)
Now let's take a look at the cruise measurements from 2000-onwards.
# CSV file from Datazoo
inOrgInfile = os.path.join(dsPath+'Dissolved Inorganic Nutrients.csv')
# Column formats for inorganic nutrients
inOrgCol = ['studyName','eventNum','cast','bottle','datetime','stationID','gridLine',\
'gridStation','lat','lon','percentIrradiance','depth','PO4','SiO4','NO2','NO3','NH4','Nitrite',\
'notes']
def read_inOrg(infile,cols):
# Import the files
if(os.path.isfile(infile)):
inOrg = pd.read_csv(infile,skiprows=1,names=cols)
# Munging...
inOrg = inOrg[(inOrg['lon'].notnull()) & (inOrg['lat'].notnull())&
(inOrg['depth']!=skip) & (inOrg['depth'].notnull())&
(inOrg['datetime'].notnull())]
# Time keeping...
inOrg =inOrg[inOrg.datetime>'1999-12-31 00:00:00'] # start date
inOrg.datetime=pd.to_datetime(inOrg.datetime,format='%Y-%m-%d %H:%M:%S',exact=True)
inOrg['year']=pd.DatetimeIndex(inOrg['datetime']).year
inOrg['month']=pd.DatetimeIndex(inOrg['datetime']).month
inOrg.lat=abs(inOrg.lat.map('{:,.5f}'.format).astype(float))*-1 # convert all coords to south
inOrg.lon=abs(inOrg.lon.map('{:,.5f}'.format).astype(float))*-1 # convert all coords to west
inOrg = inOrg.reset_index()
print("The imported database contains",len(inOrg.index),"records.")
return inOrg
else:
        print (infile,"is either unreadable or DNE.")
raise SystemExit(0)
inOrg = read_inOrg(inOrgInfile,inOrgCol)
inOrg.head()
The imported database contains 9101 records.
index | studyName | eventNum | cast | bottle | datetime | stationID | gridLine | gridStation | lat | ... | depth | PO4 | SiO4 | NO2 | NO3 | NH4 | Nitrite | notes | year | month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7681 | LMG00-01 | 65 | NaN | 23 | 2000-01-09 16:10:00 | NaN | 601.16 | 38.56 | -64.93363 | ... | 3.0 | 1.38 | 82.85 | 0.15 | 20.18 | 0.51 | -999 | NaN | 2000 | 1 |
1 | 7682 | LMG00-01 | 65 | NaN | 21 | 2000-01-09 16:10:00 | NaN | 601.16 | 38.56 | -64.93363 | ... | 3.5 | 1.38 | 82.97 | 0.15 | 20.04 | 0.59 | -999 | NaN | 2000 | 1 |
2 | 7683 | LMG00-01 | 65 | NaN | 19 | 2000-01-09 16:10:00 | NaN | 601.16 | 38.56 | -64.93363 | ... | 6.3 | 1.37 | 83.42 | 0.16 | 20.08 | 0.46 | -999 | NaN | 2000 | 1 |
3 | 7684 | LMG00-01 | 65 | NaN | 17 | 2000-01-09 16:10:00 | NaN | 601.16 | 38.56 | -64.93363 | ... | 10.1 | 1.38 | 83.53 | 0.16 | 20.41 | 0.45 | -999 | NaN | 2000 | 1 |
4 | 7685 | LMG00-01 | 65 | NaN | 15 | 2000-01-09 16:10:00 | NaN | 601.16 | 38.56 | -64.93363 | ... | 12.3 | 1.44 | 83.32 | 0.15 | 21.00 | 0.81 | -999 | NaN | 2000 | 1 |
5 rows × 22 columns
Let's take a peek at the spatiotemporal distribution of measurements in the grid...
def plt_interannual(df,zoom=False):
# Initialize the plot
map = draw_wAP(zoom)
    yearList = np.array(df.year.unique()) # List of years from the LTER
yearVar = len(yearList) # How long has this been going on?
colorSwatch = cm.YlGnBu(np.linspace(0, 1, yearVar)) # Colors for each year
for i in range(yearVar):
querry = df[df.year==yearList[i]] # Querry the database per year
x1 = querry.lon.values
y1 = querry.lat.values
x, y = map(x1, y1) # Convert the lat and lon to a Basemap-readable format.
map.scatter(x, y, color=colorSwatch[i], marker='o',s=24,label=np.asarray(querry.year)[0])
plt.legend(scatterpoints=1,
loc='lower right',
ncol=1,
fontsize=10)
plt.title('Interannual Distribution of Palmer LTER Inorganic Nutrients')
plt.show()
plt_interannual(inOrg)
It looks like all stations are lined up, with the exception of some outside of the station lines... But let's not make any assumptions just yet; let's take a closer look.
plt_interannual(inOrg,zoom=True)
We see that although these points were taken in close proximity, their coordinates aren't flush to the correct precision...
print("We also see that there are ",len(inOrg.lat.unique()),"and",len(inOrg.lon.unique()),"unique latitude and longitude coordiantes,")
print("and that the stationID's are not standardized:\n",inOrg.stationID.unique()[:14])
print("We want to see",len(grid.stationID),"coordinate pairs and stationIDs...")
We also see that there are 903 and 940 unique latitude and longitude coordinates, and that the stationIDs are not standardized: [nan '500.06' '600.04' '600.06' '600.08' '600.1' '600.12' '600.14' '600.16' '600.18' '600.2' '600.22' '600.26' '500.22'] We want to see 120 coordinate pairs and stationIDs...
Before we can do any analysis, we need to wrangle the data into a comparable format...
This task can be accomplished by standardizing stationIDs and coordinates that fall within reasonable bounds of the idealized station coordinates. The function below compares each gritty coordinate pair with the idealized pairs and outputs a Pandas DF containing corrected coordinates (lat, lon) and stationID.
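As an aside, the same snapping could be sketched with a nearest-neighbor lookup. This is not the notebook's method -- the function below uses explicit lat/lon threshold boxes -- and the 0.1-degree cutoff here is an illustrative assumption (it also mixes lat and lon scales in one Euclidean distance):
# Hypothetical alternative sketch: snap each observation to the nearest
# idealized station with a KD-tree, rejecting matches beyond a crude cutoff.
from scipy.spatial import cKDTree
tree = cKDTree(newGrid[['lat','lon']].values) # idealized station coordinates
dist, idx = tree.query(inOrg[['lat','lon']].values) # nearest station per sample
snapped = newGrid.stationID.values[idx].copy()
snapped[dist > 0.1] = 'notFound' # 0.1 degrees: assumed, not the notebook's thresholds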
latThresh = 0.0645
lonThresh = 0.0750
def cord2stationID(df):
# Lets standardize the coordinates for SW
latIn = -1*abs(df.lat)
lonIn = -1*abs(df.lon)
# how many points are there?
x=len(latIn)
# List to hold corrected stations.
newStations = []
# Iterate through each lat lon pair in the list
for i in range(x):
# Find standard latitude that is closest to observed.
querry = newGrid[newGrid.lat<=latIn[i]+latThresh]
querry = querry[querry.lat>=latIn[i]-latThresh]
# If DNE...
if querry.empty:
newStations.append({'lat':latIn[i],'lon':lonIn[i],
'stationID':'notFound',
'gridLine':'notFound',
'stationLine':'notFound'})
        # If it does exist, look for the standard longitude closest to observed.
else:
querry = querry[querry.lon<=lonIn[i]+lonThresh]
querry = querry[querry.lon>=lonIn[i]-lonThresh]
qLen = len(querry.index)
# DNE...
if querry.empty:
newStations.append({'lat':latIn[i],'lon':lonIn[i],
'stationID':'notFound',
'gridLine':'notFound',
'stationLine':'notFound'})
            # If there is a perfect match
elif (qLen==1):
newStations.append({'lat':np.asarray(querry.lat, dtype=np.float)[0],
'lon':np.asarray(querry.lon, dtype=np.float)[0],
'stationID':np.asarray(querry.stationID, dtype=object)[0],
'gridLine':np.asarray(querry.gridStation, dtype=object)[0],
'stationLine':np.asarray(querry.gridLine, dtype=object)[0]})
# the list has multiple values
else:
qLon = querry.lon.values
qLat = querry.lat.values
qStation = querry.stationID.values
qGridLine = querry.gridLine.values
qStationLine = querry.gridStation.values
minDist = 1e6
# calculate which lat lon combo is closest to station values
for j in range(qLen):
lonDiff = abs(lonIn[i]-qLon[j])
latDiff = abs(latIn[i]-qLat[j])
sumDist = lonDiff+latDiff
                    if(sumDist<minDist):
                        minDist = sumDist # remember the closest station so far
                        mindex = j
# update the lists
newStations.append({'lat':qLat[mindex],'lon':qLon[mindex],
'stationID':qStation[mindex],
'gridLine':qGridLine[mindex],
'stationLine':qStationLine[mindex]})
# Station corrections done, update df with new values.
df[['lat','lon','stationID',
'gridLine','stationLine']] = pd.DataFrame(newStations)[['lat','lon','stationID',
'gridLine','stationLine']]
return df
How does this new cleaned data compare to the old?
inOrgClean = cord2stationID(inOrg)
map = draw_wAP()
querry = inOrgClean[inOrgClean['stationID'].isin(outOfGrid)==False]
x1 = querry.lon.values
y1 = querry.lat.values
x, y = map(x1, y1)
map.scatter(x, y, color='k', marker='o',s=24,linewidth=.08,alpha=.8,label='Confirmed Grid Station')
querry = inOrgClean[inOrgClean['stationID']=='notFound']
x2 = querry.lon.values
y2 = querry.lat.values
x, y = map(x2, y2)
map.scatter(x, y, marker='o',color='yellow',s=24,linewidth=.08,alpha=.8,label='Unmarked Station')
gridBounds = ['000.220','100.220','200.220','300.220','400.220','500.220','600.220']
gridLabels = newGrid[newGrid['stationID'].isin(gridBounds)]
x3 = gridLabels.lon.values
y3 = gridLabels.lat.values
xS, yS = map(x3, y3)
labels = gridLabels.gridLine.values.astype(int)
for label, xpt, ypt in zip(labels, xS, yS):
plt.text(xpt-84000, ypt+19000, label,size=14)
plt.legend(scatterpoints=1,
loc='lower right',
ncol=1,
fontsize=12)
plt.title('Standardized Grid')
plt.show()
plt_interannual(inOrgClean.loc[inOrgClean['stationID'].isin(outOfGrid)==False],zoom=True)
Looks good on the map! How about values?
inOrgClean.head()
index | studyName | eventNum | cast | bottle | datetime | stationID | gridLine | gridStation | lat | ... | PO4 | SiO4 | NO2 | NO3 | NH4 | Nitrite | notes | year | month | stationLine | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7681 | LMG00-01 | 65 | NaN | 23 | 2000-01-09 16:10:00 | 600.040 | 040 | 38.56 | -64.93333 | ... | 1.38 | 82.85 | 0.15 | 20.18 | 0.51 | -999 | NaN | 2000 | 1 | 600 |
1 | 7682 | LMG00-01 | 65 | NaN | 21 | 2000-01-09 16:10:00 | 600.040 | 040 | 38.56 | -64.93333 | ... | 1.38 | 82.97 | 0.15 | 20.04 | 0.59 | -999 | NaN | 2000 | 1 | 600 |
2 | 7683 | LMG00-01 | 65 | NaN | 19 | 2000-01-09 16:10:00 | 600.040 | 040 | 38.56 | -64.93333 | ... | 1.37 | 83.42 | 0.16 | 20.08 | 0.46 | -999 | NaN | 2000 | 1 | 600 |
3 | 7684 | LMG00-01 | 65 | NaN | 17 | 2000-01-09 16:10:00 | 600.040 | 040 | 38.56 | -64.93333 | ... | 1.38 | 83.53 | 0.16 | 20.41 | 0.45 | -999 | NaN | 2000 | 1 | 600 |
4 | 7685 | LMG00-01 | 65 | NaN | 15 | 2000-01-09 16:10:00 | 600.040 | 040 | 38.56 | -64.93333 | ... | 1.44 | 83.32 | 0.15 | 21.00 | 0.81 | -999 | NaN | 2000 | 1 | 600 |
5 rows × 23 columns
Now each point along the station lines has standardized coordinates and is separated from non-conventional station points!
Since this clean-up process is memory-intensive, we want to run it once. To save our progress we can save the cleaned up dataframe as a CSV...
Parameters:
df: Pandas dataframe
csvout: string (filename)
Returns:
a CSV of the Pandas dataframe written to the given path.
def convert2CSV(df,csvout):
    df.to_csv(csvout,sep=',',line_terminator='\n') # write the dataframe straight to disk
    print("Database transferred to csv...")
# Keep only the original data columns (drop index, year, month, and stationLine).
cleanDF = inOrgClean[['studyName','eventNum','cast','bottle','datetime','stationID','gridLine',
'gridStation','lat','lon','percentIrradiance','depth','PO4','SiO4','NO2',
'NO3','NH4','Nitrite','notes']]
# filenames
inOrgOutfile = os.path.join(dsPath+'inOrganicNutrientsClean.csv')
gridOutfile = os.path.join(dsPath+'Revised Basic Western Antarctic Peninsula Survey Grid.csv')
# Run it!
convert2CSV(cleanDF, inOrgOutfile)
convert2CSV(newGrid,gridOutfile)
Database transferred to csv... Database transferred to csv...
Now that we've preprocessed the data, we can finally start doing some analysis!
# Import the clean files
inOrgCleanFile = os.path.join(dsPath+'inOrganicNutrientsClean.csv')
inOrg = read_inOrg(inOrgCleanFile,inOrgCol)
The imported database contains 9101 records.
def vertical_profiles(inOrg,maxDepth,drawMLD=False,method='t_sig_down',byStation=False,station='200.220',byYear=True,year=2010):
    yearList = np.array(inOrg.year.unique()) # List of years from the LTER
colorSwatch = cm.YlGnBu(np.linspace(0, 1, len(yearList) )) # Colors for plotting
color = pd.DataFrame(colorSwatch,columns=['hue','lightness','saturation','other'])
color['year'] = yearList
scatMark = 25
# Search each station
if(byStation!=1):
for station in np.sort(pd.unique(inOrg['stationID'].loc[inOrg['stationID'].isin(outOfGrid)==False].values.ravel())):
stationData = inOrg[inOrg.stationID==station] # get all values in specific station for all stationVar
stationTime = stationData.datetime.unique() # all sample dates from the station
# Initiate the plot
fig,ax = plt.subplots(ncols=3,figsize=(14,6),sharey=True)
# label and limit the 3 axis
            xlabel = [r'PO$_{4}$ [$\mu$mol/L]',r'NO$_{3}$+NO$_{2}$ [$\mu$mol/L]',r'SiO$_{4}$ [$\mu$mol/L]']
for j in range(3):
ax[j].locator_params(axis = 'x',nbins=5)
ax[j].set_xlabel(xlabel[j])
ax[0].set_ylim([-abs(maxDepth),0])
ax[0].set_ylabel('Water column height [m]')
count = 0
# Plot each year
for jj in range(len(stationTime)):
target = stationData.loc[stationData.datetime==stationTime[jj]] # get only values from selected stationInterannualVar
                if(len(target.index)<=3): # skip casts with 3 or fewer samples
                    continue
else:
colorYear = (color[['hue','lightness','saturation']][color.year==np.asarray(target.year.unique(), dtype=np.float)[0]]).values
# For each chemical tracer
chem = ['PO4','Nitrite','SiO4']
for k in range(len(chem)):
df = target.loc[(target[chem[k]]!=skip) & (target[chem[k]].notnull())]
x = df[chem[k]].values
y = df['depth'].values*-1
if(x.any()):
labels = np.asarray(target.year, dtype=np.int)[0]
ax[k].scatter(x,y ,marker=mark[jj%5],s=scatMark,color=colorYear,zorder=1,linewidth=.2,label=labels)
ax[k].plot(x,y,color=colorYear[0],zorder=10)
ax[k].legend(scatterpoints=1,loc='lower left',ncol=1,fontsize=10)
if(drawMLD==1):
                                MLD = mld_df.loc[(mld_df.stationID==station) & (mld_df.year==labels)]
                                if (len(MLD)>=1):
                                    ax[k].axhline(y=MLD[method].iloc[0]*-1,linewidth=2,color=colorYear[0],alpha=0.8,zorder = 3)
count +=1
if(count==0):
plt.close()
fig.suptitle(r'Interannual Variability of Inorganic Nutrients at Station '+np.asarray(target.stationID)[0]+\
' ('+str(np.asarray(target.lat)[0])+','+str(np.asarray(target.lon)[0])+')',fontsize=22)
plt.show()
            plt.tight_layout()
# Or get profile of specified station...
else:
stationData = inOrg[inOrg.stationID==station] # get all values in specific station for all stationVar
stationTime = stationData.datetime.unique() # all sample dates from the station
# Initiate the plot
fig,ax = plt.subplots(ncols=3,figsize=(14,6),sharey=True)
# label and limit the 3 axis
        xlabel = [r'PO$_{4}$ [$\mu$mol/L]',r'NO$_{3}$+NO$_{2}$ [$\mu$mol/L]',r'SiO$_{4}$ [$\mu$mol/L]']
for j in range(3):
ax[j].locator_params(axis = 'x',nbins=5)
ax[j].set_xlabel(xlabel[j])
ax[0].set_ylim([-abs(maxDepth),0])
ax[0].set_ylabel('Water column height [m]')
count = 0
# Plot each year
for jj in range(len(stationTime)):
target = stationData.loc[stationData.datetime==stationTime[jj]] # get only values from selected stationInterannualVar
colorYear = (color[['hue','lightness','saturation']][color.year==np.asarray(target.year.unique(), dtype=np.float)[0]]).values
# For each chemical tracer
chem = ['PO4','Nitrite','SiO4']
for k in range(len(chem)):
df = target.loc[(target[chem[k]]!=skip) & (target[chem[k]].notnull())]
x = df[chem[k]].values
y = df['depth'].values*-1
if(x.any()):
labels = np.asarray(target.year, dtype=np.int)[0]
ax[k].scatter(x,y ,marker=mark[jj%5],s=scatMark,color=colorYear,zorder=1,linewidth=.2,label=labels)
ax[k].plot(x,y,color=colorYear[0],zorder=10)
ax[k].legend(scatterpoints=1,loc='lower left',ncol=1,fontsize=10)
if(drawMLD==1):
MLD = mld_df.loc[(mld_df.stationID==station) & (mld_df.year==labels)]
                        if (len(MLD)>=1):
ax[k].axhline(y=MLD[method].iloc[0]*-1,linewidth=2,color=colorYear[0],alpha=0.8,zorder = 3)
count +=1
if(count==0):
plt.close()
fig.suptitle(r'Interannual Variability of Inorganic Nutrients at Station '+np.asarray(target.stationID)[0]+\
' ('+str(np.asarray(target.lat)[0])+','+str(np.asarray(target.lon)[0])+')',fontsize=22)
plt.show()
    plt.tight_layout()
With vertical_profiles(), we can inspect how each station varies interannually with depth, or look at an individual station when byStation=True and station='stationID' are passed as arguments.
vertical_profiles(inOrg,maxDepth,byStation=True,station='200.100')
To save space, I only showed one, but we can print them all using vertical_profiles(inOrg,maxDepth). Now we know how each station changes with depth over time, but these depth profiles are not informative as standalone figures. To get more insight, let's compare each cruise across years and averaged among years. We can do so by interpolating between stations.
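The workhorse for this is scipy's griddata, which fits a cubic surface through scattered (lon, lat, value) triples onto a regular mesh. A minimal, self-contained illustration (the coordinates and values below are made up):
import numpy as np
from scipy.interpolate import griddata
# Five fictional surface samples, e.g. NO2+NO3 in umol/L
lon = np.array([-70.0, -69.0, -68.0, -67.0, -66.0])
lat = np.array([-67.0, -65.5, -66.0, -66.5, -65.0])
val = np.array([ 25.0,  19.0,  18.0,  22.0,  20.0])
xi = np.linspace(lon.min(), lon.max(), 50) # regular mesh in lon
yi = np.linspace(lat.min(), lat.max(), 50) # regular mesh in lat
zi = griddata((lon, lat), val, (xi[None,:], yi[:,None]), method='cubic') # (50,50) surface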
def interpolate(df,tracer,MLD=False):
# Querry the database and clean up
tracer_df = df.loc[(df[tracer]!=skip) &
(df[tracer].notnull()) &
(df['stationID'].isin(outOfGrid)==False)]
# tracer specific values like title, depth-dependence, and colorbar labels.
if(tracer=='Nitrite'):
title = 'Surface NO$_{3}$+NO$_{2}$'
tracer_df = tracer_df[tracer_df['depth']<=surfDepth]
cBarLabel = r'$\mu$mol/L'
elif(tracer=='PO4'):
title = 'Surface PO$_{4}$'
tracer_df = tracer_df[tracer_df['depth']<=surfDepth]
cBarLabel = r'$\mu$mol/L'
elif(tracer=='SiO4'):
title = 'Surface SiO$_{4}$'
tracer_df = tracer_df[tracer_df['depth']<=surfDepth]
cBarLabel = r'$\mu$mol/L'
elif(tracer=='Nuit_Deficiency'):
title = 'Net Community Production'
cBarLabel = r'mol C / m$^{2}$'
else:
title = 'Mixed Layer Depth'
tracer_df = tracer_df.loc[(tracer_df[tracer]<200) & (tracer_df[tracer]>0)]
cBarLabel = 'm'
# Interpolate and plot for each year
for year in pd.unique(tracer_df.year.values.ravel()):
querry = tracer_df[tracer_df['year']==year]
# Define the plotable elements
x = querry.lon.values
y = querry.lat.values
z = querry[tracer].values
# If it's not empty let's interpolate!
if(len(x)>5):
# plot data points.
#plt.scatter(x,y,facecolors='none', edgecolors='k',s=26, zorder=3)
if(MLD==True):
querr2 = mld_df[mld_df['year']==year]
x = querr2.lon.values
y = querr2.lat.values
z = querr2['clustered_MLD']
#plt.scatter(x,y, color='orange',s=26,zorder = 2)
title = 'Mixed Layer Depth'
cBarLabel = 'm'
# define grid.
xi = np.linspace(x.min(),x.max(),300)
yi = np.linspace(y.min(),y.max(),300)
# grid the data.
zi = abs(griddata((x, y), z, (xi[None,:], yi[:,None]), method='cubic'))
            if(tracer=='Nuit_Deficiency'): # use Redfield ratios to convert mmol N deficit to mol C
zi = (zi*(106./16.))/1000
# contour the gridded data.
plt.contourf(xi,yi,zi,12,cmap=cMap,zorder=1)
plt.colorbar(orientation='vertical').ax.set_xlabel(cBarLabel) # draw colorbar
plt.xlim(min(x),max(x))
plt.ylim(min(y),max(y))
plt.title('Interpolated '+title+' '+str(np.asarray(querry.year)[0])+' (%d points)' % len(x))
plt.show()
Combined surface NO2+NO3 represents the stock of nitrogen-based nutrients available to support phytoplankton growth. NO2+NO3 samples (hollow points) are used to generate an interpolation of nutrient distributions along the wAP each January. Notice any trends?
interpolate(inOrg,'Nitrite')
def interpolate_average(df,tracer):
# Querry the database and clean up
tracer_df = df.loc[(df[tracer]!=skip) & (df[tracer].notnull())& (df['stationID'].isin(outOfGrid)==False)]
# Title for plot
if(tracer=='Nitrite'):
title = 'Surface NO$_{2}$+NO$_{3}$'
tracer_df = tracer_df.loc[tracer_df['depth']<=surfDepth]
cBarLabel = r'$\mu$mol/L'
elif(tracer=='PO4'):
        title = 'Surface PO$_{4}$'
tracer_df = tracer_df.loc[tracer_df['depth']<=surfDepth]
cBarLabel = r'$\mu$mol/L'
elif(tracer=='SiO4'):
title = 'Surface SiO$_{4}$'
tracer_df = tracer_df.loc[tracer_df['depth']<=surfDepth]
cBarLabel = r'$\mu$mol/L'
elif(tracer=='Nuit_Deficiency'):
title = 'Net Community Production'
cBarLabel = r'mol C / m$^{2}$'
else:
title = 'MLD'
tracer_df = tracer_df.loc[(tracer_df[tracer]<200) & (tracer_df[tracer]>0)]
cBarLabel = 'm'
# Average surface values across years
stationList = tracer_df.stationID.unique()
lons = []
lats = []
nuits = []
for stationID in stationList:
querry= tracer_df[tracer_df['stationID']==stationID]
if(len(querry.year.unique())>=5):
lons.append(np.asarray(querry.lon)[0])
lats.append(np.asarray(querry.lat)[0])
nuits.append(abs(np.nanmean(querry[tracer])))
# define grid.
xi = np.linspace(min(lons),max(lons),360)
yi = np.linspace(min(lats),max(lats),360)
# grid the data.
zi = abs(griddata((lons, lats), nuits, (xi[None,:], yi[:,None]), method='cubic'))
    if(tracer=='Nuit_Deficiency'): # use Redfield ratios to convert mmol N deficit to mol C
zi = (zi*(106./16.))/1000
# contour the gridded data.
plt.contourf(xi,yi,zi,15,cmap=cMap)
plt.colorbar(orientation='vertical').ax.set_xlabel(cBarLabel) # draw colorbar
# plot data points.
#plt.scatter(lons,lats,facecolors='none', edgecolors='k',s=26)
plt.xlim(min(lons),max(lons))
plt.ylim(min(lats),max(lats))
plt.title(r'Summertime Averages of '+title+' (%d points)' % len(nuits))
plt.show()
interpolate_average(inOrg,'Nitrite')
We can leverage these combined NO2+NO3 measurements to calculate how much inorganic carbon has been taken out of the water column by phytoplankton. This biogeochemical quantity is called Net Community Production (NCP), and it can be obtained with help from a physical parameter -- namely, Mixed Layer Depth (MLD). The integrated deficit of combined NO2+NO3 down to the MLD represents the total nutrients removed from the homogeneous water column by phytoplankton. With this number, we can reverse-engineer how much CO2 has been sequestered using the stoichiometric ratio of elements found in marine biota, the Redfield Ratio.
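Concretely, with N(z) the combined NO2+NO3 concentration and z_MLD the mixed layer depth, the calculation implemented below in get_NCP() and compile_NCP() amounts to:
NCP_C ≈ (106/16) * [ z_MLD * N(z_MLD) - integral from 0 to z_MLD of N(z) dz ]
The first term is the nutrient inventory the mixed layer would hold at its deep (pre-bloom) concentration, the integral is the observed inventory, and 106/16 is the Redfield C:N ratio.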
Along with inorganic nutrients, CTD profiles are taken on the annual cruise along the wAP. Each profile is recorded on the way down (faster) and on the way back up (slower) to the ship. For this reason, MLD estimates are taken from both directions with two methods highlighted in Holte and Talley (2009): the threshold method of de Boyer Montegut et al. (2004) and the gradient method of Dong et al. (2008), each applied to temperature and potential density.
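As a minimal sketch of the two criteria on a synthetic density profile (the thresholds mirror the constants defined below; the real find_mld() additionally applies them to temperature and to both cast directions):
import numpy as np
depth = np.arange(0, 200, 2.0) # dbar, roughly meters near the surface
sigma = 27.0 + 0.5/(1 + np.exp(-(depth - 60)/5)) # toy pycnocline near 60 m
# Threshold method (de Boyer Montegut et al., 2004): first depth where density
# departs from the near-surface reference by more than 0.03 kg/m^3.
mld_threshold = depth[np.argmax(np.abs(sigma - sigma[0]) > 0.03)]
# Gradient method (Dong et al., 2008): first depth where the vertical density
# gradient exceeds 0.0005 (kg/m^3)/dbar.
slope = np.diff(sigma)/np.diff(depth)
mld_gradient = depth[1:][np.argmax(np.abs(slope) > 0.0005)]
print(mld_threshold, mld_gradient) # 48.0 and 36.0 for this toy profile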
# files
mldPath = 'package_650/**/*.std'
mldOutfile = os.path.join(dsPath+'Palmer_MLD.json')
# values for threshold (t_) and gradient(g_) MLD estimations
t_threshT = 0.2 # degrees C
t_threshS = 0.03 # kg/m^3
g_threshT = 0.005 # degrees C per dbar
g_threshS = 0.0005 # (kg/m^3) per dbar
def find_mld(path):
mld_list = [] # initialize a list for all dictionaries containing MLD estimates.
records = 0
for filename in glob.iglob(path, recursive=True):
records += 1
# First let's read each CTD profile into a pandas Dataframe
lines = [line.split() for line in open(filename,'r',errors="ignore")]
cols = [" ".join(line).lower() for line in lines[1:25] if len(line)<12]
rows = lines[len(cols)+1:]
df = pd.DataFrame(data=np.reshape(rows,(len(rows),len(cols))),columns=cols)
mld = {} # initialize a dictionary to store all the MLD estimates
        # add proper leading zeros to the line number
station,line = lines[0][5].split(".")
if("-" not in line):
mld['stationID']= station+"."+str("%03d" % (int(line),))
mld['year'] = '20'+filename.split('/')[1][:2]
else:
mld['stationID']= station+"."+str("%04d" % (int(line),))
mld['year'] = '20'+filename.split('/')[1][:2]
# Boolean switches
sFound = 0
t1Found = 0
t2Found = 0
pFound = 0
dFound = 0
        # check each column in the uploaded file for potential density, temps, pressure, and depth
for col in cols:
if(sFound==0 and 'sigma' in col):
sigmaCol = col
sFound = 1
elif(t1Found==0 and 'temperature' in col and 'potential' not in col):
tempCol = col
t1Found = 1
elif(t2Found==0 and 'temperature' in col and 'potential' not in col):
temp2Col = col
t2Found =1
elif('pressure' in col and pFound==0):
presCol = col
pFound = 1
elif('depth' in col and dFound==0):
depthCol = col
dFound = 1
# rename the columns
df = df.rename(columns={sigmaCol: 'sigma',
tempCol: 'temp1',
temp2Col: 'temp2',
presCol : 'pres',
depthCol : 'depth'})
for col in ['pres','sigma','temp1','temp2','depth']:
df[col] = pd.to_numeric(df[col], errors='coerce')
        # The measurement at 5 dbar is used as the reference value and to split
        # the down- and up-casts; now enter Holte and Talley (2009).
        ref = df.loc[df['pres']==5].index
        if(len(ref)==2):
            df = df[ref[0]:ref[1]]
            bottomUp = True
        elif(len(ref)==1):
            if(ref[0]<(len(df)/2)):
                df = df[ref[0]:]
                bottomUp = False
            else:
                df = df[:ref[0]]
                bottomUp = True
        else:
            bottomUp = False # no clean 5 dbar reference; treat as a downcast only
# Sigma Threshold
for isopyc in df['sigma']:
if (abs(isopyc-np.asarray(df['sigma'].head(1))[0]) > t_threshS):
mld['t_sig_down']= np.asarray(df['depth'].loc[df['sigma']==isopyc].values)[0]
break
if(bottomUp==True):
for isopyc in reversed(df['sigma']):
if (abs(isopyc-np.asarray(df['sigma'].tail(1))[0]) > t_threshS):
mld['t_sig_up']= np.asarray(df['depth'].loc[df['sigma']==isopyc].values)[0]
break
# Temp Threshold 1
for isotherm in df['temp1']:
if (abs(isotherm-np.asarray(df['temp1'].head(1))[0]) > t_threshT):
mld['t_t1_down']= np.asarray(df['depth'].loc[df['temp1']==isotherm].values)[0]
break
if(bottomUp==True):
for isotherm in reversed(df['temp1']):
if (abs(isotherm-np.asarray(df['temp1'].tail(1))[0]) > t_threshT):
mld['t_t1_up']= np.asarray(df['depth'].loc[df['temp1']==isotherm].values)[0]
break
# Calculate the finite difference slope
sigSlope = np.diff(df['sigma'])/np.diff(df['pres'])
t1Slope = np.diff(df['temp1'])/np.diff(df['pres'])
t2Slope = np.diff(df['temp2'])/np.diff(df['pres'])
ms = len(t1Slope)-1
for i in range(ms):
if(abs(t1Slope[i])>g_threshT):
mld['g_t1_down']=df['depth'].iloc[i+1]
break
if(bottomUp==True):
for i in reversed(range(ms)):
if(abs(t1Slope[i])>g_threshT):
mld['g_t1_up']=df['depth'].iloc[i+1]
break
for i in range(ms):
if(abs(sigSlope[i])>g_threshS):
mld['g_sig_down']=df['depth'].iloc[i+1]
break
if(bottomUp==True):
for i in reversed(range(ms)):
if(abs(sigSlope[i])>g_threshS):
mld['g_sig_up']=df['depth'].iloc[i+1]
break
mld_list.append(mld)
# Save the MLD estimates
with open(mldOutfile, 'w') as file:
json.dump(mld_list, file, indent=4)
print(records,'CTD records scraped and dumped')
# run it!
find_mld(mldPath)
1176 CTD records scraped and dumped
Another method used by Holte and Talley (2009) is cluster analysis. For this I build off a grouping function from Raymond Hettinger to search through each station's MLD estimates and cluster values based on a scaling gap. The largest cluster is then averaged and returned.
# Merge the MLD estimates with the wAP grid for plot-ability.
mld_df = pd.merge(left=pd.read_json(mldOutfile),right=newGrid,on='stationID',how='inner')
# Cluster based MLD estimation.
def cluster(data, maxgap):
    data = data[~np.isnan(data)] # drop missing estimates
    if(data.size): # implicitly returns None when no estimates survive
data.sort()
groups = [[data[0]]]
for x in data[1:]:
if abs(x - groups[-1][-1]) <= (maxgap+(x/10)):
groups[-1].append(x)
else:
groups.append([x])
# check for largest cluster
maxClusterSize = 0
for group in groups:
if(len(group)>maxClusterSize):
maxClusterSize = len(group)
meanCluster = np.mean(group)
return meanCluster
mld_df['clustered_MLD']=[cluster(row, maxgap=4) for row in mld_df[['g_sig_up',
'g_t1_up',
't_sig_up',
't_t1_up',
'g_sig_down',
'g_t1_down',
't_sig_down',
't_t1_down']].values]
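A quick illustration with made-up estimates -- one outlier cast at 80 m, so the tight 22-25 m cluster wins and its mean is returned:
cluster(np.array([22.0, 24.0, 23.5, 25.0, 80.0, 24.5, 23.0, np.nan]), maxgap=4)
# -> 23.67, the mean of the six-value cluster between 22 and 25 m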
How do we know if our MLD estimates (orange) have enough coverage to calculate NCP for each set of annual inorganic nutrient measurements (hollow)? For hollow points -- where we lack CTD data -- we will use interpolated MLD (filled contour) to calculate the NCP. For this reason, how MLD is estimated and interpolated can make or break our results.
interpolate(df=inOrg,tracer='Nitrite',MLD=True)
interpolate_average(mld_df,'clustered_MLD')
# Files
ncpOutfileNOX = os.path.join(dsPath+'NCP.json')
ncpOutfileSiO4 = os.path.join(dsPath+'NCP_SiO4.json')
ncpOutfilePO4 = os.path.join(dsPath+'NCP_PO4.json')
# search the interpolated MLD grid--parameterized in compile_NCP()--for the value closest to a missing station.
def station_MLD(stationID,zi,df,xi_coords,yi_coords):
    # zi from griddata is indexed (row=lat via yi, col=lon via xi), so look up lat first
    mld = zi[yi_coords[round(float(df.loc[df.stationID==stationID].lat.values[0]),1)],
             xi_coords[round(float(df.loc[df.stationID==stationID].lon.values[0]),1)]]
    return(mld.mean())
# interpolated
def get_NCP(df,station,tracer,mld):
depSlice = df.loc[df['stationID']==station]
if not(depSlice.empty):
mld_tracer = np.interp(mld,depSlice['depth'], depSlice[tracer])
        # use the profile beyond the MLD to interpolate, then slice to the mixed layer.
depSlice = depSlice.loc[depSlice['depth']<=mld]
nuit_profile = np.append(depSlice[tracer].values,mld_tracer)
mld_depth = np.append(depSlice['depth'].values,mld)
return((mld*mld_tracer) - simps(nuit_profile,mld_depth)) # integrate from depth to surface using profile.
else:
return None
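As a sanity check of the deficit calculation, here is get_NCP() on a hypothetical cast (made-up numbers) where NO2+NO3 rises linearly from 20 umol/L at the surface by 0.2 umol/L per meter:
# Fictional station: the profile is linear, so the integral is exact.
toy = pd.DataFrame({'stationID': ['600.040']*5,
                    'depth':   [0., 10., 20., 30., 40.],
                    'Nitrite': [20., 22., 24., 26., 28.]})
# With an MLD of 35 m, the deep value is 27 umol/L, so the deficit is
# 35*27 - 822.5 = 122.5 mmol N/m^2 (umol/L times meters gives mmol/m^2).
print(get_NCP(toy, '600.040', 'Nitrite', mld=35.0)) # 122.5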
ncpOutfile = os.path.join(dsPath+'NCP.json')
def compile_NCP(method='t_sig_down',tracer='Nitrite',outfile=ncpOutfile,warning=False):
years = [2003,2004,2005,2006,2007,2008,2009,2011,2012,2013,2014]
NCP_list = []
for year in years:
if(warning==True):
print("***",year,"***")
# Querry the databases and clean up
querryMLD = mld_df.loc[(mld_df[method]!=skip) &
(mld_df[method].notnull()) &
(mld_df['stationID'].isin(outOfGrid)==False)&
(mld_df['year']==year)]
querryInOrg = inOrg.loc[(inOrg[tracer]!=skip) &
(inOrg[tracer].notnull()) &
(inOrg['stationID'].isin(outOfGrid)==False)&
(inOrg['year']==year)]
# Grid and interpolate MLD.
x = np.round(querryMLD.lon.values,4)
y = np.round(querryMLD.lat.values,4)
z = querryMLD[method].values
# If it's not empty let's interpolate!
        if(len(x)>4):
# define grid.
xi = np.linspace(x.min(),x.max(),360)
yi = np.linspace(y.min(),y.max(),360)
# grid the data.
zi = abs(griddata((x, y), z, (xi[None,:], yi[:,None]), method='cubic'))
# document the linspace coord pairs
xi_coords = {round(float(value),1): index for index, value in enumerate(xi)}
yi_coords = {round(float(value),1): index for index, value in enumerate(yi)}
            # Go through each station measurement of inorganic nutrients
for station in pd.unique(querryInOrg['stationID'].values.ravel()):
# check if the station and year exist in the MLD data
mld_S = querryMLD.loc[querryMLD['stationID']==station]
# if not, check interp...
if(mld_S.empty):
try:
if not(pd.isnull(station_MLD(station,zi,querryInOrg,xi_coords,yi_coords))):
NCP_list.append({'year': year,
'stationID': station,
'MLD' : station_MLD(station,zi,querryInOrg,xi_coords,yi_coords),
'interp':'True',
'Nuit_Deficiency': get_NCP(querryInOrg,station,tracer,station_MLD(station,zi,querryInOrg,xi_coords,yi_coords))})
except (KeyError,IndexError) as e:
if(warning==True):
print(station,"index not found")
                        # TODO: reference the annual value for IndexError? the nearest neighbor for KeyError?
else:
NCP_list.append({'year': year,
'stationID': station,
'MLD': mld_S[method].mean(),
'interp': 'False',
'Nuit_Deficiency': get_NCP(querryInOrg,station,tracer,np.nanmean(mld_S[method]))})
with open(outfile, 'w') as file:
json.dump(NCP_list, file, indent=4)
compile_NCP(method='clustered_MLD',tracer='Nitrite',outfile=ncpOutfile)
Read the JSON file into a pandas DF as text to avoid NaNs -- a chained solution adapted from a Stack Overflow answer.
NOX_df = pd.DataFrame(json.loads(open(ncpOutfile,'r').read()))
NOX_df = pd.merge(left=NOX_df,right=newGrid[['stationID','lat','lon']],on='stationID',how='inner')
NOX_df.head()
MLD | Nuit_Deficiency | interp | stationID | year | lat | lon | |
---|---|---|---|---|---|---|---|
0 | 3.111857 | 36.190000 | False | 600.040 | 2003 | -64.93333 | -64.4 |
1 | 2.227750 | 27.857989 | False | 600.040 | 2004 | -64.93333 | -64.4 |
2 | 4.738607 | 88.064000 | False | 600.040 | 2005 | -64.93333 | -64.4 |
3 | 3.176771 | 64.340630 | False | 600.040 | 2006 | -64.93333 | -64.4 |
4 | 2.970500 | 22.135364 | False | 600.040 | 2007 | -64.93333 | -64.4 |
compile_NCP(method='clustered_MLD',tracer='SiO4',outfile=ncpOutfileSiO4)
compile_NCP(method='clustered_MLD',tracer='PO4',outfile=ncpOutfilePO4)
SiO4_df = pd.DataFrame(json.loads(open(ncpOutfileSiO4,'r').read()))
PO4_df = pd.DataFrame(json.loads(open(ncpOutfilePO4,'r').read()))
N_P = NOX_df['Nuit_Deficiency']/PO4_df['Nuit_Deficiency'].loc[PO4_df['Nuit_Deficiency']>.1]
N_P = N_P.replace([np.inf, -np.inf,0], np.nan).dropna()
N_P = N_P.where((N_P<100)&(N_P>-30))
N_P.plot(kind='kde',color='k', alpha=0.5, xlim=(-30,100),title='Density Distribution of N:P')
print("N:P mean",round(N_P.mean(),4),"Vs",16.00,"Redfield Ratio")
N:P mean 19.5843 Vs 16.0 Redfield Ratio
N_Si = NOX_df['Nuit_Deficiency']/SiO4_df['Nuit_Deficiency']
N_Si = N_Si.replace([np.inf, -np.inf,0], np.nan).dropna()
N_Si.plot(kind='kde',color='k', alpha=0.5, xlim=(-10,20),title='Density Distribution of N:Si')
print("N:Si mean",round(N_Si.mean(),4),"Vs",15/16,"Redfield Ratio")
N:Si mean 0.9718 Vs 0.9375 Redfield Ratio
Because the Redfield ratios are consistent, we can apply the C:N ratio (106:16) of marine biomass to calculate how much carbon was removed from the water column.
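For example, carrying over the toy numbers from the get_NCP() sanity check above: a deficit of 122.5 mmol N/m^2 converts to 122.5 * (106/16) / 1000 ≈ 0.81 mol C/m^2 of NCP, which is exactly the (zi*(106./16.))/1000 step inside interpolate() and interpolate_average().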
interpolate(df=NOX_df,tracer='Nuit_Deficiency')
interpolate_average(NOX_df,'Nuit_Deficiency')
Pockets of extreme productivity are spaced sporadically (1-3x) between otherwise non-productive waters along the wAP. An inverse pattern is faint in surface interpolations of NO2+NO3. However, the regions of high net community production do not mesh with those of depleted surface NO2+NO3, suggesting NCP in the wAP is driven by other (physical, biological, or geographic) factors rather than nutrient stocks alone. For this reason we turn to the physical method(s) used for Mixed Layer Depth (MLD) estimation and interpolation.
Estimation algorithms could improve by accounting for sea-ice thermodynamics and water column structure. Starting from the physics, we get closer to parsing out how the wAP's ecosystem and carbon sink will change as global temperatures rise. Unfortunately, missing CTD values leave holes in the physics, where interpolation algorithms fall short at boundary stations (lines 000 and 600). To compensate, interannually averaged MLD could be used as a baseline, and other high-resolution datasets such as Argo and the World Ocean Atlas could be used to cross-reference, validate, and fill these gaps.
Physics aside, the Redfield ratios hold up remarkably well -- with SiO4 suggesting January community structures are dominated by diatoms.
Datasets outside of CTD and inorganic nutrients have yet to be incorporated into calculating NCP. Triple oxygen isotopes and inert argon would be interesting avenues to explore. Freshwater discharge is another feature that may help explain mixed-layer dynamics as well as potential iron fertilization.
# This is an extra function that I decided to leave in for curious GEOSPATIALLY-inclined folks!
# Map-based interpolate function; not as pretty as interpolate(), but it gives better geographic context
def interpolate_map(df=inOrg,tracer = 'Nitrite',MLD = False):
# Querry the database and clean up
tracer_df = df.loc[(df[tracer]!=skip) &
(df[tracer].notnull()) &
(df['stationID'].isin(outOfGrid)==False)]
    # choose a title and check whether the tracer analysis is depth-dependent.
if(tracer=='Nitrite'):
title = 'Surface NO$_{3}$+NO$_{2}$'
tracer_df = tracer_df[tracer_df['depth']<=surfDepth]
elif(tracer=='PO4'):
title = 'Surface PO$_{4}$'
tracer_df = tracer_df[tracer_df['depth']<=surfDepth]
elif(tracer=='SiO4'):
title = 'Surface SiO$_{4}$'
tracer_df = tracer_df[tracer_df['depth']<=surfDepth]
else:
title = 'Mixed Layer Depth'
tracer_df = tracer_df.loc[(tracer_df[tracer]<200) & (tracer_df[tracer]>0)]
# Interpolate and plot for each year
for year in pd.unique(tracer_df.year.values.ravel()):
map = Basemap(width=1000000,height=640000,
resolution='l',projection='stere',\
lat_ts=50,lat_0=-66.40,lon_0=-69.05)
map.drawparallels(np.arange(-90.,90.,1.),labels=[False,False,False,True],
color='w',dashes=[100,.0001],zorder=0)
map.drawmeridians(np.arange(-180.,181.,1.),labels=[True,False,False,False],
color='w',dashes=[100,.0001],zorder=0)
map.drawcoastlines()
map.fillcontinents()
map.drawmapboundary()
querry = tracer_df[tracer_df['year']==year]
# Define the plotable elements
x = querry.lon.values
y = querry.lat.values
z = querry[tracer].values
labels = querry.stationID
xM,yM = map(x, y)
# If it's not empty let's interpolate!
if(len(x)>10):
# plot data points.
map.scatter(xM,yM,facecolors='none', edgecolors='k',s=26, zorder=2)
gridBounds = ['000.220','100.220','200.220','300.220','400.220','500.220','600.220']
gridLabels = newGrid[newGrid['stationID'].isin(gridBounds)]
x3 = gridLabels.lon.values
y3 = gridLabels.lat.values
xS, yS = map(x3, y3)
labels = gridLabels.gridLine.values.astype(int)
for label, xpt, ypt in zip(labels, xS, yS):
plt.text(xpt-60000, ypt+12000, label,size=14)
if(MLD==True):
querr2 = mld_df[mld_df['year']==year]
x1 = querr2.lon.values
y1 = querr2.lat.values
xM1,yM1 = map(x1,y1)
map.scatter(xM1,yM1, color='orange',s=26,zorder = 1)
else:
# define grid.
xi = np.linspace(xM.min(),xM.max(),300)
yi = np.linspace(yM.min(),yM.max(),300)
# grid the data.
zi = abs(griddata((xM, yM), z, (xi[None,:], yi[:,None]), method='cubic'))
# contour the gridded data.
plt.contourf(xi,yi,zi,12,cmap=cMap)
plt.colorbar() # draw colorbar
plt.title('Interpolated '+title+' '+str(np.asarray(querry.year)[0])+' (%d points)' % len(x))
plt.show()