(This post is the second part of Gridding data.)
In the first part we implemented a function to interpolate scattered observations to a grid. Our function took a correlation length and an estimated error and, using a fixed Gaussian correlation function, interpolated the observations onto a rectangular grid. Remember that we had to guess the parameters and hope for the best fit. (BTW: a good first guess for the correlation length is the first baroclinic Rossby radius.)
In this post we extend our gridding function to accept different correlation functions (Gaussian, Markov, and LeTraon), but most importantly we will estimate the parameters R (correlation length scale) and lamb (error variance).

For this we need a function to express R, the grid, and the observed positions in kilometers (for an easier interpretation of the length scale R). We also need a function to estimate the scale using either a Gaussian or a Markov function. (For more information check the original Matlab™ program and this paper.)
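For reference, these are the functional forms fitted by the `func` routine below, written exactly as they appear in its code comments, with $a_0$ the value at zero lag and $a$ the characteristic scale:

$$
\begin{aligned}
\text{Gaussian:} \quad & f(r) = a_0 \, e^{-\frac{1}{2}(r/a)^2} \\
\text{Markov:} \quad & f(r) = a_0 \left(1 + \tfrac{r}{a}\right) e^{-r/a} \\
\text{LeTraon:} \quad & f(r) = a_0 \, e^{-r/a} \left(1 + \tfrac{r}{a} + \tfrac{(r/a)^2}{6} - \tfrac{(r/a)^3}{6}\right)
\end{aligned}
$$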
import numpy as np


def ll2km(lon, lat, bbox):
    """xkm, ykm will be the coordinates of the data (converted from lon/lat)."""
    rearth = 6370800  # Earth radius [m].
    deg2rad = np.pi / 180
    ykm = rearth * (lat - bbox[2]) * deg2rad / 1000
    xkm = rearth * ((deg2rad * (lon - bbox[0])) *
                    np.cos(deg2rad * 0.5 * (lat + bbox[2])) / 1000)
    return xkm, ykm
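A quick, purely illustrative sanity check (the bounding box and coordinates below are made up just to show the units):

# Hypothetical bounding box [lon_min, lon_max, lat_min, lat_max], for illustration only.
bbox_demo = [-40, -30, -25, -15]
xkm_demo, ykm_demo = ll2km(-35.0, -20.0, bbox_demo)
print(xkm_demo, ykm_demo)  # Distances [km] east/north of the bbox corner.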
def func(a, x, fx, method='markov'):
    """Compute the mean squared misfit between an analytical function
    (e.g. Gaussian or Markov function) and a set of data FX observed at
    independent coordinates X.

    http://marine.rutgers.edu/dmcs/ms615/jw_matlab/oi/myfunction.m

    Parameters
    ----------
    a : array
        Parameters of the analytical function.
    x : array
        Locations of the data.
    fx : array
        Observed function values (data).
    method : string
        Specifies the shape of the function to be fitted:
        must be one of 'gauss', 'markov' (default) or 'letra' (LeTraon).
        In all cases two parameters are fit:
        a[0] is the y-intercept at x=0 and
        a[1] is the characteristic scale of the fitted function.

    """
    if method == 'gauss':
        # Gaussian function f = a0 * exp(-0.5 * (r/a)**2).
        r = x / a[1]
        fit = a[0] * np.exp(-0.5 * r**2)
    elif method == 'markov':
        # Markov function f = a0 * (1 + r/a) * exp(-r/a).
        r = np.abs(x) / a[1]
        fit = a[0] * (1 + r) * np.exp(-r)
    elif method == 'letra':
        # Le Traon function f = a0 * exp(-r/a) * (1 + r/a + (r/a)**2/6 - (r/a)**3/6).
        r = np.abs(x) / a[1]
        rsq = r**2
        fit = a[0] * np.exp(-r) * (1 + r + rsq / 6 - (r * rsq) / 6)
    else:
        raise ValueError("Unrecognized method {!r}. Must be one of 'gauss',"
                         " 'markov' or 'letra'.".format(method))
    return np.mean((fit - fx)**2)
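As a quick illustration (synthetic numbers, not the Mercator data used later), func returns a small misfit when the parameters match the data and a larger one otherwise:

# Synthetic Markov-shaped "observations" with a0=1 and scale 100 km, for illustration only.
r_demo = np.linspace(0, 500, 50)
f_demo = 1.0 * (1 + r_demo / 100) * np.exp(-r_demo / 100)
print(func([1.0, 100], r_demo, f_demo, method='markov'))  # ~0: the true parameters.
print(func([1.0, 300], r_demo, f_demo, method='markov'))  # Larger misfit.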
Below is our new gridding function. It takes the correlation length R and an estimated error variance lamb, along with the grid, the observed data, and the observed positions. So far it is very similar to our previous gridding function. The main difference is that now we can pass either the Gaussian or the Markov correlation function.
def optinter(R, lamb, x, y, data, xdata, ydata, cov_func='markov'):
    """Use optimal interpolation to map irregularly spaced observations
    onto a regular grid.

    http://marine.rutgers.edu/dmcs/ms615/jw_matlab/oi/oi_mercator.m

    Parameters
    ----------
    R : float
        De-correlation length scale [km].
    lamb : float
        Error variance to signal variance ratio (E).
    x, y : array
        Grid locations for theta.
    data : array
        Observations.
    xdata, ydata : array
        Observed locations.

    Returns
    -------
    theta : array
        Optimally interpolated data.
    err : array
        Estimated optimum error.
    res : array
        Residual of the fit.

    """
    # Note: uses the global `bbox` defined when loading the data.
    X, Y = ll2km(*np.meshgrid(x, y), bbox=bbox)

    # Convert the observed positions to km and compute the distance
    # between every pair of data points.
    xkm, ykm = ll2km(xdata, ydata, bbox)
    xr, yr = np.meshgrid(xkm, ykm)
    rdist = np.sqrt((xr - xr.T)**2 + (yr - yr.T)**2)

    # Data-data covariance function.
    if cov_func == 'gauss':
        cdd0 = np.exp(-rdist**2 / R**2)
    elif cov_func == 'markov':
        cdd0 = (1 + rdist/R) * np.exp(-rdist/R)
    else:
        raise ValueError("Unrecognized covariance function {!r}. "
                         "Must be one of 'gauss' or 'markov'.".format(cov_func))

    # Final data covariance matrix between data points.
    cdd = cdd0 + lamb * np.eye(*cdd0.shape)

    # Distance between observation points (d) and grid points (m).
    Xd, Xg = np.meshgrid(xkm, X.ravel())
    Yd, Yg = np.meshgrid(ykm, Y.ravel())
    rmd = np.sqrt((Xg - Xd)**2 + (Yg - Yd)**2)

    # Again plug into the covariance function (grid-data covariance).
    if cov_func == 'gauss':
        cmd = np.exp(-rmd**2 / R**2)
    elif cov_func == 'markov':
        cmd = (1 + rmd/R) * np.exp(-rmd/R)
    else:
        raise ValueError("Unrecognized covariance function {!r}. "
                         "Must be one of 'gauss' or 'markov'.".format(cov_func))

    # Residual at the data points.
    demeaned = data - data.mean()
    res = data.mean() + np.dot(cdd0, np.linalg.solve(cdd, demeaned))
    res = data.ravel() - res.ravel()

    # Normalized error variance (in percent).
    err = np.diag(1 - np.dot(np.dot(cmd, np.linalg.inv(cdd)), cmd.T))
    err = np.reshape(err, X.shape) * 100

    # Optimally interpolated field.
    theta = data.mean() + np.dot(cmd, np.linalg.solve(cdd, demeaned))
    theta = np.reshape(theta, X.shape)
    return dict(residual=res, error=err, covariance=cdd0,
                final_covariance=cdd, theta=theta)
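In matrix form, the field and error that optinter returns correspond to the standard objective-mapping expressions. With $C_{dd}$ the data-data covariance, $C_{md}$ the grid-data covariance, $\lambda$ the noise-to-signal ratio, and $\bar{d}$ the data mean, the code above evaluates:

$$
\theta = \bar{d} + C_{md}\,(C_{dd} + \lambda I)^{-1}\,(d - \bar{d}), \qquad
\epsilon = \mathrm{diag}\!\left[1 - C_{md}\,(C_{dd} + \lambda I)^{-1}\,C_{md}^{T}\right] \times 100\%.
$$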
Here is the original data again.
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import cartopy.feature as cfeature
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER

LAND = cfeature.NaturalEarthFeature('physical', 'land', '50m',
                                    edgecolor='face',
                                    facecolor=cfeature.COLORS['land'])


def make_map(bbox, projection=ccrs.PlateCarree()):
    fig, ax = plt.subplots(figsize=(8, 6),
                           subplot_kw=dict(projection=projection))
    ax.set_extent(bbox)
    ax.add_feature(LAND, facecolor='0.75')
    ax.coastlines(resolution='50m')
    gl = ax.gridlines(draw_labels=True)
    gl.xlabels_top = gl.ylabels_right = False
    gl.xformatter = LONGITUDE_FORMATTER
    gl.yformatter = LATITUDE_FORMATTER
    return fig, ax
import numpy.ma as ma
from scipy.io import loadmat
from oceans.colormaps import cm
merc = loadmat('./data/mercator_temperature.mat', squeeze_me=True)
x, y = merc['x'], merc['y']
bbox = [x.min(), x.max(), y.min(), y.max()]
level = 10 # 10 meters temperature.
data = ma.masked_invalid(merc['temp'][level, ...])
fig, ax = make_map(bbox=bbox)
cs = ax.pcolormesh(x, y, data, cmap=cm.avhrr)
cbar = fig.colorbar(cs, extend='both', shrink=0.85)
And the sub-sampled data as before.
def random_mask(data, frac=0.05):
    N = data.size
    n = int(frac * N)
    # Draw `n` random indices (with repetition) from the valid data points.
    idx = np.random.randint(0, N, n)
    return idx
mask = ~data.mask
temp = data[mask].data
idx = random_mask(temp, frac=0.05)
X, Y = np.meshgrid(x, y)
tsub = temp[idx]
xx, yy = X[mask][idx], Y[mask][idx]
fig, ax = make_map(bbox)
ax.set_title('Sub-sampled data to '
             '{:0.2f} %.'.format(100 * tsub.size / temp.size))
cs = ax.scatter(xx, yy, 20, tsub, cmap=cm.avhrr)
cbar = fig.colorbar(cs, extend='both', shrink=0.85)
Before we estimate the length scale R, we can try to over- and underestimate the parameter while keeping lamb fixed.
R = 300 # [km] Overestimation.
lamb = 0.11
ret = optinter(R, lamb, x, y, tsub, xx, yy, cov_func='markov')
fig, ax = make_map(bbox=bbox)
cs = ax.pcolormesh(x, y, ret['theta'], cmap=cm.avhrr)
ax.set_title('Too smooth!')
cbar = fig.colorbar(cs, extend='both', shrink=0.85)
R = 20 # [km] Underestimated.
ret = optinter(R, 0.11, x, y, tsub, xx, yy, cov_func='markov')
fig, ax = make_map(bbox=bbox)
cs = ax.pcolormesh(x, y, ret['theta'], cmap=cm.avhrr)
ax.set_title('Some suspicious structures...')
cbar = fig.colorbar(cs, extend='both', shrink=0.85)
The first one, where we overestimated R = 300 km, is clearly too smooth. On the other hand, the second one, using R = 20 km, created several weird structures that are probably not realistic (check the original data above).
Well, an objective way to estimate those parameters was promised, right? The tricky part is the error variance: one might start with the instrumental error plus an estimated noise, or with an error assessment of the measurements.
Here we assume that the original data had no error and we add a known noise. That way we have some control over what is going to happen.
(Adding random noise of $\pm$ 0.75 $^\circ$C or variance 0.5625 $^\circ$C$^2$.)
e = 0.75
tsub += e * np.random.randn(tsub.size)

demeaned = tsub - tsub.mean()
C = np.dot(demeaned[:, None], demeaned[:, None].T)  # Covariance.

# Distance bins used to estimate the scale.
dr = 30  # Distance step [km].
rs = np.r_[0, np.arange(0.5*dr, 1000, dr)]

cf = [np.diag(C).mean()]
XKM, YKM = ll2km(*np.meshgrid(xx, yy), bbox=bbox)
Rd = np.sqrt((XKM-XKM.T)**2 + (YKM-YKM.T)**2)
for r in rs[1:]:
    idx = np.logical_and(Rd.ravel() > r - 0.5 * dr,
                         Rd.ravel() <= r + 0.5 * dr)
    cf.append(C.ravel()[idx].mean())

cf = np.array(cf)  # Observed covariance function.
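In other words, the loop above builds a binned, lagged covariance estimate from the de-meaned observations $d'_i$: for each separation bin centered at $r_k$,

$$
\hat{C}(r_k) = \left\langle d'_i \, d'_j \right\rangle_{\;r_k - \frac{\Delta r}{2} \,<\, r_{ij} \,\le\, r_k + \frac{\Delta r}{2}},
$$

where $r_{ij}$ is the distance between observations $i$ and $j$, and $\Delta r$ is the bin width (dr above). The zero-lag value $\hat{C}(0)$ is taken from the diagonal of $C$.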
Now we can fit an analytical covariance function to the observed covariance.
from scipy.optimize import fmin

rfitmax = 600
subset = np.logical_and(rs > 0, rs <= rfitmax)

x0 = [20, 400]
a = fmin(func=func, x0=x0, args=(rs[subset], cf[subset], 'gauss'))
b = fmin(func=func, x0=x0, args=(rs[subset], cf[subset], 'markov'))

# Misfit of each fit at convergence.
misfit_a = func(a, rs[subset], cf[subset], 'gauss')
misfit_b = func(b, rs[subset], cf[subset], 'markov')

# Estimate the noise variance from the best fit.
if misfit_a < misfit_b:
    best_method = 'Gaussian'
    esq = cf[0] - a[0]
else:
    best_method = 'Markov'
    esq = cf[0] - b[0]

print('\nThe best fit was "%s"' % best_method)
fig, ax = plt.subplots()
ax.plot(rs, cf, 'x')
ax.plot(rs, a[0] * np.exp(-0.5 * (rs/a[1])**2),
        label='Gaussian: misfit = %.3f' % misfit_a)
ax.plot(rs, b[0] * (1 + rs/b[1]) * np.exp(-rs/b[1]),
        label='Markov: misfit = %.3f' % misfit_b)
ax.set_xlabel('Distance (km)')
ax.set_ylabel(r'$^\circ$C$^2$')
ax.set_title('Binned lagged covariance from data')
ax.errorbar(160, cf[0], yerr=esq, color='black', label=r'$e^2$')
ax.grid(True)
ax.text(200, 10, 'R = %.2f km' % b[1])
leg = ax.legend(numpoints=1)
The next step is to estimate the error variance ratio lamb using the noise variance esq and the estimated signal variance ssq (ssq is given by the y-intercept of the fit above). Note that the second fitted parameter is the correlation length R.
R = b[1]  # Correlation length scale [km].
ssq = b[0]  # Signal variance (y-intercept of the fit).
lamb = esq/ssq  # Noise-to-signal variance ratio.
ret = optinter(R, lamb, x, y, tsub, xx, yy, cov_func='gauss')
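Since we injected the noise ourselves, we can sanity-check the estimated noise variance against the known value ($0.75^2 = 0.5625$ $^\circ$C$^2$):

print('Estimated noise variance: %.3f' % esq)
print('True noise variance: %.3f' % (e ** 2))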
To evaluate the fit we need to examine the histogram of the residuals. (We expect most of the data points to fall around zero.)
fig, ax = plt.subplots()
h = ax.hist(ret['residual'], 20)
ax.set_xlim(-4, 4)
_ = ax.set_title('Histogram of residuals (data - interpolation)')
And finally, the gridded data using the estimated R and lamb. Not perfect, but remember that we used only 5% of the original data here.
fig, ax = make_map(bbox=bbox)
cs = ax.pcolormesh(x, y, ret['theta'], cmap=cm.avhrr)
cbar = fig.colorbar(cs, extend='both', shrink=0.85)
_ = ax.scatter(xx, yy, 20, tsub)
It can be handy to show the expected errors for the gridded data. Note how the error increases to more than 50% for any grid point that is more than R distant from an observed point.
fig, ax = make_map(bbox=bbox)
cs = ax.pcolormesh(x, y, ret['error'], vmin=0, vmax=50, cmap=plt.cm.cubehelix_r)
cbar = fig.colorbar(cs, extend='both', shrink=0.85)
_ = ax.plot(xx, yy, 'o', markerfacecolor='none')
cdd0 = ret['covariance']
cdd = ret['final_covariance']
error0 = ssq * np.diag(1 - np.dot(cdd0, np.dot(np.linalg.inv(cdd), cdd0.T)))

fig, ax = plt.subplots()
sample = np.arange(len(tsub))
ax.plot(sample, np.abs(ret['residual']), 'o', alpha=0.75, label='Actual |data-OI|')
ax.plot(sample, np.sqrt(error0), 'o', alpha=0.5, label='Expected error')
ax.set_xlabel('Sample number')
ax.set_ylabel(r'$^\circ$C')

within_limits = np.abs(ret['residual']) <= np.sqrt(error0)
per = 100 * len(np.nonzero(within_limits)[0]) / len(tsub)
title = '%.0f%% of residuals are within the expected error bars' % per
ax.set_title(title)
leg = ax.legend(numpoints=1)
And this last picture shows that the expected error should be used only as a guide; the actual error is usually higher.