gdptools example without the OPeNDAP catalog - gridMET data

gdptools example without the OPeNDAP catalog - gridMET data#

import logging
import warnings

import hvplot.pandas
import hvplot.xarray
import numpy as np
import pandas as pd
import xarray as xr
from pynhd import NLDI, WaterData
from pygeohydro import watershed

from gdptools import AggGen, UserCatData, WeightGen

warnings.filterwarnings("ignore")

Create GeoDataFrame of Delaware River Basin HUC12s#

In this cell we use clients to several https://api.water.usgs.gov tools:

Tool: USGS NLDI Client: pynhd.NLDI
Tool: USGS GeoServer Client: pynhd.WaterData

More examples of using the HyRiver pynhd package can be found here

# Delaware River Basin HUC12s
# --------------------------------
wbd = watershed.WBD("huc4")
delaware_basin = wbd.byids(field="huc4", fids="0204")
huc12_basins = WaterData("wbd12").bygeom(delaware_basin.iloc[0].geometry)
huc12_basins = huc12_basins[huc12_basins["huc12"].str.startswith(("020401", "020402"))]
huc12_basins.head()

	geometry	tnmid	metasourceid	sourcedatadesc	sourceoriginator	sourcefeatureid	loaddate	gnis_id	areaacres	areasqkm	states	huc12	name	hutype	humod	tohuc	globalid
0	MULTIPOLYGON (((-75.02421 39.44397, -75.02353 ...	{001055CD-7B5E-4F96-ABB1-22923FD2B454}	NaN	None	None	None	2013-01-18T07:08:41Z	None	16178.10	65.47	NJ	020402060505	White Marsh Run-Maurice River	S	NM	020402060507	{A70A97FA-E29C-11E2-8094-0021280458E6}
1	MULTIPOLYGON (((-74.62113 41.00565, -74.62139 ...	{009DA440-393A-46A0-BBA3-C38139823414}	{ED602145-9201-4827-9CE1-05D252484579}	None	None	None	2017-10-03T20:11:06Z	None	29107.93	117.80	NJ	020401050502	Lubbers Run-Musconetcong River	S	NM	020401050503	{B830B4F8-E29C-11E2-8094-0021280458E6}
2	MULTIPOLYGON (((-75.75805 40.65996, -75.75783 ...	{00F73634-FA00-464F-93F0-F0BCCB464567}	{ED602145-9201-4827-9CE1-05D252484579}	None	None	None	2017-10-03T20:10:58Z	None	35229.84	142.57	PA	020402030305	Upper Maiden Creek	S	NM	020402030307	{BAF5E77A-E29C-11E2-8094-0021280458E6}
3	MULTIPOLYGON (((-75.13279 39.88601, -75.13439 ...	{017B8A58-E959-4600-B212-8EB921F0F9F7}	{E9F5C988-2313-440E-A05E-C5871E2773A6}	None	None	None	2017-10-03T20:11:04Z	None	24920.90	100.85	NJ,PA	020402020507	Woodbury Creek-Delaware River	S	TF	020402020607	{B122D6B7-E29C-11E2-8094-0021280458E6}
4	MULTIPOLYGON (((-74.92817 40.55256, -74.92833 ...	{01B4598B-0623-407F-9133-3040D99A8D1A}	{ED602145-9201-4827-9CE1-05D252484579}	None	None	None	2017-10-03T20:10:52Z	None	15086.80	61.05	NJ	020401050905	Lockatong Creek	S	NM	020401050908	{A6AB04BE-E29C-11E2-8094-0021280458E6}

from holoviews.element.tiles import EsriTerrain

drb = huc12_basins.hvplot(
    geo=True, coastline='50m', alpha=0.2, c='r', frame_width=300,
    xlabel="longitude", ylabel="latitude", clabel="monthly water evaporation (mm)",
    title="Delaware River HUC12 basins", xlim=(-76.8, -73.8), aspect='equal'
)
EsriTerrain() * drb

Load gridMET OPeNDAP endpoint with xarray#

Note currently the gridMET data require ‘#fillmismatch’ appended to the end of the OPeNDAP endpoint. For example

gridmet_URL = [
    'http://thredds.northwestknowledge.net:8080/thredds/dodsC/agg_met_tmmx_1979_CurrentYear_CONUS.nc#fillmismatch',
    'http://thredds.northwestknowledge.net:8080/thredds/dodsC/agg_met_tmmn_1979_CurrentYear_CONUS.nc#fillmismatch'
]
data = xr.open_mfdataset(gridmet_URL)
data

<xarray.Dataset> Size: 224GB
Dimensions:                    (day: 17272, lat: 585, lon: 1386, crs: 1)
Coordinates:
  * day                        (day) datetime64[ns] 138kB 1979-01-01 ... 2026...
  * lat                        (lat) float64 5kB 49.4 49.36 ... 25.11 25.07
  * lon                        (lon) float64 11kB -124.8 -124.7 ... -67.1 -67.06
  * crs                        (crs) float32 4B 3.0
Data variables:
    daily_maximum_temperature  (day, lat, lon) float64 112GB dask.array<chunksize=(17272, 585, 1386), meta=np.ndarray>
    daily_minimum_temperature  (day, lat, lon) float64 112GB dask.array<chunksize=(17272, 585, 1386), meta=np.ndarray>
Attributes: (12/19)
    geospatial_bounds_crs:      EPSG:4326
    Conventions:                CF-1.0
    geospatial_bounds:          POLYGON((-124.7666666333333 49.40000000000000...
    geospatial_lat_min:         25.066666666666666
    geospatial_lat_max:         49.40000000000000
    geospatial_lon_min:         -124.7666666333333
    ...                         ...
    date:                       03 March 2026
    note1:                      The projection information for this file is: ...
    note2:                      Citation: Abatzoglou, J.T., 2013, Development...
    note3:                      Data in slices after last_permanent_slice (1-...
    note4:                      Data in slices after last_provisional_slice (...
    note5:                      Days correspond approximately to calendar day...

Calculate weights#

To determine an area-weighted statistic for each HUC12 polygon, the area of all grid cells (a Shapely Polygon object representing each cell of the grid).
The weight of each cell contributing to a HUC12 polygon is the cell-intersected-area / HUC12 polygon-area.
Not all all data readable by xarray are made the same. Therefor, when using non-OPeNDAP catalog endpoints, the user must supply some metadata about the xarray data and GeoDataFrame data.

Enable logging#

gdptools uses Python’s standard logging module. By default it emits no output. Configuring the gdptools logger before running weight generation and aggregation lets you monitor progress, timing, and data dimensions throughout the workflow. Here we set gdptools to INFO while keeping third-party libraries at WARNING to avoid noisy output from rasterio, fiona, etc.

# Configure logging: gdptools at INFO, everything else at WARNING
logging.basicConfig(
    level=logging.WARNING,
    format="%(name)s | %(levelname)s | %(message)s",
    force=True,
)
logging.getLogger("gdptools").setLevel(logging.INFO)

data['crs'].attrs.get('spatial_ref')

'GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]]'

# metadata for calculating weights

data_crs = data['crs'].attrs.get('spatial_ref')
x_coord = "lon"
y_coord = "lat"
t_coord = "day"
sdate = "1979-01-01"
edate = "1979-01-07"
var = ["daily_maximum_temperature", "daily_minimum_temperature"]
shp_crs = 4326
shp_poly_idx = "huc12"
wght_gen_crs = 6931

user_data = UserCatData(
    source_ds=data,
    source_crs=data_crs,
    source_x_coord=x_coord,
    source_y_coord=y_coord,
    source_t_coord=t_coord,
    source_var=var,
    target_gdf=huc12_basins,
    target_crs=shp_crs,
    target_id=shp_poly_idx,
    source_time_period=[sdate, edate]
)

wght_gen = WeightGen(
    user_data=user_data,
    method="serial",
    output_file='example_weights.csv',
    weight_gen_crs=6931,
)

wdf = wght_gen.calculate_weights()

To illustrate the format of the saved weights file it’s opened here, however in the next step we use the returned datafame.

pd.read_csv('example_weights.csv')

	huc12	i	j	wght
0	20402060505	74	35	0.081887
1	20402060505	77	35	0.002041
2	20402060505	76	35	0.168929
3	20402060505	75	35	0.201884
4	20402060505	76	34	0.119210
...	...	...	...	...
5677	20401050103	37	36	0.032199
5678	20401050103	36	37	0.184037
5679	20401050103	35	37	0.100861
5680	20401050103	34	37	0.000047
5681	20401050103	36	36	0.023088

5682 rows × 4 columns

Run the grid-to-polygon area weighted average interpolation using the calculated weights.#

Future releases will add additional area-weighted statistics. Currently only the mean is returned.
If more that one variable were available in the dataset then the run_weights function could be embedded in a loop that indexes each variable.

run_weights Output#

Run weighs returns:

ngdf is the GeoDataFrame passed to run_weights. huc12_basins maybe reordered when running weights so the re-ordered GeoDataFrame is returned.
vals are the area-weighted interpolated values ordered by the ngdf.index

agg_gen = AggGen(
    user_data=user_data,
    stat_method="masked_mean",
    agg_engine="serial",
    agg_writer="csv",
    weights='example_weights.csv',
    out_path='.',
    file_prefix="test_gm_nc"
)
ngdf, ds_out = agg_gen.calculate_agg()

pd.read_csv('test_gm_nc.csv', index_col=0)

	time	varname	units	020401010101	020401010102	020401010103	020401010104	020401010105	020401010106	020401010201	...	020402070503	020402070504	020402070505	020402070506	020402070507	020402070601	020402070602	020402070603	020402070604	020402070605
index
0	1979-01-01	daily_maximum_temperature	K	283.342101	283.789520	283.891782	284.313867	284.279681	283.724576	283.688328	...	288.614109	287.102319	287.257388	287.192510	285.911595	288.647165	287.673752	287.576502	286.573011	285.854688
1	1979-01-02	daily_maximum_temperature	K	282.645539	282.961788	283.040125	283.368031	283.418329	282.748207	282.790915	...	287.766527	285.970048	286.572958	286.722795	285.626663	286.617455	286.196586	286.075713	285.564972	285.071556
2	1979-01-03	daily_maximum_temperature	K	270.354667	270.317264	270.102145	270.433602	270.454707	270.254940	270.760374	...	278.432881	278.402124	278.703507	278.566303	278.221036	278.464381	278.479914	278.553679	278.529936	278.169785
3	1979-01-04	daily_maximum_temperature	K	263.918458	263.961688	263.736135	264.144980	264.093689	264.068521	264.586014	...	274.790052	274.646453	275.191191	275.202789	274.948669	274.324795	274.502752	274.522852	274.589280	274.395581
4	1979-01-05	daily_maximum_temperature	K	264.065839	264.056051	263.842020	264.165891	264.218206	264.084139	264.706366	...	277.148031	277.046858	277.584110	277.602286	277.298743	276.941864	276.956516	277.059530	277.263571	277.058642
5	1979-01-06	daily_maximum_temperature	K	269.963731	270.232074	270.220195	270.616740	270.654793	270.416690	270.891040	...	278.232881	278.396659	278.907483	278.801530	278.666241	278.123928	278.283779	278.408128	278.699587	278.547546
6	1979-01-07	daily_maximum_temperature	K	275.403016	275.938433	276.217032	276.513022	276.740472	275.976119	276.418405	...	284.271129	284.194942	284.334873	284.167058	283.541393	284.590986	284.436977	284.424597	284.175313	283.795570
0	1979-01-01	daily_minimum_temperature	K	276.875121	277.317797	277.205682	277.407442	277.398712	277.034636	276.912089	...	281.526809	280.340967	280.397078	280.529055	280.087390	280.910730	280.530568	280.421870	280.030056	280.079895
1	1979-01-02	daily_minimum_temperature	K	269.256667	269.413816	269.070310	269.239835	269.257574	269.118803	269.312846	...	274.801765	275.134578	274.757241	274.653814	274.893253	275.437823	275.296745	275.423789	275.366362	275.487308
2	1979-01-03	daily_minimum_temperature	K	257.501491	257.836032	257.340293	257.918269	257.588494	258.058366	258.241108	...	263.393736	263.564219	263.443779	263.438990	263.702673	263.170775	263.485903	263.498352	263.640248	263.869741
3	1979-01-04	daily_minimum_temperature	K	258.095835	258.408270	257.959467	258.499743	258.186048	258.647050	258.789096	...	263.532182	263.871596	263.858109	263.911718	264.327853	263.159602	263.670639	263.696368	264.030124	264.398075
4	1979-01-05	daily_minimum_temperature	K	259.297268	259.592666	259.187715	259.580379	259.406128	259.667843	259.853056	...	268.417116	268.802196	269.038191	269.138270	269.711928	267.564882	268.407991	268.464200	269.103907	269.629261
5	1979-01-06	daily_minimum_temperature	K	261.074570	261.397559	261.168801	261.430951	261.427480	261.406271	261.782945	...	271.789742	272.279462	272.127984	272.111718	272.457381	271.686187	272.149811	272.132619	272.379940	272.600686
6	1979-01-07	daily_minimum_temperature	K	262.672844	262.951875	262.715915	262.921207	263.001742	262.820974	263.269528	...	272.644310	273.099901	273.012330	272.954145	273.317758	272.646589	272.972866	273.022227	273.226546	273.487308

14 rows × 430 columns

aggdata = agg_gen.agg_data
ds_ss = aggdata.get("daily_maximum_temperature").da

Plot the gridded data and interpolated data#

var = "daily_maximum_temperature"
time_step = "1979-01-07"
tmin = np.nanmin(ds_ss.sel(day=time_step).values)
tmax = np.nanmax(ds_ss.sel(day=time_step).values)
print(tmin,tmax)

273.1 286.1

ds_out[var].shape

(7, 427)

ngdf[var] = ds_out[var].sel(time=time_step)
gridmet_agg = ngdf.to_crs(4326).hvplot(
    c='daily_maximum_temperature', geo=True, coastline='50m', frame_width=300, alpha=1, cmap='viridis', clim=(tmin, tmax),
    xlabel="longitude", ylabel="latitude", clabel="daily_maximum_temperature (K)", xlim=(-76.8, -73.8),
    title="HUC12 area-weighted average", aspect='equal'
)
gridmet_raw = ds_ss.isel(day=6).hvplot.quadmesh(
    'lon', 'lat', 'daily_maximum_temperature', cmap='viridis', alpha=1, grid=True, geo=True, coastline='50m', frame_width=300, clim=(tmin, tmax),
    xlabel="longitude", ylabel="latitude", clabel="daily_maximum_temperature (K)", xlim=(-76.8, -73.8),
    title="Gridmet daily max temperature", aspect='equal'
)

(EsriTerrain() * gridmet_raw) + (EsriTerrain() * gridmet_agg) + (EsriTerrain() * gridmet_raw * gridmet_agg)