Zonal statistics of rasters

Zonal statistics of rasters#

Deprecation Notice: The serial, parallel, and dask zonal engines are deprecated and will be removed in a future version. Use zonal_engine="exactextract" instead — it provides better performance and handles large datasets with bounded memory. See the ExactExtract Engine section below.

import geopandas as gpd
import pandas as pd
import rioxarray as rxr
import xarray as xr

from gdptools import ZonalGen
from gdptools.data.user_data import UserTiffData

# Be sure to use rioxarray to read tiff
rds_slope = rxr.open_rasterio("../../../tests/data/rasters/slope/slope.tif")
rds_text = rxr.open_rasterio("../../../tests/data/rasters/TEXT_PRMS/TEXT_PRMS.tif")
# Use geopandas to read shape file,
# Best if shape file only has feature id, and geometry if possible.
gdf = gpd.read_file("../../../tests/data/Oahu.shp")
id_feature = "fid"
print(len(gdf.groupby(id_feature)))
# These params are used to fill out the TiffAttributes class
tx_name= 'x'
ty_name = 'y'
band = 'band'
crs = 26904
varname = "slope" # not currently used
categorical = False # is the data categorical or not, if the data are integers it should
                   # probably be categorical.
data = UserTiffData(
    source_var=varname,
    source_ds=rds_slope,
    source_crs=crs,
    source_x_coord=tx_name,
    source_y_coord=ty_name,
    band=1,
    bname=band,
    target_gdf=gdf,
    target_id=id_feature
)

zonal_gen = ZonalGen(
    user_data=data,
    zonal_engine="serial",
    zonal_writer="csv",
    out_path=".",
    file_prefix="Oahu_slope_stats"
)
stats = zonal_gen.calculate_zonal(categorical=categorical)
print(stats)

# stats = zonal_gen.calculate_zonal(categorical=True)
# print(stats)

          count       mean        std  min    25%   50%   75%   max     sum
fid                                                                        
0  20431.0   1.872155   3.757167  0.0   0.00   0.0   1.0  32.0   38250
0   2388.0  26.350921  13.473957  5.0  14.00  24.0  37.0  61.0   62926
0   3192.0  33.687030  14.188686  1.0  22.00  34.0  45.0  70.0  107529
0   6600.0  23.704091  15.854788  0.0  10.00  21.0  37.0  66.0  156447
0  11380.0  22.116169  14.062708  0.0  11.00  21.0  33.0  62.0  251682
...         ...        ...        ...  ...    ...   ...   ...   ...     ...
0   2753.0   1.722121   3.183146  0.0   0.00   1.0   1.0  23.0    4741
0     99.0   1.363636   1.082840  0.0   0.00   2.0   2.0   3.0     135
0    174.0  10.505747   7.044653  0.0   5.25   9.0  14.0  32.0    1828
0   3089.0  10.931693  10.120911  0.0   4.00   7.0  15.0  59.0   33768
0    321.0   1.679128   1.702893  0.0   1.00   1.0   2.0   9.0     539

[304 rows x 9 columns]

gdf.sort_values(by=id_feature, inplace=True)
stats.sort_values(by=id_feature, inplace=True)
gdf["slope_mean"] = stats["mean"].values
gdf

	fid	rpu	cat_id	hru_segmen	geometry	slope_mean
0	1104.0	f	1104	995.0	POLYGON ((601200 2359860, 601180.267 2359853.7...	1.872155
1	1105.0	f	1105	996.0	POLYGON ((593760 2369060, 593675.2 2369014.8, ...	26.350921
2	1106.0	f	1106	997.0	POLYGON ((615055.367 2375324.633, 615130.633 2...	33.687030
3	1107.0	f	1107	998.0	POLYGON ((619150 2380290, 619096.633 2380263.3...	23.704091
4	1108.0	f	1108	999.0	POLYGON ((614640 2382950, 614628.833 2383001.1...	22.116169
...	...	...	...	...	...	...
299	1403.0	f	1403	1070.0	POLYGON ((620198.262 2355012.754, 620153.3 235...	1.722121
300	1404.0	f	1404	1029.0	POLYGON ((579986.031 2380993.933, 580280 23810...	1.363636
301	1405.0	f	1405	1146.0	POLYGON ((625200 2365570, 625207.872 2365555.4...	10.505747
302	1406.0	f	1406	1146.0	POLYGON ((625144.965 2365356.034, 625183.9 236...	10.931693
303	1407.0	f	1407	1233.0	POLYGON ((618131.067 2357417.7, 618105.367 235...	1.679128

304 rows × 6 columns

gdf.plot(column="slope_mean", legend=True)

<Axes: >

../../_images/18d237f085f0c52103453a0905c01816234ed789189b24621c532692066bb72e.png

Parallel generator#

In this case the generator is slower because the domain is small and the overhead of generating the parallel processes takes more time than the actual calculation. However with a large domain the “parallel” zonal_engine is faster and has a smaller memory footprint.

zonal_gen2 = ZonalGen(
    user_data=data,
    zonal_engine="parallel",
    zonal_writer="csv",
    out_path=".",
    file_prefix="Oahu_slope_stats_p",
    jobs=2
)
stats = zonal_gen2.calculate_zonal(categorical=categorical)
print(stats)

          count       mean        std  min    25%   50%   75%   max     sum
fid                                                                        
0  20431.0   1.872155   3.757167  0.0   0.00   0.0   1.0  32.0   38250
0   2388.0  26.350921  13.473957  5.0  14.00  24.0  37.0  61.0   62926
0   3192.0  33.687030  14.188686  1.0  22.00  34.0  45.0  70.0  107529
0   6600.0  23.704091  15.854788  0.0  10.00  21.0  37.0  66.0  156447
0  11380.0  22.116169  14.062708  0.0  11.00  21.0  33.0  62.0  251682
...         ...        ...        ...  ...    ...   ...   ...   ...     ...
0   2753.0   1.722121   3.183146  0.0   0.00   1.0   1.0  23.0    4741
0     99.0   1.363636   1.082840  0.0   0.00   2.0   2.0   3.0     135
0    174.0  10.505747   7.044653  0.0   5.25   9.0  14.0  32.0    1828
0   3089.0  10.931693  10.120911  0.0   4.00   7.0  15.0  59.0   33768
0    321.0   1.679128   1.702893  0.0   1.00   1.0   2.0   9.0     539

[304 rows x 9 columns]

gdf.sort_values(by=id_feature, inplace=True)
stats.sort_values(by=id_feature, inplace=True)
gdf["slope_mean"] = stats["mean"].values
gdf

	fid	rpu	cat_id	hru_segmen	geometry	slope_mean
0	1104.0	f	1104	995.0	POLYGON ((601200 2359860, 601180.267 2359853.7...	1.872155
1	1105.0	f	1105	996.0	POLYGON ((593760 2369060, 593675.2 2369014.8, ...	26.350921
2	1106.0	f	1106	997.0	POLYGON ((615055.367 2375324.633, 615130.633 2...	33.687030
3	1107.0	f	1107	998.0	POLYGON ((619150 2380290, 619096.633 2380263.3...	23.704091
4	1108.0	f	1108	999.0	POLYGON ((614640 2382950, 614628.833 2383001.1...	22.116169
...	...	...	...	...	...	...
299	1403.0	f	1403	1070.0	POLYGON ((620198.262 2355012.754, 620153.3 235...	1.722121
300	1404.0	f	1404	1029.0	POLYGON ((579986.031 2380993.933, 580280 23810...	1.363636
301	1405.0	f	1405	1146.0	POLYGON ((625200 2365570, 625207.872 2365555.4...	10.505747
302	1406.0	f	1406	1146.0	POLYGON ((625144.965 2365356.034, 625183.9 236...	10.931693
303	1407.0	f	1407	1233.0	POLYGON ((618131.067 2357417.7, 618105.367 235...	1.679128

304 rows × 6 columns

gdf.plot(column="slope_mean", legend=True)

<Axes: >

ExactExtract Engine (High Performance)#

gdptools now supports the exactextract engine, which provides 10-100x faster zonal statistics by operating directly on raster data without converting to vector polygons.

Engine Comparison#

Engine	Status	Description	Best For	Weighted Stats
`exactextract`	Recommended	Raster-native C++ library	Any size, fastest	Planned (#94)
`serial`	Deprecated	Single-threaded, vector-based	Small datasets, debugging	Yes
`parallel`	Deprecated	Multi-threaded via joblib	Medium datasets	Yes
`dask`	Deprecated	Distributed via Dask cluster	Very large datasets	Yes

Why is exactextract faster?#

Raster-native: Operates directly on raster blocks without creating polygon geometries
Coverage fractions: Computes cell coverage analytically (no expensive intersection calls)
Streaming: Processes raster blocks on-demand without full array materialization
C++ backend: Core algorithms implemented in optimized C++

Note: exactextract does not yet support weighted zonal statistics. Weighted support via exactextract’s built-in coverage fractions is planned in issue #94. Until then, the deprecated engines remain available for WeightedZonalGen.

import time

# Reload data for fair comparison
data_slope = UserTiffData(
    source_var="slope",
    source_ds=rds_slope,
    source_crs=crs,
    source_x_coord=tx_name,
    source_y_coord=ty_name,
    band=1,
    bname=band,
    target_gdf=gdf[[id_feature, "geometry"]].copy(),
    target_id=id_feature
)

# ExactExtract engine - fastest option
zonal_gen_exact = ZonalGen(
    user_data=data_slope,
    zonal_engine="exactextract",  # New high-performance engine!
    zonal_writer="csv",
    out_path=".",
    file_prefix="Oahu_slope_stats_exact"
)

start = time.perf_counter()
stats_exact = zonal_gen_exact.calculate_zonal(categorical=False)
exact_time = time.perf_counter() - start
print(f"\nexactextract completed in {exact_time:.4f} seconds")
print(stats_exact.head())

exactextract completed in 1.8695 seconds
               count       mean        std  min  25%  50%  75%  max  \
fid                                                                   
1104.0  20413.586391   1.874860   3.758853    0    0    0    1   32   
1105.0   2390.553052  26.282542  13.461160    0   14   23   36   61   
1106.0   3195.079310  33.641735  14.204321    1   22   34   44   70   
1107.0   6605.529560  23.720972  15.855884    0   10   21   37   66   
1108.0  11364.951446  22.139100  14.065037    0   11   20   33   62   

                  sum  
fid                    
1104.0   38272.622751  
1105.0   62829.811849  
1106.0  107488.011195  
1107.0  156689.583950  
1108.0  251609.791194  

Timing Comparison: All Engines#

Let’s compare the performance of all three engines on the same dataset:

timing_results = {}

# Serial engine
zonal_serial = ZonalGen(
    user_data=data_slope,
    zonal_engine="serial",
    zonal_writer="csv",
    out_path=".",
    file_prefix="timing_serial"
)
start = time.perf_counter()
_ = zonal_serial.calculate_zonal(categorical=False)
timing_results["serial"] = time.perf_counter() - start

# Parallel engine
zonal_parallel = ZonalGen(
    user_data=data_slope,
    zonal_engine="parallel",
    zonal_writer="csv",
    out_path=".",
    file_prefix="timing_parallel",
    jobs=4
)
start = time.perf_counter()
_ = zonal_parallel.calculate_zonal(categorical=False)
timing_results["parallel"] = time.perf_counter() - start

# ExactExtract engine
zonal_exact = ZonalGen(
    user_data=data_slope,
    zonal_engine="exactextract",
    zonal_writer="csv",
    out_path=".",
    file_prefix="timing_exact"
)
start = time.perf_counter()
_ = zonal_exact.calculate_zonal(categorical=False)
timing_results["exactextract"] = time.perf_counter() - start

# Display results
print("\n" + "="*50)
print("TIMING COMPARISON")
print("="*50)
for engine, t in timing_results.items():
    speedup = timing_results["serial"] / t
    print(f"{engine:15} : {t:8.4f} sec  ({speedup:5.1f}x vs serial)")
print("="*50)

==================================================
TIMING COMPARISON
==================================================
serial          :   2.1743 sec  (  1.0x vs serial)
parallel        :   4.2891 sec  (  0.5x vs serial)
exactextract    :   1.8850 sec  (  1.2x vs serial)
==================================================

Key Takeaways#

exactextract is the recommended engine — typically 10-100x faster than vector-based engines because it operates directly on raster blocks without constructing intersection geometries.
The serial, parallel, and dask engines are deprecated and will be removed in a future version.
Use exactextract for all new zonal statistics work (mean, sum, min, max, etc.).
Weighted zonal statistics via exactextract are planned (issue #94). Until then, WeightedZonalGen still uses the deprecated engines.

varname = "TEXT" # not currently used
categorical = True # is the data categorical or not, if the data are integers it should
                   # probably be categorical.
data = UserTiffData(
    source_var=varname,
    source_ds=rds_text,  # Use rds_text for categorical example
    source_crs=crs,
    source_x_coord=tx_name,
    source_y_coord=ty_name,
    band=1,
    bname=band,
    target_gdf=gdf,
    target_id=id_feature
)

zonal_gen = ZonalGen(
    user_data=data,
    zonal_engine="serial",
    zonal_writer="csv",
    out_path=".",
    file_prefix="Oahu_TEXT_stats"
)
stats = zonal_gen.calculate_zonal(categorical=categorical)
print(stats)

          0    1         2         3  count
fid                                        
0  0.0  0.0  0.654237  0.345763    295
0  0.0  0.0  0.882353  0.117647     34
0  0.0  0.0  0.979167  0.020833     48
0  0.0  0.0  0.191489  0.808511     94
0  0.0  0.0  0.197605  0.802395    167
...     ...  ...       ...       ...    ...
0  0.0  0.0  0.186047  0.813953     43
0  0.0  0.0  0.841818  0.158182    550
0  0.0  0.0  0.844660  0.155340    103
0  0.0  0.0  1.000000  0.000000     20
0  0.0  0.0  0.609756  0.390244     41

[304 rows x 5 columns]

# Compute the max category for each `fid`
max_category_df = stats.iloc[:, :-1].idxmax(axis=1).reset_index()
max_category_df.columns = ['fid', 'max_category']

# Merge the max category result into the GeoDataFrame
gdf = gdf.merge(max_category_df, on='fid', how='left')
gdf

	fid	rpu	cat_id	hru_segmen	geometry	slope_mean	max_category
0	1104.0	f	1104	995.0	POLYGON ((601200 2359860, 601180.267 2359853.7...	1.872155	2
1	1105.0	f	1105	996.0	POLYGON ((593760 2369060, 593675.2 2369014.8, ...	26.350921	2
2	1106.0	f	1106	997.0	POLYGON ((615055.367 2375324.633, 615130.633 2...	33.687030	2
3	1107.0	f	1107	998.0	POLYGON ((619150 2380290, 619096.633 2380263.3...	23.704091	3
4	1108.0	f	1108	999.0	POLYGON ((614640 2382950, 614628.833 2383001.1...	22.116169	3
...	...	...	...	...	...	...	...
299	1403.0	f	1403	1070.0	POLYGON ((620198.262 2355012.754, 620153.3 235...	1.722121	2
300	1404.0	f	1404	1029.0	POLYGON ((579986.031 2380993.933, 580280 23810...	1.363636	2
301	1405.0	f	1405	1146.0	POLYGON ((625200 2365570, 625207.872 2365555.4...	10.505747	2
302	1406.0	f	1406	1146.0	POLYGON ((625144.965 2365356.034, 625183.9 236...	10.931693	2
303	1407.0	f	1407	1233.0	POLYGON ((618131.067 2357417.7, 618105.367 235...	1.679128	2

304 rows × 7 columns

gdf.plot(column="max_category", legend=True)

<Axes: >

../../_images/323998122d4ce8b1bbb13773c9e9b3d1952e923e1d6603433c7c7acab0c07c1c.png

Parallel engine#

zonal_gen2 = ZonalGen(
    user_data=data,
    zonal_engine="parallel",
    zonal_writer="csv",
    out_path=".",
    file_prefix="Oahu_TEXT_stats",
    jobs=2
)
stats = zonal_gen2.calculate_zonal(categorical=categorical)
print(stats)

          0    1         2         3  count
fid                                        
0  0.0  0.0  0.654237  0.345763    295
0  0.0  0.0  0.882353  0.117647     34
0  0.0  0.0  0.979167  0.020833     48
0  0.0  0.0  0.191489  0.808511     94
0  0.0  0.0  0.197605  0.802395    167
...     ...  ...       ...       ...    ...
0  0.0  0.0  0.186047  0.813953     43
0  0.0  0.0  0.841818  0.158182    550
0  0.0  0.0  0.844660  0.155340    103
0  0.0  0.0  1.000000  0.000000     20
0  0.0  0.0  0.609756  0.390244     41

[304 rows x 5 columns]

# Compute the max category for each `fid`
max_category_df = stats.iloc[:, :-1].idxmax(axis=1).reset_index()
max_category_df.columns = ['fid', 'max_category_p']

gdf = gdf.merge(max_category_df, on='fid', how='left')
gdf

	fid	rpu	cat_id	hru_segmen	geometry	slope_mean	max_category	max_category_p
0	1104.0	f	1104	995.0	POLYGON ((601200 2359860, 601180.267 2359853.7...	1.872155	2	2
1	1105.0	f	1105	996.0	POLYGON ((593760 2369060, 593675.2 2369014.8, ...	26.350921	2	2
2	1106.0	f	1106	997.0	POLYGON ((615055.367 2375324.633, 615130.633 2...	33.687030	2	2
3	1107.0	f	1107	998.0	POLYGON ((619150 2380290, 619096.633 2380263.3...	23.704091	3	3
4	1108.0	f	1108	999.0	POLYGON ((614640 2382950, 614628.833 2383001.1...	22.116169	3	3
...	...	...	...	...	...	...	...	...
299	1403.0	f	1403	1070.0	POLYGON ((620198.262 2355012.754, 620153.3 235...	1.722121	2	2
300	1404.0	f	1404	1029.0	POLYGON ((579986.031 2380993.933, 580280 23810...	1.363636	2	2
301	1405.0	f	1405	1146.0	POLYGON ((625200 2365570, 625207.872 2365555.4...	10.505747	2	2
302	1406.0	f	1406	1146.0	POLYGON ((625144.965 2365356.034, 625183.9 236...	10.931693	2	2
303	1407.0	f	1407	1233.0	POLYGON ((618131.067 2357417.7, 618105.367 235...	1.679128	2	2

304 rows × 8 columns

gdf.plot(column="max_category_p", legend=True)

<Axes: >

ExactExtract Engine with Categorical Data#

The exactextract engine also supports categorical zonal statistics with the categorical=True parameter. For categorical data, it computes the count/area of each unique category within each polygon.

# Set up categorical data using TEXT raster
data_text = UserTiffData(
    source_var="TEXT",
    source_ds=rds_text,
    source_crs=crs,
    source_x_coord=tx_name,
    source_y_coord=ty_name,
    band=1,
    bname=band,
    target_gdf=gdf[[id_feature, "geometry"]].copy(),
    target_id=id_feature
)

# ExactExtract engine with categorical data
zonal_gen_exact_cat = ZonalGen(
    user_data=data_text,
    zonal_engine="exactextract",
    zonal_writer="csv",
    out_path=".",
    file_prefix="Oahu_TEXT_stats_exact"
)

start = time.perf_counter()
stats_exact_cat = zonal_gen_exact_cat.calculate_zonal(categorical=True)
exact_cat_time = time.perf_counter() - start
print(f"\nexactextract (categorical) completed in {exact_cat_time:.4f} seconds")
print(stats_exact_cat.head())

exactextract (categorical) completed in 1.5885 seconds
          0    1         2         3  count
fid                                        
1104.0  0.0  0.0  0.653648  0.346352    294
1105.0  0.0  0.0  0.857603  0.142397     34
1106.0  0.0  0.0  0.985308  0.014692     46
1107.0  0.0  0.0  0.199234  0.800766     95
1108.0  0.0  0.0  0.198175  0.801825    164

# Compute the max category for each polygon
max_category_exact = stats_exact_cat.iloc[:, :-1].idxmax(axis=1).reset_index()
max_category_exact.columns = ['fid', 'max_category_exact']

# Merge into GeoDataFrame and plot
gdf = gdf.merge(max_category_exact, on='fid', how='left')
gdf.plot(column="max_category_exact", legend=True)

<Axes: >

../../_images/f11ea5b1f5d11350ae2c60262902ef1b7bbabbb2828e4acc7ee2212a4519018e.png