Zonal statistics of rasters#

Deprecation Notice: The serial, parallel, and dask zonal engines are deprecated and will be removed in a future version. Use zonal_engine="exactextract" instead — it provides better performance and handles large datasets with bounded memory. See the ExactExtract Engine section below.

import geopandas as gpd
import pandas as pd
import rioxarray as rxr
import xarray as xr

from gdptools import ZonalGen
from gdptools.data.user_data import UserTiffData
# Be sure to use rioxarray to read tiff
rds_slope = rxr.open_rasterio("../../../tests/data/rasters/slope/slope.tif")
rds_text = rxr.open_rasterio("../../../tests/data/rasters/TEXT_PRMS/TEXT_PRMS.tif")
# Use geopandas to read shape file,
# Best if shape file only has feature id, and geometry if possible.
gdf = gpd.read_file("../../../tests/data/Oahu.shp")
id_feature = "fid"
print(len(gdf.groupby(id_feature)))
# These params are used to fill out the TiffAttributes class
tx_name= 'x'
ty_name = 'y'
band = 'band'
crs = 26904
varname = "slope" # not currently used
categorical = False # is the data categorical or not, if the data are integers it should
                   # probably be categorical.
data = UserTiffData(
    source_var=varname,
    source_ds=rds_slope,
    source_crs=crs,
    source_x_coord=tx_name,
    source_y_coord=ty_name,
    band=1,
    bname=band,
    target_gdf=gdf,
    target_id=id_feature
)

zonal_gen = ZonalGen(
    user_data=data,
    zonal_engine="serial",
    zonal_writer="csv",
    out_path=".",
    file_prefix="Oahu_slope_stats"
)
stats = zonal_gen.calculate_zonal(categorical=categorical)
print(stats)

# stats = zonal_gen.calculate_zonal(categorical=True)
# print(stats)
304
          count       mean        std  min    25%   50%   75%   max     sum
fid                                                                        
1104.0  20431.0   1.872155   3.757167  0.0   0.00   0.0   1.0  32.0   38250
1105.0   2388.0  26.350921  13.473957  5.0  14.00  24.0  37.0  61.0   62926
1106.0   3192.0  33.687030  14.188686  1.0  22.00  34.0  45.0  70.0  107529
1107.0   6600.0  23.704091  15.854788  0.0  10.00  21.0  37.0  66.0  156447
1108.0  11380.0  22.116169  14.062708  0.0  11.00  21.0  33.0  62.0  251682
...         ...        ...        ...  ...    ...   ...   ...   ...     ...
1403.0   2753.0   1.722121   3.183146  0.0   0.00   1.0   1.0  23.0    4741
1404.0     99.0   1.363636   1.082840  0.0   0.00   2.0   2.0   3.0     135
1405.0    174.0  10.505747   7.044653  0.0   5.25   9.0  14.0  32.0    1828
1406.0   3089.0  10.931693  10.120911  0.0   4.00   7.0  15.0  59.0   33768
1407.0    321.0   1.679128   1.702893  0.0   1.00   1.0   2.0   9.0     539

[304 rows x 9 columns]
gdf.sort_values(by=id_feature, inplace=True)
stats.sort_values(by=id_feature, inplace=True)
gdf["slope_mean"] = stats["mean"].values
gdf
fid rpu cat_id hru_segmen geometry slope_mean
0 1104.0 f 1104 995.0 POLYGON ((601200 2359860, 601180.267 2359853.7... 1.872155
1 1105.0 f 1105 996.0 POLYGON ((593760 2369060, 593675.2 2369014.8, ... 26.350921
2 1106.0 f 1106 997.0 POLYGON ((615055.367 2375324.633, 615130.633 2... 33.687030
3 1107.0 f 1107 998.0 POLYGON ((619150 2380290, 619096.633 2380263.3... 23.704091
4 1108.0 f 1108 999.0 POLYGON ((614640 2382950, 614628.833 2383001.1... 22.116169
... ... ... ... ... ... ...
299 1403.0 f 1403 1070.0 POLYGON ((620198.262 2355012.754, 620153.3 235... 1.722121
300 1404.0 f 1404 1029.0 POLYGON ((579986.031 2380993.933, 580280 23810... 1.363636
301 1405.0 f 1405 1146.0 POLYGON ((625200 2365570, 625207.872 2365555.4... 10.505747
302 1406.0 f 1406 1146.0 POLYGON ((625144.965 2365356.034, 625183.9 236... 10.931693
303 1407.0 f 1407 1233.0 POLYGON ((618131.067 2357417.7, 618105.367 235... 1.679128

304 rows × 6 columns

gdf.plot(column="slope_mean", legend=True)
<Axes: >
../../_images/18d237f085f0c52103453a0905c01816234ed789189b24621c532692066bb72e.png

Parallel generator#

In this case the generator is slower because the domain is small and the overhead of generating the parallel processes takes more time than the actual calculation. However with a large domain the “parallel” zonal_engine is faster and has a smaller memory footprint.

zonal_gen2 = ZonalGen(
    user_data=data,
    zonal_engine="parallel",
    zonal_writer="csv",
    out_path=".",
    file_prefix="Oahu_slope_stats_p",
    jobs=2
)
stats = zonal_gen2.calculate_zonal(categorical=categorical)
print(stats)
          count       mean        std  min    25%   50%   75%   max     sum
fid                                                                        
1104.0  20431.0   1.872155   3.757167  0.0   0.00   0.0   1.0  32.0   38250
1105.0   2388.0  26.350921  13.473957  5.0  14.00  24.0  37.0  61.0   62926
1106.0   3192.0  33.687030  14.188686  1.0  22.00  34.0  45.0  70.0  107529
1107.0   6600.0  23.704091  15.854788  0.0  10.00  21.0  37.0  66.0  156447
1108.0  11380.0  22.116169  14.062708  0.0  11.00  21.0  33.0  62.0  251682
...         ...        ...        ...  ...    ...   ...   ...   ...     ...
1403.0   2753.0   1.722121   3.183146  0.0   0.00   1.0   1.0  23.0    4741
1404.0     99.0   1.363636   1.082840  0.0   0.00   2.0   2.0   3.0     135
1405.0    174.0  10.505747   7.044653  0.0   5.25   9.0  14.0  32.0    1828
1406.0   3089.0  10.931693  10.120911  0.0   4.00   7.0  15.0  59.0   33768
1407.0    321.0   1.679128   1.702893  0.0   1.00   1.0   2.0   9.0     539

[304 rows x 9 columns]
gdf.sort_values(by=id_feature, inplace=True)
stats.sort_values(by=id_feature, inplace=True)
gdf["slope_mean"] = stats["mean"].values
gdf
fid rpu cat_id hru_segmen geometry slope_mean
0 1104.0 f 1104 995.0 POLYGON ((601200 2359860, 601180.267 2359853.7... 1.872155
1 1105.0 f 1105 996.0 POLYGON ((593760 2369060, 593675.2 2369014.8, ... 26.350921
2 1106.0 f 1106 997.0 POLYGON ((615055.367 2375324.633, 615130.633 2... 33.687030
3 1107.0 f 1107 998.0 POLYGON ((619150 2380290, 619096.633 2380263.3... 23.704091
4 1108.0 f 1108 999.0 POLYGON ((614640 2382950, 614628.833 2383001.1... 22.116169
... ... ... ... ... ... ...
299 1403.0 f 1403 1070.0 POLYGON ((620198.262 2355012.754, 620153.3 235... 1.722121
300 1404.0 f 1404 1029.0 POLYGON ((579986.031 2380993.933, 580280 23810... 1.363636
301 1405.0 f 1405 1146.0 POLYGON ((625200 2365570, 625207.872 2365555.4... 10.505747
302 1406.0 f 1406 1146.0 POLYGON ((625144.965 2365356.034, 625183.9 236... 10.931693
303 1407.0 f 1407 1233.0 POLYGON ((618131.067 2357417.7, 618105.367 235... 1.679128

304 rows × 6 columns

gdf.plot(column="slope_mean", legend=True)
<Axes: >
../../_images/18d237f085f0c52103453a0905c01816234ed789189b24621c532692066bb72e.png

ExactExtract Engine (High Performance)#

gdptools now supports the exactextract engine, which provides 10-100x faster zonal statistics by operating directly on raster data without converting to vector polygons.

Engine Comparison#

Engine

Status

Description

Best For

Weighted Stats

exactextract

Recommended

Raster-native C++ library

Any size, fastest

Planned (#94)

serial

Deprecated

Single-threaded, vector-based

Small datasets, debugging

Yes

parallel

Deprecated

Multi-threaded via joblib

Medium datasets

Yes

dask

Deprecated

Distributed via Dask cluster

Very large datasets

Yes

Why is exactextract faster?#

  1. Raster-native: Operates directly on raster blocks without creating polygon geometries

  2. Coverage fractions: Computes cell coverage analytically (no expensive intersection calls)

  3. Streaming: Processes raster blocks on-demand without full array materialization

  4. C++ backend: Core algorithms implemented in optimized C++

Note: exactextract does not yet support weighted zonal statistics. Weighted support via exactextract’s built-in coverage fractions is planned in issue #94. Until then, the deprecated engines remain available for WeightedZonalGen.

import time

# Reload data for fair comparison
data_slope = UserTiffData(
    source_var="slope",
    source_ds=rds_slope,
    source_crs=crs,
    source_x_coord=tx_name,
    source_y_coord=ty_name,
    band=1,
    bname=band,
    target_gdf=gdf[[id_feature, "geometry"]].copy(),
    target_id=id_feature
)

# ExactExtract engine - fastest option
zonal_gen_exact = ZonalGen(
    user_data=data_slope,
    zonal_engine="exactextract",  # New high-performance engine!
    zonal_writer="csv",
    out_path=".",
    file_prefix="Oahu_slope_stats_exact"
)

start = time.perf_counter()
stats_exact = zonal_gen_exact.calculate_zonal(categorical=False)
exact_time = time.perf_counter() - start
print(f"\nexactextract completed in {exact_time:.4f} seconds")
print(stats_exact.head())
exactextract completed in 1.8041 seconds
               count       mean        std  min  25%  50%  75%  max  \
fid                                                                   
1104.0  20413.586391   1.874860   3.758853    0    0    0    1   32   
1105.0   2390.553052  26.282542  13.461160    0   14   23   36   61   
1106.0   3195.079310  33.641735  14.204321    1   22   34   44   70   
1107.0   6605.529560  23.720972  15.855884    0   10   21   37   66   
1108.0  11364.951446  22.139100  14.065037    0   11   20   33   62   

                  sum  
fid                    
1104.0   38272.622751  
1105.0   62829.811849  
1106.0  107488.011195  
1107.0  156689.583950  
1108.0  251609.791194  

Timing Comparison: All Engines#

Let’s compare the performance of all three engines on the same dataset:

timing_results = {}

# Serial engine
zonal_serial = ZonalGen(
    user_data=data_slope,
    zonal_engine="serial",
    zonal_writer="csv",
    out_path=".",
    file_prefix="timing_serial"
)
start = time.perf_counter()
_ = zonal_serial.calculate_zonal(categorical=False)
timing_results["serial"] = time.perf_counter() - start

# Parallel engine
zonal_parallel = ZonalGen(
    user_data=data_slope,
    zonal_engine="parallel",
    zonal_writer="csv",
    out_path=".",
    file_prefix="timing_parallel",
    jobs=4
)
start = time.perf_counter()
_ = zonal_parallel.calculate_zonal(categorical=False)
timing_results["parallel"] = time.perf_counter() - start

# ExactExtract engine
zonal_exact = ZonalGen(
    user_data=data_slope,
    zonal_engine="exactextract",
    zonal_writer="csv",
    out_path=".",
    file_prefix="timing_exact"
)
start = time.perf_counter()
_ = zonal_exact.calculate_zonal(categorical=False)
timing_results["exactextract"] = time.perf_counter() - start

# Display results
print("\n" + "="*50)
print("TIMING COMPARISON")
print("="*50)
for engine, t in timing_results.items():
    speedup = timing_results["serial"] / t
    print(f"{engine:15} : {t:8.4f} sec  ({speedup:5.1f}x vs serial)")
print("="*50)
==================================================
TIMING COMPARISON
==================================================
serial          :   2.1343 sec  (  1.0x vs serial)
parallel        :   4.1300 sec  (  0.5x vs serial)
exactextract    :   1.8048 sec  (  1.2x vs serial)
==================================================

Key Takeaways#

  • exactextract is the recommended engine — typically 10-100x faster than vector-based engines because it operates directly on raster blocks without constructing intersection geometries.

  • The serial, parallel, and dask engines are deprecated and will be removed in a future version.

  • Use exactextract for all new zonal statistics work (mean, sum, min, max, etc.).

  • Weighted zonal statistics via exactextract are planned (issue #94). Until then, WeightedZonalGen still uses the deprecated engines.

varname = "TEXT" # not currently used
categorical = True # is the data categorical or not, if the data are integers it should
                   # probably be categorical.
data = UserTiffData(
    source_var=varname,
    source_ds=rds_text,  # Use rds_text for categorical example
    source_crs=crs,
    source_x_coord=tx_name,
    source_y_coord=ty_name,
    band=1,
    bname=band,
    target_gdf=gdf,
    target_id=id_feature
)

zonal_gen = ZonalGen(
    user_data=data,
    zonal_engine="serial",
    zonal_writer="csv",
    out_path=".",
    file_prefix="Oahu_TEXT_stats"
)
stats = zonal_gen.calculate_zonal(categorical=categorical)
print(stats)
          0    1         2         3  count
fid                                        
1104.0  0.0  0.0  0.654237  0.345763    295
1105.0  0.0  0.0  0.882353  0.117647     34
1106.0  0.0  0.0  0.979167  0.020833     48
1107.0  0.0  0.0  0.191489  0.808511     94
1108.0  0.0  0.0  0.197605  0.802395    167
...     ...  ...       ...       ...    ...
1298.0  0.0  0.0  0.186047  0.813953     43
1307.0  0.0  0.0  0.841818  0.158182    550
1377.0  0.0  0.0  0.844660  0.155340    103
1389.0  0.0  0.0  1.000000  0.000000     20
1401.0  0.0  0.0  0.609756  0.390244     41

[304 rows x 5 columns]
# Compute the max category for each `fid`
max_category_df = stats.iloc[:, :-1].idxmax(axis=1).reset_index()
max_category_df.columns = ['fid', 'max_category']
# Merge the max category result into the GeoDataFrame
gdf = gdf.merge(max_category_df, on='fid', how='left')
gdf
fid rpu cat_id hru_segmen geometry slope_mean max_category
0 1104.0 f 1104 995.0 POLYGON ((601200 2359860, 601180.267 2359853.7... 1.872155 2
1 1105.0 f 1105 996.0 POLYGON ((593760 2369060, 593675.2 2369014.8, ... 26.350921 2
2 1106.0 f 1106 997.0 POLYGON ((615055.367 2375324.633, 615130.633 2... 33.687030 2
3 1107.0 f 1107 998.0 POLYGON ((619150 2380290, 619096.633 2380263.3... 23.704091 3
4 1108.0 f 1108 999.0 POLYGON ((614640 2382950, 614628.833 2383001.1... 22.116169 3
... ... ... ... ... ... ... ...
299 1403.0 f 1403 1070.0 POLYGON ((620198.262 2355012.754, 620153.3 235... 1.722121 2
300 1404.0 f 1404 1029.0 POLYGON ((579986.031 2380993.933, 580280 23810... 1.363636 2
301 1405.0 f 1405 1146.0 POLYGON ((625200 2365570, 625207.872 2365555.4... 10.505747 2
302 1406.0 f 1406 1146.0 POLYGON ((625144.965 2365356.034, 625183.9 236... 10.931693 2
303 1407.0 f 1407 1233.0 POLYGON ((618131.067 2357417.7, 618105.367 235... 1.679128 2

304 rows × 7 columns

gdf.plot(column="max_category", legend=True)
<Axes: >
../../_images/323998122d4ce8b1bbb13773c9e9b3d1952e923e1d6603433c7c7acab0c07c1c.png

Parallel engine#

zonal_gen2 = ZonalGen(
    user_data=data,
    zonal_engine="parallel",
    zonal_writer="csv",
    out_path=".",
    file_prefix="Oahu_TEXT_stats",
    jobs=2
)
stats = zonal_gen2.calculate_zonal(categorical=categorical)
print(stats)
          0    1         2         3  count
fid                                        
1104.0  0.0  0.0  0.654237  0.345763    295
1105.0  0.0  0.0  0.882353  0.117647     34
1106.0  0.0  0.0  0.979167  0.020833     48
1107.0  0.0  0.0  0.191489  0.808511     94
1108.0  0.0  0.0  0.197605  0.802395    167
...     ...  ...       ...       ...    ...
1298.0  0.0  0.0  0.186047  0.813953     43
1307.0  0.0  0.0  0.841818  0.158182    550
1377.0  0.0  0.0  0.844660  0.155340    103
1389.0  0.0  0.0  1.000000  0.000000     20
1401.0  0.0  0.0  0.609756  0.390244     41

[304 rows x 5 columns]
# Compute the max category for each `fid`
max_category_df = stats.iloc[:, :-1].idxmax(axis=1).reset_index()
max_category_df.columns = ['fid', 'max_category_p']

gdf = gdf.merge(max_category_df, on='fid', how='left')
gdf
fid rpu cat_id hru_segmen geometry slope_mean max_category max_category_p
0 1104.0 f 1104 995.0 POLYGON ((601200 2359860, 601180.267 2359853.7... 1.872155 2 2
1 1105.0 f 1105 996.0 POLYGON ((593760 2369060, 593675.2 2369014.8, ... 26.350921 2 2
2 1106.0 f 1106 997.0 POLYGON ((615055.367 2375324.633, 615130.633 2... 33.687030 2 2
3 1107.0 f 1107 998.0 POLYGON ((619150 2380290, 619096.633 2380263.3... 23.704091 3 3
4 1108.0 f 1108 999.0 POLYGON ((614640 2382950, 614628.833 2383001.1... 22.116169 3 3
... ... ... ... ... ... ... ... ...
299 1403.0 f 1403 1070.0 POLYGON ((620198.262 2355012.754, 620153.3 235... 1.722121 2 2
300 1404.0 f 1404 1029.0 POLYGON ((579986.031 2380993.933, 580280 23810... 1.363636 2 2
301 1405.0 f 1405 1146.0 POLYGON ((625200 2365570, 625207.872 2365555.4... 10.505747 2 2
302 1406.0 f 1406 1146.0 POLYGON ((625144.965 2365356.034, 625183.9 236... 10.931693 2 2
303 1407.0 f 1407 1233.0 POLYGON ((618131.067 2357417.7, 618105.367 235... 1.679128 2 2

304 rows × 8 columns

gdf.plot(column="max_category_p", legend=True)
<Axes: >
../../_images/323998122d4ce8b1bbb13773c9e9b3d1952e923e1d6603433c7c7acab0c07c1c.png

ExactExtract Engine with Categorical Data#

The exactextract engine also supports categorical zonal statistics with the categorical=True parameter. For categorical data, it computes the count/area of each unique category within each polygon.

# Set up categorical data using TEXT raster
data_text = UserTiffData(
    source_var="TEXT",
    source_ds=rds_text,
    source_crs=crs,
    source_x_coord=tx_name,
    source_y_coord=ty_name,
    band=1,
    bname=band,
    target_gdf=gdf[[id_feature, "geometry"]].copy(),
    target_id=id_feature
)

# ExactExtract engine with categorical data
zonal_gen_exact_cat = ZonalGen(
    user_data=data_text,
    zonal_engine="exactextract",
    zonal_writer="csv",
    out_path=".",
    file_prefix="Oahu_TEXT_stats_exact"
)

start = time.perf_counter()
stats_exact_cat = zonal_gen_exact_cat.calculate_zonal(categorical=True)
exact_cat_time = time.perf_counter() - start
print(f"\nexactextract (categorical) completed in {exact_cat_time:.4f} seconds")
print(stats_exact_cat.head())
exactextract (categorical) completed in 1.5559 seconds
          0    1         2         3  count
fid                                        
1104.0  0.0  0.0  0.653648  0.346352    294
1105.0  0.0  0.0  0.857603  0.142397     34
1106.0  0.0  0.0  0.985308  0.014692     46
1107.0  0.0  0.0  0.199234  0.800766     95
1108.0  0.0  0.0  0.198175  0.801825    164
# Compute the max category for each polygon
max_category_exact = stats_exact_cat.iloc[:, :-1].idxmax(axis=1).reset_index()
max_category_exact.columns = ['fid', 'max_category_exact']

# Merge into GeoDataFrame and plot
gdf = gdf.merge(max_category_exact, on='fid', how='left')
gdf.plot(column="max_category_exact", legend=True)
<Axes: >
../../_images/f11ea5b1f5d11350ae2c60262902ef1b7bbabbb2828e4acc7ee2212a4519018e.png