"""Calculate zonal statistics for raster data.
This module provides classes for computing zonal statistics on raster data,
including both standard and weighted zonal statistics calculations. It supports
various computational engines (serial, parallel, Dask, and exactextract) and
output formats.
Classes:
ZonalGen: Calculate standard zonal statistics for raster data.
WeightedZonalGen: Calculate area-weighted zonal statistics for raster data.
Notes:
This module requires that the user data has been properly preprocessed
using the appropriate data preparation tools in the gdptools package.
The "exactextract" engine provides significantly better performance for
non-weighted zonal statistics by operating directly on raster data.
"""
import logging
import time
import warnings
from datetime import datetime
from pathlib import Path
from typing import Literal
import pandas as pd
from pyproj import CRS
from gdptools.agg.zonal_engines import (
ZonalEngineDask,
ZonalEngineExactExtract,
ZonalEngineParallel,
ZonalEngineSerial,
)
from gdptools.data.user_data import UserData
logger = logging.getLogger(__name__)
ZONAL_ENGINES = Literal["serial", "parallel", "dask", "exactextract"]
"""Literal type alias for the zonal computation engines.
Options:
serial: Perform zonal calculations in a serial manner.
.. deprecated:: Use ``exactextract`` instead.
parallel: Perform zonal calculations in parallel using multiple processes.
.. deprecated:: Use ``exactextract`` instead.
dask: Perform zonal calculations using Dask for distributed computing.
.. deprecated:: Use ``exactextract`` instead.
exactextract: High-performance raster-native zonal statistics using exactextract.
This is the recommended engine. It only supports non-weighted statistics
currently; weighted support is planned (issue #94).
"""
ZONAL_WRITERS = Literal["csv"]
"""Literal type alias for the zonal statistics output formats.
Options:
csv: Write zonal statistics to a CSV file.
"""
[docs]
class ZonalGen:
"""Calculate standard zonal statistics for raster data.
This class provides functionality to compute zonal statistics (mean, sum, count,
etc.) for raster data within defined zones. It supports multiple computational
engines for flexibility and performance optimization.
The class handles the entire workflow from data preparation to result output,
including options for different computational backends and output formats.
Examples:
Recommended usage with exactextract engine:
>>> from gdptools.zonal_gen import ZonalGen
>>> from gdptools.data.user_data import UserData
>>>
>>> # Create zonal statistics generator
>>> zonal_gen = ZonalGen(
... user_data=user_data,
... zonal_engine="exactextract",
... zonal_writer="csv",
... out_path="/path/to/output",
... file_prefix="zonal_stats"
... )
>>>
>>> # Calculate zonal statistics
>>> stats = zonal_gen.calculate_zonal()
Using a deprecated engine (serial/parallel/dask emit a FutureWarning):
>>> zonal_gen = ZonalGen(
... user_data=user_data,
... zonal_engine="serial", # deprecated, use "exactextract"
... zonal_writer="csv",
... out_path="/path/to/output",
... file_prefix="zonal_stats",
... append_date=True,
... )
>>> stats = zonal_gen.calculate_zonal()
Attributes:
agg: The zonal engine instance used for calculations.
precision: Number of decimal places for output statistics.
"""
[docs]
def __init__(
self,
user_data: UserData,
zonal_engine: ZONAL_ENGINES,
zonal_writer: ZONAL_WRITERS,
out_path: str | None = None,
file_prefix: str | None = None,
append_date: bool = False,
precision: int | None = None,
jobs: int = 1,
) -> None:
"""Initialize ZonalGen for calculating zonal statistics.
Args:
user_data: An instance of UserData containing the preprocessed raster
and vector data for zonal analysis.
zonal_engine: The computational engine to use for zonal calculations.
Options: "exactextract" (recommended), "serial", "parallel", or
"dask". The serial, parallel, and dask engines are deprecated
and will be removed in a future version.
zonal_writer: The output format for zonal statistics. Currently
supports "csv".
out_path: Directory path where output files will be saved. If None,
output will not be written to disk.
file_prefix: Prefix for the output filename. If None, no prefix
will be used.
append_date: Whether to append current timestamp to the filename.
Creates filenames like "2024_01_15_14_30_00_prefix.csv".
precision: Number of decimal places for output statistics. If None,
no rounding is applied.
jobs: Number of parallel jobs for computation. Only used with
"parallel" or "dask" engines.
Raises:
FileNotFoundError: If the specified output path does not exist.
TypeError: If an invalid zonal engine is specified.
Notes:
The user_data should contain properly aligned raster and vector data
with matching coordinate reference systems.
"""
self._user_data = user_data
self._zonal_engine = zonal_engine
self._zonal_writer = zonal_writer
self._jobs = jobs
self._out_path = Path(out_path)
if not self._out_path.exists():
raise FileNotFoundError(f"Path: {self._out_path} does not exist")
self._file_prefix = file_prefix
self._append_date = append_date
if self._append_date:
self._fdate = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
self._fname = f"{self._fdate}_{self._file_prefix}"
else:
self._fname = f"{self._file_prefix}"
self.precision = precision
self.agg: ZonalEngineSerial | ZonalEngineParallel | ZonalEngineDask | ZonalEngineExactExtract
if self._zonal_engine == "serial":
warnings.warn(
"The 'serial' zonal engine is deprecated and will be removed in a future version. "
"Use zonal_engine='exactextract' instead, which provides better performance "
"and handles large datasets with bounded memory.",
FutureWarning,
stacklevel=2,
)
self.agg = ZonalEngineSerial()
elif self._zonal_engine == "parallel":
warnings.warn(
"The 'parallel' zonal engine is deprecated and will be removed in a future version. "
"Use zonal_engine='exactextract' instead, which provides better performance "
"and handles large datasets with bounded memory.",
FutureWarning,
stacklevel=2,
)
self.agg = ZonalEngineParallel()
elif self._zonal_engine == "dask":
warnings.warn(
"The 'dask' zonal engine is deprecated and will be removed in a future version. "
"Use zonal_engine='exactextract' instead, which provides better performance "
"and handles large datasets with bounded memory.",
FutureWarning,
stacklevel=2,
)
self.agg = ZonalEngineDask()
elif self._zonal_engine == "exactextract":
self.agg = ZonalEngineExactExtract()
else:
raise TypeError(f"agg_engine: {self._zonal_engine} not in {ZONAL_ENGINES}")
[docs]
def calculate_zonal(self, categorical: bool = False) -> pd.DataFrame:
"""Calculate zonal statistics for raster data.
Computes zonal statistics (mean, sum, count, etc.) for raster data within
defined zones using the specified computational engine. Results are
automatically saved to the configured output path if specified.
Args:
categorical: Whether to calculate categorical statistics (mode, unique
counts) or continuous statistics (mean, sum, std, etc.). For
categorical data like land cover classifications, set to True.
Returns:
pandas.DataFrame: Calculated zonal statistics with zones as rows and
statistics as columns. The exact columns depend on the data type and
whether `categorical=True`.
Notes:
The method automatically rounds results to the specified precision if set
during initialization. Execution time is printed to stdout for performance
monitoring.
Examples:
Calculate continuous zonal statistics:
>>> stats = zonal_gen.calculate_zonal(categorical=False)
>>> print(stats.columns) # doctest: +SKIP
Index(['zone_id', 'mean', 'sum', 'count', 'std', 'min', 'max'])
Calculate categorical zonal statistics:
>>> stats = zonal_gen.calculate_zonal(categorical=True)
>>> print(stats.columns) # doctest: +SKIP
Index(['zone_id', 'mode', 'unique_count', 'majority_percent'])
"""
tstrt = time.perf_counter()
stats = self.agg.calc_zonal_from_aggdata(user_data=self._user_data, categorical=categorical, jobs=self._jobs)
if self.precision is not None:
stats = stats.round(self.precision)
if self._zonal_writer == "csv":
fullpath = self._out_path / f"{self._fname}.csv"
stats.to_csv(fullpath, sep=",")
tend = time.perf_counter()
logger.info(f"Total time for zonal stats calculation {tend - tstrt:0.4f} seconds")
return stats
# elif self._zonal_writer == "feather":
# fullpath = self._out_path / f"{self._fname}"
# stats.to_feather(path=fullpath, )
[docs]
class WeightedZonalGen:
"""Calculate area-weighted zonal statistics for raster data.
This class extends standard zonal statistics by incorporating area weights
based on the intersection of raster cells with zone boundaries. This is
particularly important when working with data in geographic coordinate
systems or when raster cells are partially covered by zones.
The weighting is based on the specified CRS and accounts for the actual
area of intersection between raster cells and zone polygons, providing
more accurate statistics for geographic analyses.
Note:
The serial, parallel, and dask engines are deprecated and will be removed
in a future version. Weighted zonal statistics support via the exactextract
engine is planned (issue #94).
Examples:
Basic weighted zonal statistics (deprecated engines):
>>> from gdptools.zonal_gen import WeightedZonalGen
>>> from pyproj import CRS
>>>
>>> # Create weighted zonal statistics generator
>>> # Note: serial/parallel/dask engines are deprecated; exactextract
>>> # weighted support is planned (issue #94).
>>> weighted_gen = WeightedZonalGen(
... user_data=user_data,
... weight_gen_crs=CRS.from_epsg(4326),
... zonal_engine="parallel", # deprecated
... zonal_writer="csv",
... out_path="/path/to/output",
... file_prefix="weighted_stats",
... jobs=4
... )
>>>
>>> # Calculate weighted zonal statistics
>>> stats = weighted_gen.calculate_weighted_zonal()
Using with equal-area projection for accurate area calculations:
>>> # Use an equal-area projection for more accurate area weighting
>>> albers_crs = CRS.from_proj4(
... "+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 +lon_0=-96"
... )
>>> weighted_gen = WeightedZonalGen(
... user_data=user_data,
... weight_gen_crs=albers_crs,
... zonal_engine="dask", # deprecated
... zonal_writer="csv",
... out_path="/path/to/output",
... precision=3
... )
>>> stats = weighted_gen.calculate_weighted_zonal()
Attributes:
agg: The zonal engine instance used for calculations.
precision: Number of decimal places for output statistics.
"""
[docs]
def __init__(
self,
user_data: UserData,
weight_gen_crs: str | int | CRS,
zonal_engine: ZONAL_ENGINES,
zonal_writer: ZONAL_WRITERS,
out_path: str | None = None,
file_prefix: str | None = None,
append_date: bool = False,
precision: int | None = None,
jobs: int = 1,
) -> None:
"""Initialize WeightedZonalGen for calculating area-weighted zonal statistics.
Args:
user_data: An instance of UserData containing the preprocessed raster
and vector data for zonal analysis.
weight_gen_crs: The coordinate reference system for area weight
calculations. Can be an EPSG code (int), proj4 string (str),
or pyproj CRS object. Should be an appropriate equal-area
projection for accurate area calculations.
zonal_engine: The computational engine to use for zonal calculations.
Options: "serial", "parallel", or "dask". All three are
deprecated and will be removed in a future version.
Weighted zonal statistics support via exactextract is planned
(issue #94).
zonal_writer: The output format for zonal statistics. Currently
supports "csv".
out_path: Directory path where output files will be saved. If None,
output will not be written to disk.
file_prefix: Prefix for the output filename. If None, no prefix
will be used.
append_date: Whether to append current timestamp to the filename.
Creates filenames like "2024_01_15_14_30_00_prefix.csv".
precision: Number of decimal places for output statistics. If None,
no rounding is applied.
jobs: Number of parallel jobs for computation. Only used with
"parallel" or "dask" engines.
Raises:
FileNotFoundError: If the specified output path does not exist.
TypeError: If an invalid zonal engine is specified.
Notes:
For most accurate area-weighted calculations, use an equal-area
projection appropriate for your study region. Geographic coordinate
systems (like EPSG:4326) can introduce area distortions.
"""
self._user_data = user_data
self._weight_gen_crs = weight_gen_crs
self._zonal_engine = zonal_engine
self._zonal_writer = zonal_writer
self._jobs = jobs
self._out_path = Path(out_path)
if not self._out_path.exists():
raise FileNotFoundError(f"Path: {self._out_path} does not exist")
self._file_prefix = file_prefix
self._append_date = append_date
if self._append_date:
self._fdate = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
self._fname = f"{self._fdate}_{self._file_prefix}"
else:
self._fname = f"{self._file_prefix}"
self.precision = precision
self.agg: ZonalEngineSerial | ZonalEngineParallel | ZonalEngineDask
if self._zonal_engine == "exactextract":
raise TypeError(
"exactextract engine does not yet support weighted zonal statistics. "
"Use 'serial', 'parallel', or 'dask' instead (deprecated). "
"Weighted support via exactextract is planned (issue #94)."
)
elif self._zonal_engine == "serial":
warnings.warn(
"The 'serial' zonal engine is deprecated and will be removed in a future version. "
"Weighted zonal stats support via exactextract is planned (issue #94).",
FutureWarning,
stacklevel=2,
)
self.agg = ZonalEngineSerial()
elif self._zonal_engine == "parallel":
warnings.warn(
"The 'parallel' zonal engine is deprecated and will be removed in a future version. "
"Weighted zonal stats support via exactextract is planned (issue #94).",
FutureWarning,
stacklevel=2,
)
self.agg = ZonalEngineParallel()
elif self._zonal_engine == "dask":
warnings.warn(
"The 'dask' zonal engine is deprecated and will be removed in a future version. "
"Weighted zonal stats support via exactextract is planned (issue #94).",
FutureWarning,
stacklevel=2,
)
self.agg = ZonalEngineDask()
else:
raise TypeError(f"agg_engine: {self._zonal_engine} not in {ZONAL_ENGINES}")
[docs]
def calculate_weighted_zonal(self, categorical: bool = False) -> pd.DataFrame:
"""Calculate area-weighted zonal statistics for raster data.
Computes zonal statistics weighted by the actual area of intersection
between raster cells and zone boundaries. This provides more accurate
statistics when raster cells are partially covered by zones or when
working with geographic coordinate systems.
Args:
categorical: Whether to calculate categorical statistics (mode, unique
counts) or continuous statistics (mean, sum, std, etc.). For
categorical data like land cover classifications, set to True.
Returns:
A DataFrame containing the calculated area-weighted zonal statistics
with zones as rows and statistics as columns. The exact columns
depend on the data type and whether categorical=True.
Notes:
The method automatically rounds results to the specified precision
if set during initialization. Execution time is printed to stdout
for performance monitoring.
Examples:
Calculate weighted continuous statistics:
>>> stats = weighted_gen.calculate_weighted_zonal(categorical=False)
>>> print(stats.columns) # doctest: +SKIP
Index(['zone_id', 'weighted_mean', 'weighted_sum', 'total_area'])
Calculate weighted categorical statistics:
>>> stats = weighted_gen.calculate_weighted_zonal(categorical=True)
>>> print(stats.columns) # doctest: +SKIP
Index(['zone_id', 'weighted_mode', 'area_weighted_majority'])
"""
tstrt = time.perf_counter()
stats = self.agg.calc_weights_zonal_from_aggdata(
user_data=self._user_data, crs=self._weight_gen_crs, categorical=categorical, jobs=self._jobs
)
if self.precision is not None:
stats = stats.round(self.precision)
if self._zonal_writer == "csv":
fullpath = self._out_path / f"{self._fname}.csv"
stats.to_csv(fullpath, sep=",")
tend = time.perf_counter()
logger.info(f"Total time for weighted zonal stats calculation {tend - tstrt:0.4f} seconds")
return stats
# elif self._zonal_writer == "feather":
# fullpath = self._out_path / f"{self._fname}"
# stats.to_feather(path=fullpath, )