Source code for gdptools.zonal_gen

"""Calculate zonal statistics for raster data.

This module provides classes for computing zonal statistics on raster data,
including both standard and weighted zonal statistics calculations. It supports
various computational engines (serial, parallel, Dask, and exactextract) and
output formats.

Classes:
    ZonalGen: Calculate standard zonal statistics for raster data.
    WeightedZonalGen: Calculate area-weighted zonal statistics for raster data.

Notes:
    This module requires that the user data has been properly preprocessed
    using the appropriate data preparation tools in the gdptools package.

    The "exactextract" engine provides significantly better performance for
    non-weighted zonal statistics by operating directly on raster data.

"""

import logging
import time
import warnings
from datetime import datetime
from pathlib import Path
from typing import Literal

import pandas as pd
from pyproj import CRS

from gdptools.agg.zonal_engines import (
    ZonalEngineDask,
    ZonalEngineExactExtract,
    ZonalEngineParallel,
    ZonalEngineSerial,
)
from gdptools.data.user_data import UserData

logger = logging.getLogger(__name__)

ZONAL_ENGINES = Literal["serial", "parallel", "dask", "exactextract"]
"""Literal type alias for the zonal computation engines.

Options:
    serial: Perform zonal calculations in a serial manner.
        .. deprecated:: Use ``exactextract`` instead.
    parallel: Perform zonal calculations in parallel using multiple processes.
        .. deprecated:: Use ``exactextract`` instead.
    dask: Perform zonal calculations using Dask for distributed computing.
        .. deprecated:: Use ``exactextract`` instead.
    exactextract: High-performance raster-native zonal statistics using exactextract.
        This is the recommended engine. It only supports non-weighted statistics
        currently; weighted support is planned (issue #94).
"""

ZONAL_WRITERS = Literal["csv"]
"""Literal type alias for the zonal statistics output formats.

Options:
    csv: Write zonal statistics to a CSV file.
"""



[docs]
class ZonalGen:
    """Calculate standard zonal statistics for raster data.

    This class provides functionality to compute zonal statistics (mean, sum, count,
    etc.) for raster data within defined zones. It supports multiple computational
    engines for flexibility and performance optimization.

    The class handles the entire workflow from data preparation to result output,
    including options for different computational backends and output formats.

    Examples:
        Recommended usage with exactextract engine:

        >>> from gdptools.zonal_gen import ZonalGen
        >>> from gdptools.data.user_data import UserData
        >>>
        >>> # Create zonal statistics generator
        >>> zonal_gen = ZonalGen(
        ...     user_data=user_data,
        ...     zonal_engine="exactextract",
        ...     zonal_writer="csv",
        ...     out_path="/path/to/output",
        ...     file_prefix="zonal_stats"
        ... )
        >>>
        >>> # Calculate zonal statistics
        >>> stats = zonal_gen.calculate_zonal()

        Using a deprecated engine (serial/parallel/dask emit a FutureWarning):

        >>> zonal_gen = ZonalGen(
        ...     user_data=user_data,
        ...     zonal_engine="serial",  # deprecated, use "exactextract"
        ...     zonal_writer="csv",
        ...     out_path="/path/to/output",
        ...     file_prefix="zonal_stats",
        ...     append_date=True,
        ... )
        >>> stats = zonal_gen.calculate_zonal()

    Attributes:
        agg: The zonal engine instance used for calculations.
        precision: Number of decimal places for output statistics.

    """


[docs]
    def __init__(
        self,
        user_data: UserData,
        zonal_engine: ZONAL_ENGINES,
        zonal_writer: ZONAL_WRITERS,
        out_path: str | None = None,
        file_prefix: str | None = None,
        append_date: bool = False,
        precision: int | None = None,
        jobs: int = 1,
    ) -> None:
        """Initialize ZonalGen for calculating zonal statistics.

        Args:
            user_data: An instance of UserData containing the preprocessed raster
                and vector data for zonal analysis.
            zonal_engine: The computational engine to use for zonal calculations.
                Options: "exactextract" (recommended), "serial", "parallel", or
                "dask". The serial, parallel, and dask engines are deprecated
                and will be removed in a future version.
            zonal_writer: The output format for zonal statistics. Currently
                supports "csv".
            out_path: Directory path where output files will be saved. If None,
                output will not be written to disk.
            file_prefix: Prefix for the output filename. If None, no prefix
                will be used.
            append_date: Whether to append current timestamp to the filename.
                Creates filenames like "2024_01_15_14_30_00_prefix.csv".
            precision: Number of decimal places for output statistics. If None,
                no rounding is applied.
            jobs: Number of parallel jobs for computation. Only used with
                "parallel" or "dask" engines.

        Raises:
            FileNotFoundError: If the specified output path does not exist.
            TypeError: If an invalid zonal engine is specified.

        Notes:
            The user_data should contain properly aligned raster and vector data
            with matching coordinate reference systems.

        """
        self._user_data = user_data
        self._zonal_engine = zonal_engine
        self._zonal_writer = zonal_writer
        self._jobs = jobs
        self._out_path = Path(out_path)
        if not self._out_path.exists():
            raise FileNotFoundError(f"Path: {self._out_path} does not exist")
        self._file_prefix = file_prefix
        self._append_date = append_date
        if self._append_date:
            self._fdate = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
            self._fname = f"{self._fdate}_{self._file_prefix}"
        else:
            self._fname = f"{self._file_prefix}"
        self.precision = precision
        self.agg: ZonalEngineSerial | ZonalEngineParallel | ZonalEngineDask | ZonalEngineExactExtract
        if self._zonal_engine == "serial":
            warnings.warn(
                "The 'serial' zonal engine is deprecated and will be removed in a future version. "
                "Use zonal_engine='exactextract' instead, which provides better performance "
                "and handles large datasets with bounded memory.",
                FutureWarning,
                stacklevel=2,
            )
            self.agg = ZonalEngineSerial()
        elif self._zonal_engine == "parallel":
            warnings.warn(
                "The 'parallel' zonal engine is deprecated and will be removed in a future version. "
                "Use zonal_engine='exactextract' instead, which provides better performance "
                "and handles large datasets with bounded memory.",
                FutureWarning,
                stacklevel=2,
            )
            self.agg = ZonalEngineParallel()
        elif self._zonal_engine == "dask":
            warnings.warn(
                "The 'dask' zonal engine is deprecated and will be removed in a future version. "
                "Use zonal_engine='exactextract' instead, which provides better performance "
                "and handles large datasets with bounded memory.",
                FutureWarning,
                stacklevel=2,
            )
            self.agg = ZonalEngineDask()
        elif self._zonal_engine == "exactextract":
            self.agg = ZonalEngineExactExtract()
        else:
            raise TypeError(f"agg_engine: {self._zonal_engine} not in {ZONAL_ENGINES}")



[docs]
    def calculate_zonal(self, categorical: bool = False) -> pd.DataFrame:
        """Calculate zonal statistics for raster data.

        Computes zonal statistics (mean, sum, count, etc.) for raster data within
        defined zones using the specified computational engine. Results are
        automatically saved to the configured output path if specified.

        Args:
            categorical: Whether to calculate categorical statistics (mode, unique
                counts) or continuous statistics (mean, sum, std, etc.). For
                categorical data like land cover classifications, set to True.

        Returns:
            pandas.DataFrame: Calculated zonal statistics with zones as rows and
            statistics as columns. The exact columns depend on the data type and
            whether `categorical=True`.

        Notes:
            The method automatically rounds results to the specified precision if set
            during initialization. Execution time is printed to stdout for performance
            monitoring.

        Examples:
        Calculate continuous zonal statistics:

            >>> stats = zonal_gen.calculate_zonal(categorical=False)
            >>> print(stats.columns) # doctest: +SKIP
            Index(['zone_id', 'mean', 'sum', 'count', 'std', 'min', 'max'])

            Calculate categorical zonal statistics:

            >>> stats = zonal_gen.calculate_zonal(categorical=True)
            >>> print(stats.columns) # doctest: +SKIP
            Index(['zone_id', 'mode', 'unique_count', 'majority_percent'])

        """
        tstrt = time.perf_counter()
        stats = self.agg.calc_zonal_from_aggdata(user_data=self._user_data, categorical=categorical, jobs=self._jobs)
        if self.precision is not None:
            stats = stats.round(self.precision)
        if self._zonal_writer == "csv":
            fullpath = self._out_path / f"{self._fname}.csv"
            stats.to_csv(fullpath, sep=",")
        tend = time.perf_counter()
        logger.info(f"Total time for zonal stats calculation {tend - tstrt:0.4f} seconds")
        return stats


        # elif self._zonal_writer == "feather":
        #     fullpath = self._out_path / f"{self._fname}"
        #     stats.to_feather(path=fullpath, )



[docs]
class WeightedZonalGen:
    """Calculate area-weighted zonal statistics for raster data.

    This class extends standard zonal statistics by incorporating area weights
    based on the intersection of raster cells with zone boundaries. This is
    particularly important when working with data in geographic coordinate
    systems or when raster cells are partially covered by zones.

    The weighting is based on the specified CRS and accounts for the actual
    area of intersection between raster cells and zone polygons, providing
    more accurate statistics for geographic analyses.

    Note:
        The serial, parallel, and dask engines are deprecated and will be removed
        in a future version. Weighted zonal statistics support via the exactextract
        engine is planned (issue #94).

    Examples:
        Basic weighted zonal statistics (deprecated engines):

        >>> from gdptools.zonal_gen import WeightedZonalGen
        >>> from pyproj import CRS
        >>>
        >>> # Create weighted zonal statistics generator
        >>> # Note: serial/parallel/dask engines are deprecated; exactextract
        >>> # weighted support is planned (issue #94).
        >>> weighted_gen = WeightedZonalGen(
        ...     user_data=user_data,
        ...     weight_gen_crs=CRS.from_epsg(4326),
        ...     zonal_engine="parallel",  # deprecated
        ...     zonal_writer="csv",
        ...     out_path="/path/to/output",
        ...     file_prefix="weighted_stats",
        ...     jobs=4
        ... )
        >>>
        >>> # Calculate weighted zonal statistics
        >>> stats = weighted_gen.calculate_weighted_zonal()

        Using with equal-area projection for accurate area calculations:

        >>> # Use an equal-area projection for more accurate area weighting
        >>> albers_crs = CRS.from_proj4(
        ...     "+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 +lon_0=-96"
        ... )
        >>> weighted_gen = WeightedZonalGen(
        ...     user_data=user_data,
        ...     weight_gen_crs=albers_crs,
        ...     zonal_engine="dask",  # deprecated
        ...     zonal_writer="csv",
        ...     out_path="/path/to/output",
        ...     precision=3
        ... )
        >>> stats = weighted_gen.calculate_weighted_zonal()

    Attributes:
        agg: The zonal engine instance used for calculations.
        precision: Number of decimal places for output statistics.

    """


[docs]
    def __init__(
        self,
        user_data: UserData,
        weight_gen_crs: str | int | CRS,
        zonal_engine: ZONAL_ENGINES,
        zonal_writer: ZONAL_WRITERS,
        out_path: str | None = None,
        file_prefix: str | None = None,
        append_date: bool = False,
        precision: int | None = None,
        jobs: int = 1,
    ) -> None:
        """Initialize WeightedZonalGen for calculating area-weighted zonal statistics.

        Args:
            user_data: An instance of UserData containing the preprocessed raster
                and vector data for zonal analysis.
            weight_gen_crs: The coordinate reference system for area weight
                calculations. Can be an EPSG code (int), proj4 string (str),
                or pyproj CRS object. Should be an appropriate equal-area
                projection for accurate area calculations.
            zonal_engine: The computational engine to use for zonal calculations.
                Options: "serial", "parallel", or "dask". All three are
                deprecated and will be removed in a future version.
                Weighted zonal statistics support via exactextract is planned
                (issue #94).
            zonal_writer: The output format for zonal statistics. Currently
                supports "csv".
            out_path: Directory path where output files will be saved. If None,
                output will not be written to disk.
            file_prefix: Prefix for the output filename. If None, no prefix
                will be used.
            append_date: Whether to append current timestamp to the filename.
                Creates filenames like "2024_01_15_14_30_00_prefix.csv".
            precision: Number of decimal places for output statistics. If None,
                no rounding is applied.
            jobs: Number of parallel jobs for computation. Only used with
                "parallel" or "dask" engines.

        Raises:
            FileNotFoundError: If the specified output path does not exist.
            TypeError: If an invalid zonal engine is specified.

        Notes:
            For most accurate area-weighted calculations, use an equal-area
            projection appropriate for your study region. Geographic coordinate
            systems (like EPSG:4326) can introduce area distortions.

        """
        self._user_data = user_data
        self._weight_gen_crs = weight_gen_crs
        self._zonal_engine = zonal_engine
        self._zonal_writer = zonal_writer
        self._jobs = jobs
        self._out_path = Path(out_path)
        if not self._out_path.exists():
            raise FileNotFoundError(f"Path: {self._out_path} does not exist")
        self._file_prefix = file_prefix
        self._append_date = append_date
        if self._append_date:
            self._fdate = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
            self._fname = f"{self._fdate}_{self._file_prefix}"
        else:
            self._fname = f"{self._file_prefix}"
        self.precision = precision
        self.agg: ZonalEngineSerial | ZonalEngineParallel | ZonalEngineDask
        if self._zonal_engine == "exactextract":
            raise TypeError(
                "exactextract engine does not yet support weighted zonal statistics. "
                "Use 'serial', 'parallel', or 'dask' instead (deprecated). "
                "Weighted support via exactextract is planned (issue #94)."
            )
        elif self._zonal_engine == "serial":
            warnings.warn(
                "The 'serial' zonal engine is deprecated and will be removed in a future version. "
                "Weighted zonal stats support via exactextract is planned (issue #94).",
                FutureWarning,
                stacklevel=2,
            )
            self.agg = ZonalEngineSerial()
        elif self._zonal_engine == "parallel":
            warnings.warn(
                "The 'parallel' zonal engine is deprecated and will be removed in a future version. "
                "Weighted zonal stats support via exactextract is planned (issue #94).",
                FutureWarning,
                stacklevel=2,
            )
            self.agg = ZonalEngineParallel()
        elif self._zonal_engine == "dask":
            warnings.warn(
                "The 'dask' zonal engine is deprecated and will be removed in a future version. "
                "Weighted zonal stats support via exactextract is planned (issue #94).",
                FutureWarning,
                stacklevel=2,
            )
            self.agg = ZonalEngineDask()
        else:
            raise TypeError(f"agg_engine: {self._zonal_engine} not in {ZONAL_ENGINES}")



[docs]
    def calculate_weighted_zonal(self, categorical: bool = False) -> pd.DataFrame:
        """Calculate area-weighted zonal statistics for raster data.

        Computes zonal statistics weighted by the actual area of intersection
        between raster cells and zone boundaries. This provides more accurate
        statistics when raster cells are partially covered by zones or when
        working with geographic coordinate systems.

        Args:
            categorical: Whether to calculate categorical statistics (mode, unique
                counts) or continuous statistics (mean, sum, std, etc.). For
                categorical data like land cover classifications, set to True.

        Returns:
            A DataFrame containing the calculated area-weighted zonal statistics
            with zones as rows and statistics as columns. The exact columns
            depend on the data type and whether categorical=True.

        Notes:
            The method automatically rounds results to the specified precision
            if set during initialization. Execution time is printed to stdout
            for performance monitoring.

        Examples:
            Calculate weighted continuous statistics:

            >>> stats = weighted_gen.calculate_weighted_zonal(categorical=False)
            >>> print(stats.columns) # doctest: +SKIP
            Index(['zone_id', 'weighted_mean', 'weighted_sum', 'total_area'])

            Calculate weighted categorical statistics:

            >>> stats = weighted_gen.calculate_weighted_zonal(categorical=True)
            >>> print(stats.columns) # doctest: +SKIP
            Index(['zone_id', 'weighted_mode', 'area_weighted_majority'])

        """
        tstrt = time.perf_counter()
        stats = self.agg.calc_weights_zonal_from_aggdata(
            user_data=self._user_data, crs=self._weight_gen_crs, categorical=categorical, jobs=self._jobs
        )
        if self.precision is not None:
            stats = stats.round(self.precision)
        if self._zonal_writer == "csv":
            fullpath = self._out_path / f"{self._fname}.csv"
            stats.to_csv(fullpath, sep=",")
        tend = time.perf_counter()
        logger.info(f"Total time for weighted zonal stats calculation {tend - tstrt:0.4f} seconds")
        return stats


        # elif self._zonal_writer == "feather":
        #     fullpath = self._out_path / f"{self._fname}"
        #     stats.to_feather(path=fullpath, )