Zonal statistics of rasters

Zonal statistics of rasters#

import geopandas as gpd
import pandas as pd
import rioxarray as rxr
import xarray as xr

from gdptools import ZonalGen
from gdptools.data.user_data import UserTiffData
# Be sure to use rioxarray to read tiff
rds_slope = rxr.open_rasterio("../../../tests/data/rasters/slope/slope.tif")
rds_text = rxr.open_rasterio("../../../tests/data/rasters/TEXT_PRMS/TEXT_PRMS.tif")
# Use geopandas to read shape file,
# Best if shape file only has feature id, and geometry if possible.
gdf = gpd.read_file("../../../tests/data/Oahu.shp")
id_feature = "fid"
print(len(gdf.groupby(id_feature)))
# These params are used to fill out the TiffAttributes class
tx_name= 'x'
ty_name = 'y'
band = 'band'
crs = 26904
varname = "slope" # not currently used
categorical = False # is the data categorical or not, if the data are integers it should
                   # probably be categorical.
data = UserTiffData(
    var=varname,
    ds=rds_slope,
    proj_ds=crs,
    x_coord=tx_name,
    y_coord=ty_name,
    band=1,
    bname=band,
    f_feature=gdf,
    id_feature=id_feature,
    proj_feature=crs
)

zonal_gen = ZonalGen(
    user_data=data,
    zonal_engine="serial",
    zonal_writer="csv",
    out_path=".",
    file_prefix="Oahu_slope_stats"
)
stats = zonal_gen.calculate_zonal(categorical=categorical)
print(stats)

# stats = zonal_gen.calculate_zonal(categorical=True)
# print(stats)
304
data prepped for zonal in 0.0087 seconds
converted tiff to points in 1.3728 seconds
     - fixing 0 invalid polygons.
overlaps calculated in 1.4767 seconds
zonal stats calculated in 0.6617 seconds
[]
number of missing values: 0
fill missing values with nearest neighbors in 0.0040 seconds
Total time for serial zonal stats calculation 3.8094 seconds
          count       mean        std  min    25%   50%   75%   max     sum
fid                                                                        
1104.0  20431.0   1.872155   3.757167  0.0   0.00   0.0   1.0  32.0   38250
1105.0   2388.0  26.350921  13.473957  5.0  14.00  24.0  37.0  61.0   62926
1106.0   3192.0  33.687030  14.188686  1.0  22.00  34.0  45.0  70.0  107529
1107.0   6600.0  23.704091  15.854788  0.0  10.00  21.0  37.0  66.0  156447
1108.0  11380.0  22.116169  14.062708  0.0  11.00  21.0  33.0  62.0  251682
...         ...        ...        ...  ...    ...   ...   ...   ...     ...
1403.0   2753.0   1.722121   3.183146  0.0   0.00   1.0   1.0  23.0    4741
1404.0     99.0   1.363636   1.082840  0.0   0.00   2.0   2.0   3.0     135
1405.0    174.0  10.505747   7.044653  0.0   5.25   9.0  14.0  32.0    1828
1406.0   3089.0  10.931693  10.120911  0.0   4.00   7.0  15.0  59.0   33768
1407.0    321.0   1.679128   1.702893  0.0   1.00   1.0   2.0   9.0     539

[304 rows x 9 columns]
gdf.sort_values(by=id_feature, inplace=True)
stats.sort_values(by=id_feature, inplace=True)
gdf["slope_mean"] = stats["mean"].values
gdf
fid rpu cat_id hru_segmen geometry slope_mean
0 1104.0 f 1104 995.0 POLYGON ((601200.000 2359860.000, 601180.267 2... 1.872155
1 1105.0 f 1105 996.0 POLYGON ((593760.000 2369060.000, 593675.200 2... 26.350921
2 1106.0 f 1106 997.0 POLYGON ((615055.367 2375324.633, 615130.633 2... 33.687030
3 1107.0 f 1107 998.0 POLYGON ((619150.000 2380290.000, 619096.633 2... 23.704091
4 1108.0 f 1108 999.0 POLYGON ((614640.000 2382950.000, 614628.833 2... 22.116169
... ... ... ... ... ... ...
299 1403.0 f 1403 1070.0 POLYGON ((620198.262 2355012.754, 620153.300 2... 1.722121
300 1404.0 f 1404 1029.0 POLYGON ((579986.031 2380993.933, 580280.000 2... 1.363636
301 1405.0 f 1405 1146.0 POLYGON ((625200.000 2365570.000, 625207.872 2... 10.505747
302 1406.0 f 1406 1146.0 POLYGON ((625144.965 2365356.034, 625183.900 2... 10.931693
303 1407.0 f 1407 1233.0 POLYGON ((618131.067 2357417.700, 618105.367 2... 1.679128

304 rows × 6 columns

gdf.plot(column="slope_mean")
<Axes: >
../../_images/fd496b1c570d6f18f023c539844c8ecf11937c2f54ea7fea1e1130cea19c042c.png

Parallel generator#

In this case the generator is slower because the domain is small and the overhead of generating the parallel processes takes more time than the actual calculation. However with a large domain the “parallel” zonal_engine is faster and has a smaller memory footprint.

zonal_gen2 = ZonalGen(
    user_data=data,
    zonal_engine="parallel",
    zonal_writer="csv",
    out_path=".",
    file_prefix="Oahu_slope_stats_p"
)
stats = zonal_gen2.calculate_zonal(categorical=categorical)
print(stats)
data prepped for zonal in 0.0098 seconds
     - fixing 0 invalid polygons.
<class 'pandas.core.frame.DataFrame'>
Parallel calculation of zonal stats in 9.906 seconds
Total time for serial zonal stats calculation 9.9332 seconds
          count       mean        std  min    25%   50%   75%   max     sum
fid                                                                        
1104.0  20431.0   1.872155   3.757167  0.0   0.00   0.0   1.0  32.0   38250
1105.0   2388.0  26.350921  13.473957  5.0  14.00  24.0  37.0  61.0   62926
1106.0   3192.0  33.687030  14.188686  1.0  22.00  34.0  45.0  70.0  107529
1107.0   6600.0  23.704091  15.854788  0.0  10.00  21.0  37.0  66.0  156447
1108.0  11380.0  22.116169  14.062708  0.0  11.00  21.0  33.0  62.0  251682
...         ...        ...        ...  ...    ...   ...   ...   ...     ...
1403.0   2753.0   1.722121   3.183146  0.0   0.00   1.0   1.0  23.0    4741
1404.0     99.0   1.363636   1.082840  0.0   0.00   2.0   2.0   3.0     135
1405.0    174.0  10.505747   7.044653  0.0   5.25   9.0  14.0  32.0    1828
1406.0   3089.0  10.931693  10.120911  0.0   4.00   7.0  15.0  59.0   33768
1407.0    321.0   1.679128   1.702893  0.0   1.00   1.0   2.0   9.0     539

[304 rows x 9 columns]
gdf.sort_values(by=id_feature, inplace=True)
stats.sort_values(by=id_feature, inplace=True)
gdf["slope_mean"] = stats["mean"].values
gdf
fid rpu cat_id hru_segmen geometry slope_mean
0 1104.0 f 1104 995.0 POLYGON ((601200.000 2359860.000, 601180.267 2... 1.872155
1 1105.0 f 1105 996.0 POLYGON ((593760.000 2369060.000, 593675.200 2... 26.350921
2 1106.0 f 1106 997.0 POLYGON ((615055.367 2375324.633, 615130.633 2... 33.687030
3 1107.0 f 1107 998.0 POLYGON ((619150.000 2380290.000, 619096.633 2... 23.704091
4 1108.0 f 1108 999.0 POLYGON ((614640.000 2382950.000, 614628.833 2... 22.116169
... ... ... ... ... ... ...
299 1403.0 f 1403 1070.0 POLYGON ((620198.262 2355012.754, 620153.300 2... 1.722121
300 1404.0 f 1404 1029.0 POLYGON ((579986.031 2380993.933, 580280.000 2... 1.363636
301 1405.0 f 1405 1146.0 POLYGON ((625200.000 2365570.000, 625207.872 2... 10.505747
302 1406.0 f 1406 1146.0 POLYGON ((625144.965 2365356.034, 625183.900 2... 10.931693
303 1407.0 f 1407 1233.0 POLYGON ((618131.067 2357417.700, 618105.367 2... 1.679128

304 rows × 6 columns

gdf.plot(column="slope_mean")
<Axes: >
../../_images/fd496b1c570d6f18f023c539844c8ecf11937c2f54ea7fea1e1130cea19c042c.png
varname = "TEXT" # not currently used
categorical = True # is the data categorical or not, if the data are integers it should
                   # probably be categorical.
data = UserTiffData(
    var=varname,
    ds=rds_text,
    proj_ds=crs,
    x_coord=tx_name,
    y_coord=ty_name,
    band=1,
    bname=band,
    f_feature=gdf,
    id_feature=id_feature,
    proj_feature=crs
)

zonal_gen = ZonalGen(
    user_data=data,
    zonal_engine="serial",
    zonal_writer="csv",
    out_path=".",
    file_prefix="Oahu_TEXT_stats"
)
stats = zonal_gen.calculate_zonal(categorical=categorical)
print(stats)
data prepped for zonal in 0.0052 seconds
converted tiff to points in 0.0084 seconds
     - fixing 0 invalid polygons.
overlaps calculated in 0.0204 seconds
categorical zonal stats calculated in 0.2473 seconds
[[  0   1   2   3   4   5   6]
 [ 24  88 191 108 122 281 290]]
number of missing values: 7
fill missing values with nearest neighbors in 0.0088 seconds
Total time for serial zonal stats calculation 0.3143 seconds
        count  unique  top  freq
fid                             
1104.0    295       2    2   193
1105.0     34       2    2    30
1106.0     48       2    2    47
1107.0     94       2    3    76
1108.0    167       2    3   134
...       ...     ...  ...   ...
1298.0     43       2    3    35
1307.0    550       2    2   463
1377.0    103       2    2    87
1389.0     20       1    2    20
1401.0     41       2    2    25

[304 rows x 4 columns]
stats.sort_values(by=id_feature, inplace=True)
gdf["TEXT"] = stats["top"].values
gdf
fid rpu cat_id hru_segmen geometry slope_mean TEXT
0 1104.0 f 1104 995.0 POLYGON ((601200.000 2359860.000, 601180.267 2... 1.872155 2
1 1105.0 f 1105 996.0 POLYGON ((593760.000 2369060.000, 593675.200 2... 26.350921 2
2 1106.0 f 1106 997.0 POLYGON ((615055.367 2375324.633, 615130.633 2... 33.687030 2
3 1107.0 f 1107 998.0 POLYGON ((619150.000 2380290.000, 619096.633 2... 23.704091 3
4 1108.0 f 1108 999.0 POLYGON ((614640.000 2382950.000, 614628.833 2... 22.116169 3
... ... ... ... ... ... ... ...
299 1403.0 f 1403 1070.0 POLYGON ((620198.262 2355012.754, 620153.300 2... 1.722121 2
300 1404.0 f 1404 1029.0 POLYGON ((579986.031 2380993.933, 580280.000 2... 1.363636 2
301 1405.0 f 1405 1146.0 POLYGON ((625200.000 2365570.000, 625207.872 2... 10.505747 2
302 1406.0 f 1406 1146.0 POLYGON ((625144.965 2365356.034, 625183.900 2... 10.931693 2
303 1407.0 f 1407 1233.0 POLYGON ((618131.067 2357417.700, 618105.367 2... 1.679128 2

304 rows × 7 columns

gdf.plot(column="TEXT")
<Axes: >
../../_images/0a25af4f8d7f5981369f81f8ce1ff0a30fe2e7958f57b756e796a8ec7ef0d9e7.png

Parallel engine#

zonal_gen2 = ZonalGen(
    user_data=data,
    zonal_engine="parallel",
    zonal_writer="csv",
    out_path=".",
    file_prefix="Oahu_TEXT_stats"
)
stats = zonal_gen2.calculate_zonal(categorical=categorical)
print(stats)
data prepped for zonal in 0.0071 seconds
     - fixing 0 invalid polygons.
<class 'pandas.core.frame.DataFrame'>
Parallel calculation of zonal stats in 3.133 seconds
[[  0   1   2   3   4   5   6]
 [ 24  88 191 108 122 281 290]]
number of missing values: 7
fill missing values with nearest neighbors in 0.0085 seconds
Total time for serial zonal stats calculation 3.1641 seconds
        count  unique  top  freq
fid                             
1104.0    295       2    2   193
1105.0     34       2    2    30
1106.0     48       2    2    47
1107.0     94       2    3    76
1108.0    167       2    3   134
...       ...     ...  ...   ...
1298.0     43       2    3    35
1307.0    550       2    2   463
1377.0    103       2    2    87
1389.0     20       1    2    20
1401.0     41       2    2    25

[304 rows x 4 columns]
stats.sort_values(by=id_feature, inplace=True)
gdf["TEXT"] = stats["top"].values
gdf
fid rpu cat_id hru_segmen geometry slope_mean TEXT
0 1104.0 f 1104 995.0 POLYGON ((601200.000 2359860.000, 601180.267 2... 1.872155 2
1 1105.0 f 1105 996.0 POLYGON ((593760.000 2369060.000, 593675.200 2... 26.350921 2
2 1106.0 f 1106 997.0 POLYGON ((615055.367 2375324.633, 615130.633 2... 33.687030 2
3 1107.0 f 1107 998.0 POLYGON ((619150.000 2380290.000, 619096.633 2... 23.704091 3
4 1108.0 f 1108 999.0 POLYGON ((614640.000 2382950.000, 614628.833 2... 22.116169 3
... ... ... ... ... ... ... ...
299 1403.0 f 1403 1070.0 POLYGON ((620198.262 2355012.754, 620153.300 2... 1.722121 2
300 1404.0 f 1404 1029.0 POLYGON ((579986.031 2380993.933, 580280.000 2... 1.363636 2
301 1405.0 f 1405 1146.0 POLYGON ((625200.000 2365570.000, 625207.872 2... 10.505747 2
302 1406.0 f 1406 1146.0 POLYGON ((625144.965 2365356.034, 625183.900 2... 10.931693 2
303 1407.0 f 1407 1233.0 POLYGON ((618131.067 2357417.700, 618105.367 2... 1.679128 2

304 rows × 7 columns

gdf.plot(column="TEXT")
<Axes: >
../../_images/3895d982719e35bcfe1aa3321d350c7f86618aa5cce052f3582ce9ba589c90fb.png