Dataset Class #

Detailed class diagram for the Dataset class and related components:

Hold "Ctrl" to enable pan & zoom

classDiagram
    %% configuration class
    class Config {
    }

    %% abstract base class for rasters
    class AbstractDataset {
        +__init__(src, access)
        +__str__()
        +__repr__()
        +access()
        +raster()
        +raster(value)
        +values()
        +rows()
        +columns()
        +shape()
        +geotransform()
        +top_left_corner()
        +epsg()
        +epsg(value)
        +crs()
        +crs(value)
        +cell_size()
        +no_data_value()
        +no_data_value(value)
        +meta_data()
        +meta_data(value)
        +block_size()
        +block_size(value)
        +file_name()
        +driver_type()
        +read_file(path, read_only)
        +read_array(band, window)
        +_read_block(band, window)
        +plot(band, exclude_value, rgb, surface_reflectance, cutoff, overview, overview_index, **kwargs)
    }

    %% concrete raster class
    class Dataset {
        +__init__(src, access)
        +__str__()
        +__repr__()
        +access()
        +raster()
        +raster(value)
        +values()
        +rows()
        +columns()
        +shape()
        +geotransform()
        +epsg()
        +epsg(value)
        +crs()
        +crs(value)
        +cell_size()
        +band_count()
        +band_names()
        +band_names(name_list)
        +band_units()
        +band_units(value)
        +no_data_value()
        +no_data_value(value)
        +meta_data()
        +meta_data(value)
        +block_size()
        +block_size(value)
        +file_name()
        +driver_type()
        +scale()
        +scale(value)
        +offset()
        +offset(value)
        +read_file(path, read_only)
        +create_from_array(arr, top_left_corner, cell_size, epsg)
        +read_array(band, window)
        +_read_block(band, window)
        +plot(band, exclude_value, rgb, surface_reflectance, cutoff, overview, overview_index, **kwargs)
        +to_file(path, driver, band)
        +to_crs(to_epsg, method, maintain_alignment)
        +resample(cell_size, method)
        +align(alignment_src)
        +crop(mask, touch)
        +merge(src, dst, no_data_value, init, n)
        +apply(ufunc)
        +overlay(classes_map, exclude_value)
    }



    %% Driver catalog
    class _utils_Catalog {
    }

    %% NetCDF
    class NetCDF {
    }

    %% error classes
    class _errors_ReadOnlyError
    class _errors_DatasetNoFoundError
    class _errors_NoDataValueError
    class _errors_AlignmentError
    class _errors_DriverNotExistError
    class _errors_FileFormatNotSupported
    class _errors_OptionalPackageDoesNotExist
    class _errors_FailedToSaveError
    class _errors_OutOfBoundsError

    %% inheritance relations
    AbstractDataset <|-- Dataset
    Dataset <|-- NetCDF

    %% composition/usage relations
    AbstractDataset ..> _utils_Catalog : "uses Catalog constant"
    AbstractDataset ..> featurecollection_FeatureCollection : "vector ops"
    Dataset ..> featurecollection_FeatureCollection : "vector ops"
    Dataset ..> _errors_ReadOnlyError : "raises"
    Dataset ..> _errors_AlignmentError : "raises"
    Dataset ..> _errors_NoDataValueError : "raises"
    Dataset ..> _errors_FailedToSaveError : "raises"
    Dataset ..> _errors_OutOfBoundsError : "raises"
    NetCDF ..> _errors_OptionalPackageDoesNotExist : "raises"
    Config ..> Dataset : "initialises raster settings"

Hold "Ctrl" to enable pan & zoom

classDiagram

    %% Central dataset class with its main attributes
    class Dataset {
        +raster
        +cell_size
        +values
        +shape
        +rows
        +columns
        +pivot_point
        +geotransform
        +bounds
        +bbox
        +epsg
        +crs
        +lon
        +lat
        +x
        +y
        +band_count
        +band_names
        +variables
        +no_data_value
        +meta_data
        +dtype
        +gdal_dtype
        +numpy_dtype
        +file_name
        +time_stamp
        +driver_type
    }

    %% Group: visualisation functionality
    class Visualization {
        +plot()
        +overview_count()
        +read_overview_array()
        +create_overviews()
        +recreate_overviews()
        +get_overview()
    }
    Dataset --> Visualization : «visualisation»

    %% Group: data access methods
    class AccessData {
        +read_array()
        +get_variables()
        +count_domain_cells()
        +get_band_names()
        +extract()
        +stats()
    }
    Dataset --> AccessData : «data access»

    %% Group: mathematical operations on raster values
    class MathOperations {
        +apply()
        +fill()
        +normalize()
        +cluster()
        +cluster2()
        +get_tile()
        +groupNeighbours()
    }
    Dataset --> MathOperations : «math ops»

    %% Group: spatial operations and reprojection
    class SpatialOperations {
        +to_crs()
        +resample()
        +align()
        +crop()
        +locate_points()
        +overlay()
        +extract()
        +footprint()
    }
    Dataset --> SpatialOperations : «spatial ops»

    %% Group: conversion to other data types
    class Conversion {
        +to_feature_collection()
    }
    Dataset --> Conversion : «conversion»

    %% Group: coordinate system handling
    class OSR {
        +create_sr_from_epsg()
    }
    Dataset --> OSR : «osr»

    %% Group: bounding‐box and bounds calculations
    class BBoxBounds {
        +calculate_bbox()
        +calculate_bounds()
    }
    Dataset --> BBoxBounds : «bbox/bounds»

    %% Group: CRS/EPSG getters
    class CrsEpsg {
        +get_crs()
        +get_epsg()
    }
    Dataset --> CrsEpsg : «crs/epsg»

    %% Group: latitude/longitude getters
    class LatLon {
        +get_lat_lon()
    }
    Dataset --> LatLon : «lat/lon»

    %% Group: band names management
    class BandNames {
        +get_band_names_internal()
        +set_band_names()
    }
    Dataset --> BandNames : «band names»

    %% Group: timestamp handling
    class TimeStamp {
        +get_time_variable()
        +read_variable()
    }
    Dataset --> TimeStamp : «time»

    %% Group: handling of no‐data values
    class NoDataValue {
        +set_no_data_value()
        +set_no_data_value_backend()
        +change_no_data_value_attr()
    }
    Dataset --> NoDataValue : «no data value»

    %% Group: helpers for creating GDAL datasets
    class GdalDataset {
        +create_empty_driver()
        +create_driver_from_scratch()
        +create_mem_gtiff_dataset()
    }
    Dataset --> GdalDataset : «gdal creation»

    %% Group: factory methods for creating Dataset objects
    class CreateObject {
        +from_gdal_dataset()
        +read_file()
        +create_from_array()
        +dataset_like()
    }
    Dataset --> CreateObject : «object factory»

Zip files: - Internal Zip file path (one/multiple files inside the compressed file): if the path contains a zip archive but does not end with `.zip` (e.g. `compressed-file-name.zip/1.asc`), the part after the archive name is the internal path of the file inside the zip, and that file is opened directly.

```python
>>> rdir = "tests/data/virtual-file-system"
>>> dataset = Dataset.read_file(f"{rdir}/multiple_compressed_files.zip/1.asc")
>>> print(dataset)
<BLANKLINE>
            Cell size: 4000.0
            Dimension: 13 * 14
            EPSG: 4326
            Number of Bands: 1
            Band names: ['Band_1']
            Mask: -3.4028230607370965e+38
            Data type: float32
            File: /vsizip/tests/data/virtual-file-system/multiple_compressed_files.zip/1.asc
<BLANKLINE>

```

Only the Zip file path (one/multiple files inside the compressed file): If you provide only the path of a zip file that has multiple files inside it, the first file in the archive is read.

>>> dataset = Dataset.read_file(f"{rdir}/multiple_compressed_files.zip")
>>> print(dataset)
<BLANKLINE>
            Cell size: 4000.0
            Dimension: 13 * 14
            EPSG: 4326
            Number of Bands: 1
            Band names: ['Band_1']
            Mask: -3.4028230607370965e+38
            Data type: float32
            File: /vsizip/tests/data/virtual-file-system/multiple_compressed_files.zip/1.asc
<BLANKLINE>

Zip file path and an index (one/multiple files inside the compressed file): If you provide the path to the zip file together with the index (`file_i`) of the file inside the compressed file, the file at that index is read.

>>> dataset = Dataset.read_file(f"{rdir}/multiple_compressed_files.zip", file_i=1)
>>> print(dataset)
<BLANKLINE>
            Cell size: 4000.0
            Dimension: 13 * 14
            EPSG: 4326
            Number of Bands: 1
            Band names: ['Band_1']
            Mask: -3.4028230607370965e+38
            Data type: float32
            File: /vsizip/tests/data/virtual-file-system/multiple_compressed_files.zip/2.asc
<BLANKLINE>

Virtual files: - You can open files stored online simply by using the full url to the file with the read_file method.

>>> url = "https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/31/U/FU/2020/3/S2A_31UFU_20200328_0_L2A/B01.tif"
>>> dataset = Dataset.read_file(url)
>>> print(dataset)
<BLANKLINE>
            Top Left Corner: (600000.0, 5900040.0)
            Cell size: 60.0
            Dimension: 1830 * 1830
            EPSG: 32631
            Number of Bands: 1
            Band names: ['Band_1']
            Band colors: {0: 'gray_index'}
            Band units: ['']
            Scale: [1.0]
            Offset: [0]
            Mask: 0.0
            Data type: uint16
            File: https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/31/U/FU/2020/3/S2A_31UFU_20200328_0_L2A/B01.tif
<BLANKLINE>

See Also: - Dataset.read_array: Read the values stored in a dataset band.

Source code in pyramids/dataset.py

@classmethod
def read_file(
    cls,
    path: str,
    read_only=True,
    file_i: int = 0,
) -> "Dataset":
    """Open a raster file and wrap it in a dataset object.

    Args:
        path (str):
            Path of the file to open. It can be a local file, a zip archive (optionally followed by the internal
            path of a file inside the archive), or the full url of a file stored online.
        read_only (bool):
            File mode. Set to False to open the file in "update" mode. Default is True.
        file_i (int):
            Index of the file to read inside a compressed file that holds more than one file. Default is 0
            (the first file).

    Returns:
        Dataset:
            Opened dataset instance. Its access is "read_only" when `read_only` is True, otherwise "write".

    Examples:
        Zip files:
        - Internal Zip file path (one/multiple files inside the compressed file):
            If the path contains a zip archive but does not end with `.zip` (e.g. `compressed-file-name.zip/1.asc`),
            the part after the archive name is the internal path of the file inside the zip, and that file is
            opened directly.


            ```python
            >>> rdir = "tests/data/virtual-file-system"
            >>> dataset = Dataset.read_file(f"{rdir}/multiple_compressed_files.zip/1.asc")
            >>> print(dataset)
            <BLANKLINE>
                        Cell size: 4000.0
                        Dimension: 13 * 14
                        EPSG: 4326
                        Number of Bands: 1
                        Band names: ['Band_1']
                        Mask: -3.4028230607370965e+38
                        Data type: float32
                        File: /vsizip/tests/data/virtual-file-system/multiple_compressed_files.zip/1.asc
            <BLANKLINE>

            ```

        - Only the Zip file path (one/multiple files inside the compressed file):
            If you provide only the path of a zip file that has multiple files inside it, the first file in the
            archive is read.


            ```python
            >>> dataset = Dataset.read_file(f"{rdir}/multiple_compressed_files.zip")
            >>> print(dataset)
            <BLANKLINE>
                        Cell size: 4000.0
                        Dimension: 13 * 14
                        EPSG: 4326
                        Number of Bands: 1
                        Band names: ['Band_1']
                        Mask: -3.4028230607370965e+38
                        Data type: float32
                        File: /vsizip/tests/data/virtual-file-system/multiple_compressed_files.zip/1.asc
            <BLANKLINE>

            ```

        - Zip file path and an index (one/multiple files inside the compressed file):
            If you provide the path to the zip file together with the index (`file_i`) of the file inside the
            compressed file, the file at that index is read.


            ```python
            >>> dataset = Dataset.read_file(f"{rdir}/multiple_compressed_files.zip", file_i=1)
            >>> print(dataset)
            <BLANKLINE>
                        Cell size: 4000.0
                        Dimension: 13 * 14
                        EPSG: 4326
                        Number of Bands: 1
                        Band names: ['Band_1']
                        Mask: -3.4028230607370965e+38
                        Data type: float32
                        File: /vsizip/tests/data/virtual-file-system/multiple_compressed_files.zip/2.asc
            <BLANKLINE>

            ```

        Virtual files:
        - You can open files stored online simply by using the full url to the file with the `read_file` method.

            ```python
            >>> url = "https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/31/U/FU/2020/3/S2A_31UFU_20200328_0_L2A/B01.tif"
            >>> dataset = Dataset.read_file(url)
            >>> print(dataset)
            <BLANKLINE>
                        Top Left Corner: (600000.0, 5900040.0)
                        Cell size: 60.0
                        Dimension: 1830 * 1830
                        EPSG: 32631
                        Number of Bands: 1
                        Band names: ['Band_1']
                        Band colors: {0: 'gray_index'}
                        Band units: ['']
                        Scale: [1.0]
                        Offset: [0]
                        Mask: 0.0
                        Data type: uint16
                        File: https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/31/U/FU/2020/3/S2A_31UFU_20200328_0_L2A/B01.tif
            <BLANKLINE>

            ```

    See Also:
        - Dataset.read_array: Read the values stored in a dataset band.
    """
    # Opening is delegated to the I/O helper module; this method only wraps the returned handle.
    handle = _io.read_file(path, read_only=read_only, file_i=file_i)
    if read_only:
        access_mode = "read_only"
    else:
        access_mode = "write"
    return cls(handle, access=access_mode)

`read_array(band=None, window=None)` #

Read the values stored in a given band.

Data Chunks/blocks: When a raster dataset is stored on disk, it might not be stored as one continuous chunk of data. Instead, it can be divided into smaller rectangular blocks or tiles. These blocks can be individually accessed, which is particularly useful for large datasets:

    - Efficiency: Reading or writing small blocks requires less memory than dealing with the entire dataset
          at once. This is especially beneficial when only a small portion of the data needs to be processed.
    - Performance: For certain file formats and operations, working with optimal block sizes can significantly
          improve performance. For example, if the block size matches the reading or processing window,
          Pyramids can minimize disk access and data transfer.

Parameters:

Name	Type	Description	Default
`band`	`int`	The band you want to get its data. If None, data of all bands will be read. Default is None.	`None`
`window`	`List[int] \| GeoDataFrame`	Specify a block of data to read from the dataset. The window can be specified in two ways: List: Window specified as a list of 4 integers [offset_x, offset_y, window_columns, window_rows]. offset_x/column index: x offset of the block. offset_y/row index: y offset of the block. window_columns: number of columns in the block. window_rows: number of rows in the block. GeoDataFrame: GeoDataFrame with a geometry column filled with polygon geometries; the function will get the total_bounds of the GeoDataFrame and use it as a window to read the raster.	`None`

Returns:

Type	Description
`ndarray`	np.ndarray: array with all the values in the raster.

Examples:

Create Dataset consisting of 4 bands, 5 rows, and 5 columns at the point lon/lat (0, 0):

>>> import numpy as np
>>> arr = np.random.rand(4, 5, 5)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

Read all the values stored in a given band:

>>> arr = dataset.read_array(band=0) # doctest: +SKIP
array([[0.50482225, 0.45678043, 0.53294294, 0.28862223, 0.66753579],
       [0.38471912, 0.14617829, 0.05045189, 0.00761358, 0.25501918],
       [0.32689036, 0.37358843, 0.32233918, 0.75450564, 0.45197608],
       [0.22944676, 0.2780928 , 0.71605189, 0.71859309, 0.61896933],
       [0.47740168, 0.76490779, 0.07679277, 0.16142599, 0.73630836]])

Read a 2x2 block from the first band. The block starts at the 2nd column (index 1) and 2nd row (index 1) (the first index is the column index):

>>> arr = dataset.read_array(band=0, window=[1, 1, 2, 2])
>>> print(arr) # doctest: +SKIP
array([[0.14617829, 0.05045189],
       [0.37358843, 0.32233918]])

If you check the values of the 2x2 block, you will find them the same as the values in the entire array of band 0, starting at the 2nd row and 2nd column.
Read a block using a GeoDataFrame polygon that covers the same area as the window above:

>>> import geopandas as gpd
>>> from shapely.geometry import Polygon
>>> poly = gpd.GeoDataFrame(geometry=[Polygon([(0.1, -0.1), (0.1, -0.2), (0.2, -0.2), (0.2, -0.1)])], crs=4326)
>>> arr = dataset.read_array(band=0, window=poly)
>>> print(arr) # doctest: +SKIP
array([[0.14617829, 0.05045189],
       [0.37358843, 0.32233918]])

See Also

Dataset.get_tile: Read the dataset in chunks.
Dataset.get_block_arrangement: Get block arrangement to read the dataset in chunks.

Source code in pyramids/dataset.py

def read_array(
    self, band: int = None, window: Union[GeoDataFrame, List[int]] = None
) -> np.ndarray:
    """Read the values stored in a given band.

    Data Chunks/blocks
        When a raster dataset is stored on disk, it might not be stored as one continuous chunk of data. Instead,
        it can be divided into smaller rectangular blocks or tiles. These blocks can be individually accessed,
        which is particularly useful for large datasets:

            - Efficiency: Reading or writing small blocks requires less memory than dealing with the entire dataset
                  at once. This is especially beneficial when only a small portion of the data needs to be processed.
            - Performance: For certain file formats and operations, working with optimal block sizes can significantly
                  improve performance. For example, if the block size matches the reading or processing window,
                  Pyramids can minimize disk access and data transfer.

    Args:
        band (int, optional):
            The band you want to get its data. If None, data of all bands will be read. Default is None.
        window (List[int] | GeoDataFrame, optional):
            Specify a block of data to read from the dataset. The window can be specified in two ways:

            - List:
                Window specified as a list of 4 integers [offset_x, offset_y, window_columns, window_rows].

                - offset_x/column index: x offset of the block.
                - offset_y/row index: y offset of the block.
                - window_columns: number of columns in the block.
                - window_rows: number of rows in the block.

            - GeoDataFrame:
                GeoDataFrame with a geometry column filled with polygon geometries; the function will get the
                total_bounds of the GeoDataFrame and use it as a window to read the raster.

    Returns:
        np.ndarray:
            array with all the values in the raster. When all bands of a multi-band raster are read, the array
            has the shape (bands, rows, columns); otherwise it is the 2D array of the single band that was read.

    Raises:
        ValueError:
            If `band` is greater than the index of the last band.

    Examples:
        - Create `Dataset` consisting of 4 bands, 5 rows, and 5 columns at the point lon/lat (0, 0):

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(4, 5, 5)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

          ```

        - Read all the values stored in a given band:

          ```python
          >>> arr = dataset.read_array(band=0) # doctest: +SKIP
          array([[0.50482225, 0.45678043, 0.53294294, 0.28862223, 0.66753579],
                 [0.38471912, 0.14617829, 0.05045189, 0.00761358, 0.25501918],
                 [0.32689036, 0.37358843, 0.32233918, 0.75450564, 0.45197608],
                 [0.22944676, 0.2780928 , 0.71605189, 0.71859309, 0.61896933],
                 [0.47740168, 0.76490779, 0.07679277, 0.16142599, 0.73630836]])

          ```

        - Read a 2x2 block from the first band. The block starts at the 2nd column (index 1) and 2nd row (index 1)
            (the first index is the column index):

          ```python
          >>> arr = dataset.read_array(band=0, window=[1, 1, 2, 2])
          >>> print(arr) # doctest: +SKIP
          array([[0.14617829, 0.05045189],
                 [0.37358843, 0.32233918]])

          ```

        - If you check the values of the 2x2 block, you will find them the same as the values in the entire array
            of band 0, starting at the 2nd row and 2nd column.

        - Read a block using a GeoDataFrame polygon that covers the same area as the window above:

          ```python
          >>> import geopandas as gpd
          >>> from shapely.geometry import Polygon
          >>> poly = gpd.GeoDataFrame(geometry=[Polygon([(0.1, -0.1), (0.1, -0.2), (0.2, -0.2), (0.2, -0.1)])], crs=4326)
          >>> arr = dataset.read_array(band=0, window=poly)
          >>> print(arr) # doctest: +SKIP
          array([[0.14617829, 0.05045189],
                 [0.37358843, 0.32233918]])

          ```

    See Also:
        - Dataset.get_tile: Read the dataset in chunks.
        - Dataset.get_block_arrangement: Get block arrangement to read the dataset in chunks.
    """
    if band is None and self.band_count > 1:
        # All bands of a multi-band raster are stacked into one (bands, rows, columns) array.
        if window is None:
            arr = np.empty(
                (self.band_count, self.rows, self.columns),
                dtype=self.numpy_dtype[0],
            )
            for i in range(self.band_count):
                # this line could be replaced with the following line
                # arr[i, :, :] = self._iloc(i).ReadAsArray()
                arr[i, :, :] = self._raster.GetRasterBand(i + 1).ReadAsArray()
        else:
            # Take the output size from the first block instead of indexing `window` directly:
            # `window[3]`/`window[2]` is only valid for the list form and fails for a GeoDataFrame window.
            first_block = self._read_block(0, window)
            arr = np.empty(
                (self.band_count, *first_block.shape),
                dtype=self.numpy_dtype[0],
            )
            arr[0, :, :] = first_block
            for i in range(1, self.band_count):
                arr[i, :, :] = self._read_block(i, window)
    else:
        # A specific band was requested, or the raster has only one band.
        if band is None:
            band = 0
        elif band > self.band_count - 1:
            raise ValueError(
                f"band index should be between 0 and {self.band_count - 1}"
            )
        if window is None:
            arr = self._iloc(band).ReadAsArray()
        else:
            arr = self._read_block(band, window)

    return arr

`get_x_lon_dimension_array(pivot_x, cell_size, columns)` `staticmethod` #

Get X/Lon coordinates.

Source code in pyramids/dataset.py

@staticmethod
def get_x_lon_dimension_array(pivot_x, cell_size, columns) -> List[float]:
    """Build the list of X/Lon coordinates, one per column.

    Each coordinate is `pivot_x` shifted right by `i` whole cells plus half a cell
    (presumably the cell centre when `pivot_x` is the left edge of the raster).

    Args:
        pivot_x:
            X coordinate the columns are measured from.
        cell_size:
            Width of one cell.
        columns:
            Number of columns, i.e. number of coordinates to generate.

    Returns:
        List[float]:
            X coordinates in increasing column order.
    """
    coords = []
    for col in range(columns):
        coords.append(pivot_x + col * cell_size + cell_size / 2)
    return coords

`get_y_lat_dimension_array(pivot_y, cell_size, rows)` `staticmethod` #

Get Y/Lat coordinates.

Source code in pyramids/dataset.py

@staticmethod
def get_y_lat_dimension_array(pivot_y, cell_size, rows) -> List[float]:
    """Build the list of Y/Lat coordinates, one per row.

    Each coordinate is `pivot_y` shifted down by `i` whole cells plus half a cell
    (presumably the cell centre when `pivot_y` is the top edge of the raster).

    Args:
        pivot_y:
            Y coordinate the rows are measured from.
        cell_size:
            Height of one cell.
        rows:
            Number of rows, i.e. number of coordinates to generate.

    Returns:
        List[float]:
            Y coordinates in increasing row order (decreasing value for a positive cell size).
    """
    coords = []
    for row in range(rows):
        coords.append(pivot_y - row * cell_size - cell_size / 2)
    return coords

`get_block_arrangement(band=0, x_block_size=None, y_block_size=None)` #

Get Block Arrangement.

Parameters:

Name	Type	Description	Default
`band`	`int`	band index, by default 0	`0`
`x_block_size`	`int`	x block size/number of columns, by default None	`None`
`y_block_size`	`int`	y block size/number of rows, by default None	`None`

Returns:

Name	Type	Description
`DataFrame`	`DataFrame`	with the following columns: [x_offset, y_offset, window_xsize, window_ysize]

Examples:

Example of getting block arrangement:

>>> import numpy as np
>>> arr = np.random.rand(13, 14)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> df = dataset.get_block_arrangement(x_block_size=5, y_block_size=5)
>>> print(df)
   x_offset  y_offset  window_xsize  window_ysize
0         0         0             5             5
1         5         0             5             5
2        10         0             4             5
3         0         5             5             5
4         5         5             5             5
5        10         5             4             5
6         0        10             5             3
7         5        10             5             3
8        10        10             4             3

Source code in pyramids/dataset.py

def get_block_arrangement(
    self, band: int = 0, x_block_size: int = None, y_block_size: int = None
) -> DataFrame:
    """Tile the raster into read windows and list them in a table.

    The raster is walked row-block by row-block, and inside each row-block from left to right. Windows on
    the right and bottom edges are clipped so they never extend past the raster.

    Args:
        band (int, optional):
            band index used to look up the stored block size, by default 0
        x_block_size (int, optional):
            x block size/number of columns. If None, the x block size of `band` is used. By default None
        y_block_size (int, optional):
            y block size/number of rows. If None, the y block size of `band` is used. By default None

    Returns:
        DataFrame:
            with the following columns: [x_offset, y_offset, window_xsize, window_ysize]

    Examples:
        - Example of getting block arrangement:

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(13, 14)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          >>> df = dataset.get_block_arrangement(x_block_size=5, y_block_size=5)
          >>> print(df)
             x_offset  y_offset  window_xsize  window_ysize
          0         0         0             5             5
          1         5         0             5             5
          2        10         0             4             5
          3         0         5             5             5
          4         5         5             5             5
          5        10         5             4             5
          6         0        10             5             3
          7         5        10             5             3
          8        10        10             4             3

          ```
    """
    band_blocks = self.block_size[band]
    # Fall back to the block size stored for the band when the caller gives none.
    if x_block_size is None:
        x_block_size = band_blocks[0]
    if y_block_size is None:
        y_block_size = band_blocks[1]

    header = ["x_offset", "y_offset", "window_xsize", "window_ysize"]
    windows = []
    for row_start in range(0, self.rows, y_block_size):
        for col_start in range(0, self.columns, x_block_size):
            windows.append(
                {
                    "x_offset": col_start,
                    "y_offset": row_start,
                    # clip the last window in each direction to the raster edge
                    "window_xsize": min(x_block_size, self.columns - col_start),
                    "window_ysize": min(y_block_size, self.rows - row_start),
                }
            )

    return pd.DataFrame(windows, columns=header)

`copy(path=None)` #

Deep copy.

Parameters:

Name	Type	Description	Default
`path`	`str`	Destination path to save the copied dataset. If None is passed, the copied dataset will be created in memory.	`None`

Examples:

First, we will create a dataset with 1 band, 3 rows and 5 columns.

>>> import numpy as np
>>> arr = np.random.rand(3, 5)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> print(dataset)
<BLANKLINE>
            Cell size: 0.05
            Dimension: 3 * 5
            EPSG: 4326
            Number of Bands: 1
            Band names: ['Band_1']
            Mask: -9999.0
            Data type: float64
            File:...
<BLANKLINE>

Now, we will create a copy of the dataset.

>>> copied_dataset = dataset.copy(path="copy-dataset.tif")
>>> print(copied_dataset)
<BLANKLINE>
            Cell size: 0.05
            Dimension: 3 * 5
            EPSG: 4326
            Number of Bands: 1
            Band names: ['Band_1']
            Mask: -9999.0
            Data type: float64
            File: copy-dataset.tif
<BLANKLINE>

Now close the dataset.

>>> copied_dataset.close()

Source code in pyramids/dataset.py

def copy(self, path: str = None) -> "Dataset":
    """Create a copy of the dataset, either in memory or on disk.

    Args:
        path (str, optional):
            Destination path to save the copied dataset. If None is passed, the copied dataset will be created
            in memory (GDAL "MEM" driver); otherwise it is written with the "GTiff" driver.

    Returns:
        Dataset:
            A new Dataset wrapping the copy, created with "write" access.

    Examples:
        - First, we will create a dataset with 1 band, 3 rows and 5 columns.

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(3, 5)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          >>> print(dataset)
          <BLANKLINE>
                      Cell size: 0.05
                      Dimension: 3 * 5
                      EPSG: 4326
                      Number of Bands: 1
                      Band names: ['Band_1']
                      Mask: -9999.0
                      Data type: float64
                      File:...
          <BLANKLINE>

          ```

        - Now, we will create a copy of the dataset.

          ```python
          >>> copied_dataset = dataset.copy(path="copy-dataset.tif")
          >>> print(copied_dataset)
          <BLANKLINE>
                      Cell size: 0.05
                      Dimension: 3 * 5
                      EPSG: 4326
                      Number of Bands: 1
                      Band names: ['Band_1']
                      Mask: -9999.0
                      Data type: float64
                      File: copy-dataset.tif
          <BLANKLINE>

          ```

        - Now close the dataset.

          ```python
          >>> copied_dataset.close()

          ```

    """
    # No destination means an in-memory copy, which takes an empty path.
    in_memory = path is None
    driver_name = "MEM" if in_memory else "GTiff"
    destination = "" if in_memory else path

    gdal_driver = gdal.GetDriverByName(driver_name)
    copied = gdal_driver.CreateCopy(destination, self._raster)

    return Dataset(copied, access="write")

`close()` #

Close the dataset.

Source code in pyramids/dataset.py

def close(self):
    """Close the dataset.

    Flushes the raster cache and drops the reference to the underlying raster object.
    Calling `close` on a dataset that is already closed does nothing.
    """
    raster = self._raster
    if raster is None:
        # Already closed: there is nothing to flush, and calling FlushCache on None would raise.
        return
    raster.FlushCache()
    self._raster = None

`get_attribute_table(band=0)` #

Get the attribute table for a given band.

- Get the attribute table of a band.

Parameters:

Name	Type	Description	Default
`band`	`int`	Band index, the index starts from 0.	`0`

Returns:

Name	Type	Description
`DataFrame`	`DataFrame`	DataFrame with the attribute table.

Examples:

Read a dataset and fetch its attribute table:

>>> dataset = Dataset.read_file("examples/data/geotiff/south-america-mswep_1979010100.tif")
>>> df = dataset.get_attribute_table()
>>> print(df)
  Precipitation Range (mm)   Category              Description
0                     0-50        Low   Very low precipitation
1                   51-100   Moderate   Moderate precipitation
2                  101-200       High       High precipitation
3                  201-500  Very High  Very high precipitation
4                     >500    Extreme    Extreme precipitation

Source code in pyramids/dataset.py

def get_attribute_table(self, band: int = 0) -> Optional[DataFrame]:
    """Get the attribute table for a given band.

    The band is looked up with `_iloc`, its default raster attribute table (RAT) is fetched, and the RAT is
    converted to a DataFrame with `_attribute_table_to_df`.

    Args:
        band (int):
            Band index handed to `_iloc`. The examples of this class use zero-based indices, so the default
            `0` refers to the first band.

    Returns:
        Optional[DataFrame]:
            DataFrame with the attribute table, or None if the band has no default attribute table.

    Examples:
        - Read a dataset and fetch its attribute table:

          ```python
          >>> dataset = Dataset.read_file("examples/data/geotiff/south-america-mswep_1979010100.tif")
          >>> df = dataset.get_attribute_table()
          >>> print(df)
            Precipitation Range (mm)   Category              Description
          0                     0-50        Low   Very low precipitation
          1                   51-100   Moderate   Moderate precipitation
          2                  101-200       High       High precipitation
          3                  201-500  Very High  Very high precipitation
          4                     >500    Extreme    Extreme precipitation

          ```
    """
    rat = self._iloc(band).GetDefaultRAT()
    if rat is None:
        # the band carries no attribute table
        return None
    return self._attribute_table_to_df(rat)

`set_attribute_table(df, band=None)` #

Set the attribute table for a band.

The attribute table can be used to associate tabular data with the values of a raster band. This is particularly useful for categorical raster data, such as land cover classifications, where each pixel value corresponds to a category that has additional attributes (e.g., class name, color description).

Notes

The attribute table is stored in an xml file by the name of the raster file with the extension of .aux.xml.
Setting an attribute table to a band will overwrite the existing attribute table if it exists.
Setting an attribute table to a band does not need the dataset to be opened in a write mode.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	DataFrame with the attribute table.	required
`band`	`int`	Band index.	`None`

Examples:

First create a dataset:

>>> dataset = Dataset.create(
... cell_size=0.05, rows=10, columns=10, dtype="float32", bands=1, top_left_corner=(0, 0),
... epsg=4326, no_data_value=-9999
... )

Create a DataFrame with the attribute table:

>>> data = {
...     "Value": [1, 2, 3],
...     "ClassName": ["Forest", "Water", "Urban"],
...     "Color": ["#008000", "#0000FF", "#808080"],
... }
>>> df = pd.DataFrame(data)

Set the attribute table to the dataset:

>>> dataset.set_attribute_table(df, band=0)

Then the attribute table can be retrieved using the get_attribute_table method.
The content of the attribute table will be stored in an xml file by the name of the raster file with the extension of .aux.xml. The content of the file will be like the following:

    <PAMDataset>
      <PAMRasterBand band="1">
        <GDALRasterAttributeTable tableType="thematic">
          <FieldDefn index="0">
            <Name>Precipitation Range (mm)</Name>
            <Type>2</Type>
            <Usage>0</Usage>
          </FieldDefn>
          <FieldDefn index="1">
            <Name>Category</Name>
            <Type>2</Type>
            <Usage>0</Usage>
          </FieldDefn>
          <FieldDefn index="2">
            <Name>Description</Name>
            <Type>2</Type>
            <Usage>0</Usage>
          </FieldDefn>
          <Row index="0">
            <F>0-50</F>
            <F>Low</F>
            <F>Very low precipitation</F>
          </Row>
          <Row index="1">
            <F>51-100</F>
            <F>Moderate</F>
            <F>Moderate precipitation</F>
          </Row>
          <Row index="2">
            <F>101-200</F>
            <F>High</F>
            <F>High precipitation</F>
          </Row>
          <Row index="3">
            <F>201-500</F>
            <F>Very High</F>
            <F>Very high precipitation</F>
          </Row>
          <Row index="4">
            <F>&gt;500</F>
            <F>Extreme</F>
            <F>Extreme precipitation</F>
          </Row>
        </GDALRasterAttributeTable>
      </PAMRasterBand>
    </PAMDataset>

Source code in pyramids/dataset.py

def set_attribute_table(self, df: DataFrame, band: int = None) -> None:
    """Attach an attribute table to a raster band.

    An attribute table associates tabular data with the values of a raster band. It is most useful for
    categorical rasters (e.g., land cover classifications), where every pixel value stands for a category that
    carries extra attributes such as a class name or a color description.

    Notes:
        - The attribute table is stored in an xml file by the name of the raster file with the extension of .aux.xml.
        - Setting an attribute table to a band will overwrite the existing attribute table if it exists.
        - Setting an attribute table to a band does not need the dataset to be opened in a write mode.
        - NOTE(review): `band` is forwarded to `_iloc` unchanged, including the default `None`; confirm that
          `_iloc` accepts `None`, otherwise callers must always pass an explicit band index.

    Args:
        df (DataFrame):
            DataFrame holding the attribute table.
        band (int):
            Band index (the example below uses `0` for the only band of the dataset).

    Examples:
        - First create a dataset:

          ```python
          >>> dataset = Dataset.create(
          ... cell_size=0.05, rows=10, columns=10, dtype="float32", bands=1, top_left_corner=(0, 0),
          ... epsg=4326, no_data_value=-9999
          ... )

          ```

        - Create a DataFrame with the attribute table:

          ```python
          >>> data = {
          ...     "Value": [1, 2, 3],
          ...     "ClassName": ["Forest", "Water", "Urban"],
          ...     "Color": ["#008000", "#0000FF", "#808080"],
          ... }
          >>> df = pd.DataFrame(data)

          ```

        - Set the attribute table to the dataset:

          ```python
          >>> dataset.set_attribute_table(df, band=0)

          ```

        - Then the attribute table can be retrieved using the `get_attribute_table` method.
        - The content of the attribute table will be stored in an xml file by the name of the raster file with
          the extension of .aux.xml. The file has the following structure (this sample was written for a
          precipitation table, not for the DataFrame created above):

          ```xml

              <PAMDataset>
                <PAMRasterBand band="1">
                  <GDALRasterAttributeTable tableType="thematic">
                    <FieldDefn index="0">
                      <Name>Precipitation Range (mm)</Name>
                      <Type>2</Type>
                      <Usage>0</Usage>
                    </FieldDefn>
                    <FieldDefn index="1">
                      <Name>Category</Name>
                      <Type>2</Type>
                      <Usage>0</Usage>
                    </FieldDefn>
                    <FieldDefn index="2">
                      <Name>Description</Name>
                      <Type>2</Type>
                      <Usage>0</Usage>
                    </FieldDefn>
                    <Row index="0">
                      <F>0-50</F>
                      <F>Low</F>
                      <F>Very low precipitation</F>
                    </Row>
                    <Row index="1">
                      <F>51-100</F>
                      <F>Moderate</F>
                      <F>Moderate precipitation</F>
                    </Row>
                    <Row index="2">
                      <F>101-200</F>
                      <F>High</F>
                      <F>High precipitation</F>
                    </Row>
                    <Row index="3">
                      <F>201-500</F>
                      <F>Very High</F>
                      <F>Very high precipitation</F>
                    </Row>
                    <Row index="4">
                      <F>&gt;500</F>
                      <F>Extreme</F>
                      <F>Extreme precipitation</F>
                    </Row>
                  </GDALRasterAttributeTable>
                </PAMRasterBand>
              </PAMDataset>

          ```
    """
    # convert first, then resolve the band and attach the table to it
    attribute_table = self._df_to_attribute_table(df)
    self._iloc(band).SetDefaultRAT(attribute_table)

`add_band(array, unit=None, attribute_table=None, inplace=False)` #

Add a new band to the dataset.

Parameters:

Name	Type	Description	Default
`array`	`ndarray`	2D array to add as a new band.	required
`unit`	`Any`	Unit of the values in the new band.	`None`
`attribute_table`	`DataFrame`	Attribute table provides a way to associate tabular data with the values of a raster band. This is particularly useful for categorical raster data, such as land cover classifications, where each pixel value corresponds to a category that has additional attributes (e.g., class name, color, description). Default is None.	`None`
`inplace`	`bool`	If True the new band will be added to the current dataset, if False the new band will be added to a new dataset. Default is False.	`False`

Returns:

Type	Description
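`Union[None, Dataset]`	None when `inplace` is True (the current dataset is updated); otherwise a new `Dataset` that contains the additional band.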

Examples:

First create a dataset:

>>> dataset = Dataset.create(
... cell_size=0.05, rows=10, columns=10, dtype="float32", bands=1, top_left_corner=(0, 0),
... epsg=4326, no_data_value=-9999
... )
>>> print(dataset)
<BLANKLINE>
            Cell size: 0.05
            Dimension: 10 * 10
            EPSG: 4326
            Number of Bands: 1
            Band names: ['Band_1']
            Mask: -9999.0
            Data type: float32
            File:...
<BLANKLINE>

Create a 2D array to add as a new band:

>>> import numpy as np
>>> array = np.random.rand(10, 10)

Add the new band to the dataset inplace:

>>> dataset.add_band(array, unit="m", attribute_table=None, inplace=True)
>>> print(dataset)
<BLANKLINE>
            Cell size: 0.05
            Dimension: 10 * 10
            EPSG: 4326
            Number of Bands: 2
            Band names: ['Band_1', 'Band_2']
            Mask: -9999.0
            Data type: float32
            File:...
<BLANKLINE>

The new band will be added to the dataset inplace.
You can also add an attribute table to the band when you add a new band to the dataset.

>>> import pandas as pd
>>> data = {
...     "Value": [1, 2, 3],
...     "ClassName": ["Forest", "Water", "Urban"],
...     "Color": ["#008000", "#0000FF", "#808080"],
... }
>>> df = pd.DataFrame(data)
>>> dataset.add_band(array, unit="m", attribute_table=df, inplace=True)

See Also

- Dataset.create_from_array: create a new dataset from an array.
- Dataset.create: create a new dataset with an empty band.
- Dataset.dataset_like: create a new dataset from another dataset.
- Dataset.get_attribute_table: get the attribute table for a specific band.
- Dataset.set_attribute_table: set the attribute table for a specific band.

Source code in pyramids/dataset.py

def add_band(
    self,
    array: np.ndarray,
    unit: Any = None,
    attribute_table: DataFrame = None,
    inplace: bool = False,
) -> Union[None, "Dataset"]:
    """Add a new band to the dataset.

    All inputs (array shape, write access, attribute table, data type) are checked/converted before the raster
    is modified, so a rejected input does not leave a partially added band behind.

    Args:
        array (np.ndarray):
            2D array to add as a new band. Its shape must be (rows, columns) of the dataset.
        unit (Any, optional):
            Unit of the values in the new band.
        attribute_table (DataFrame, optional):
            Attribute table provides a way to associate tabular data with the values of a raster band. This is
            particularly useful for categorical raster data, such as land cover classifications, where each pixel
            value corresponds to a category that has additional attributes (e.g., class name, color, description).
            Default is None.
        inplace (bool, optional):
            If True the new band will be added to the current dataset, if False the new band will be added to a
            new dataset. Default is False.

    Returns:
        Union[None, Dataset]:
            None if `inplace` is True (the current dataset is re-initialized with the extended raster),
            otherwise a new Dataset built on an in-memory ("MEM" driver) copy that holds the additional band.

    Raises:
        ValueError:
            If the array is not 2D, if its shape differs from the raster dimensions, or if `inplace` is True
            while the dataset access is "read_only".

    Examples:
        - First create a dataset:

          ```python
          >>> dataset = Dataset.create(
          ... cell_size=0.05, rows=10, columns=10, dtype="float32", bands=1, top_left_corner=(0, 0),
          ... epsg=4326, no_data_value=-9999
          ... )
          >>> print(dataset)
          <BLANKLINE>
                      Cell size: 0.05
                      Dimension: 10 * 10
                      EPSG: 4326
                      Number of Bands: 1
                      Band names: ['Band_1']
                      Mask: -9999.0
                      Data type: float32
                      File:...
          <BLANKLINE>

          ```

        - Create a 2D array to add as a new band:

          ```python
          >>> import numpy as np
          >>> array = np.random.rand(10, 10)

          ```

        - Add the new band to the dataset inplace:

          ```python
          >>> dataset.add_band(array, unit="m", attribute_table=None, inplace=True)
          >>> print(dataset)
          <BLANKLINE>
                      Cell size: 0.05
                      Dimension: 10 * 10
                      EPSG: 4326
                      Number of Bands: 2
                      Band names: ['Band_1', 'Band_2']
                      Mask: -9999.0
                      Data type: float32
                      File:...
          <BLANKLINE>

          ```

        - The new band will be added to the dataset inplace.
        - You can also add an attribute table to the band when you add a new band to the dataset.

          ```python
          >>> import pandas as pd
          >>> data = {
          ...     "Value": [1, 2, 3],
          ...     "ClassName": ["Forest", "Water", "Urban"],
          ...     "Color": ["#008000", "#0000FF", "#808080"],
          ... }
          >>> df = pd.DataFrame(data)
          >>> dataset.add_band(array, unit="m", attribute_table=df, inplace=True)

          ```

    See Also:
        Dataset.create_from_array: create a new dataset from an array.
        Dataset.create: create a new dataset with an empty band.
        Dataset.dataset_like: create a new dataset from another dataset.
        Dataset.get_attribute_table: get the attribute table for a specific band.
        Dataset.set_attribute_table: Set the attribute table for a specific band.
    """
    # check the dimensions of the new array
    if array.ndim != 2:
        raise ValueError("The array must be 2D.")
    if array.shape[0] != self.rows or array.shape[1] != self.columns:
        raise ValueError(
            "The array must have the same dimensions as the raster. "
            f"Expected ({self.rows}, {self.columns}), got {tuple(array.shape)}."
        )
    # an in-place edit requires the dataset to be opened in a write mode
    if inplace and self.access == "read_only":
        raise ValueError("The dataset is not opened in a write mode.")

    # convert everything that can fail BEFORE the raster is touched, so that a bad attribute table or an
    # unsupported dtype cannot leave the (in-place) dataset with a half-initialized extra band
    rat = None
    if attribute_table is not None:
        rat = Dataset._df_to_attribute_table(attribute_table)
    dtype = numpy_to_gdal_dtype(array.dtype)

    if inplace:
        src = self._raster
    else:
        # work on an in-memory copy and leave the current dataset untouched
        src = gdal.GetDriverByName("MEM").CreateCopy("", self._raster)

    num_bands = src.RasterCount
    src.AddBand(dtype, [])
    # gdal bands are 1-based, so the band that was just appended is at num_bands + 1
    band = src.GetRasterBand(num_bands + 1)

    if unit is not None:
        band.SetUnitType(unit)

    if rat is not None:
        # attach the attribute table to the new raster band
        band.SetDefaultRAT(rat)

    band.WriteArray(array)

    if inplace:
        # refresh the cached properties of the current object from the extended raster
        self.__init__(src, self.access)
    else:
        return Dataset(src, self.access)

`stats(band=None, mask=None)` #

Get statistics of a band [Min, max, mean, std].

Parameters:

Name	Type	Description	Default
`band`	`int`	Band index. If None, the statistics of all bands will be returned.	`None`
`mask`	`Polygon GeoDataFrame or Dataset`	GeodataFrame with a geometry of polygon type.	`None`

Returns:

Name Type Description

DataFrame

DataFrame with the stats of each band. The dataframe has the columns [min, max, mean, std], and its index is the band names.

                   Min         max        mean       std
    Band_1  270.369720  270.762299  270.551361  0.154270
    Band_2  269.611938  269.744751  269.673645  0.043788
    Band_3  273.641479  274.168823  273.953979  0.198447
    Band_4  273.991516  274.540344  274.310669  0.205754

Notes

The value of the stats will be stored in an xml file by the name of the raster file with the extension of .aux.xml.
The content of the file will be like the following:

    <PAMDataset>
      <PAMRasterBand band="1">
        <Description>Band_1</Description>
        <Metadata>
          <MDI key="RepresentationType">ATHEMATIC</MDI>
          <MDI key="STATISTICS_MAXIMUM">88</MDI>
          <MDI key="STATISTICS_MEAN">7.9662921348315</MDI>
          <MDI key="STATISTICS_MINIMUM">0</MDI>
          <MDI key="STATISTICS_STDDEV">18.294377743948</MDI>
          <MDI key="STATISTICS_VALID_PERCENT">48.9</MDI>
        </Metadata>
      </PAMRasterBand>
    </PAMDataset>

Examples:

Get the statistics of all bands in the dataset:

>>> import numpy as np
>>> arr = np.random.rand(4, 10, 10)
>>> geotransform = (0, 0.05, 0, 0, 0, -0.05)
>>> dataset = Dataset.create_from_array(arr, geo=geotransform, epsg=4326)
>>> print(dataset.stats()) # doctest: +SKIP
             min       max      mean       std
Band_1  0.006443  0.942943  0.468935  0.266634
Band_2  0.020377  0.978130  0.477189  0.306864
Band_3  0.019652  0.992184  0.537215  0.286502
Band_4  0.011955  0.984313  0.503616  0.295852
>>> print(dataset.stats(band=1))  # doctest: +SKIP
             min      max      mean       std
Band_2  0.020377  0.97813  0.477189  0.306864

Get the statistics of all the bands using a mask polygon.

Create the polygon using a shapely Polygon, and use the bounds xmin, ymin, xmax, ymax = [0.1, -0.2, 0.2, -0.1] to cover the 4 cells.

>>> from shapely.geometry import Polygon
>>> import geopandas as gpd
>>> mask = gpd.GeoDataFrame(geometry=[Polygon([(0.1, -0.1), (0.1, -0.2), (0.2, -0.2), (0.2, -0.1)])],crs=4326)
>>> print(dataset.stats(mask=mask))  # doctest: +SKIP
             min       max      mean       std
Band_1  0.193441  0.702108  0.541478  0.202932
Band_2  0.281281  0.932573  0.665602  0.239410
Band_3  0.031395  0.982235  0.493086  0.377608
Band_4  0.079562  0.930965  0.591025  0.341578

Source code in pyramids/dataset.py

def stats(self, band: int = None, mask: GeoDataFrame = None) -> DataFrame:
    """Get statistics of a band [min, max, mean, std].

    If a mask is given, the dataset is first cropped to the mask (with `touch=True`) and the statistics are
    read from the cropped dataset; the band names used as the index always come from the current dataset.

    Args:
        band (int, optional):
            Band index. If None, the statistics of all bands will be returned.
        mask (Polygon GeoDataFrame or Dataset, optional):
            GeodataFrame with a geometry of polygon type.

    Returns:
        DataFrame:
            DataFrame with the stats of each band, the dataframe has the following columns
            [min, max, mean, std], the index of the dataframe is the band names.

            ```text

                               min         max        mean       std
                Band_1  270.369720  270.762299  270.551361  0.154270
                Band_2  269.611938  269.744751  269.673645  0.043788
                Band_3  273.641479  274.168823  273.953979  0.198447
                Band_4  273.991516  274.540344  274.310669  0.205754
            ```

    Notes:
        - The value of the stats will be stored in an xml file by the name of the raster file with the extension of
          .aux.xml.
        - The content of the file will be like the following:

          ```xml

              <PAMDataset>
                <PAMRasterBand band="1">
                  <Description>Band_1</Description>
                  <Metadata>
                    <MDI key="RepresentationType">ATHEMATIC</MDI>
                    <MDI key="STATISTICS_MAXIMUM">88</MDI>
                    <MDI key="STATISTICS_MEAN">7.9662921348315</MDI>
                    <MDI key="STATISTICS_MINIMUM">0</MDI>
                    <MDI key="STATISTICS_STDDEV">18.294377743948</MDI>
                    <MDI key="STATISTICS_VALID_PERCENT">48.9</MDI>
                  </Metadata>
                </PAMRasterBand>
              </PAMDataset>

          ```

    Examples:
        - Get the statistics of all bands in the dataset:

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(4, 10, 10)
          >>> geotransform = (0, 0.05, 0, 0, 0, -0.05)
          >>> dataset = Dataset.create_from_array(arr, geo=geotransform, epsg=4326)
          >>> print(dataset.stats()) # doctest: +SKIP
                       min       max      mean       std
          Band_1  0.006443  0.942943  0.468935  0.266634
          Band_2  0.020377  0.978130  0.477189  0.306864
          Band_3  0.019652  0.992184  0.537215  0.286502
          Band_4  0.011955  0.984313  0.503616  0.295852
          >>> print(dataset.stats(band=1))  # doctest: +SKIP
                       min      max      mean       std
          Band_2  0.020377  0.97813  0.477189  0.306864

          ```

        - Get the statistics of all the bands using a mask polygon.

          - Create the polygon using shapely polygon, and use the xmin, ymin, xmax, ymax = [0.1, -0.2,
            0.2, -0.1] to cover the 4 cells.

          ```python
          >>> from shapely.geometry import Polygon
          >>> import geopandas as gpd
          >>> mask = gpd.GeoDataFrame(geometry=[Polygon([(0.1, -0.1), (0.1, -0.2), (0.2, -0.2), (0.2, -0.1)])],crs=4326)
          >>> print(dataset.stats(mask=mask))  # doctest: +SKIP
                       min       max      mean       std
          Band_1  0.193441  0.702108  0.541478  0.202932
          Band_2  0.281281  0.932573  0.665602  0.239410
          Band_3  0.031395  0.982235  0.493086  0.377608
          Band_4  0.079562  0.930965  0.591025  0.341578

          ```

    """
    # statistics are read from the cropped dataset when a mask is given, from this dataset otherwise
    source = self.crop(mask, touch=True) if mask is not None else self

    if band is None:
        # one row per band
        row_labels = self.band_names
        band_indices = range(self.band_count)
    else:
        # a single row for the requested band
        row_labels = [self.band_names[band]]
        band_indices = [band]

    table = pd.DataFrame(
        index=row_labels,
        columns=["min", "max", "mean", "std"],
        dtype=np.float32,
    )
    for row, band_index in enumerate(band_indices):
        table.iloc[row, :] = source._get_stats(band_index)

    return table

`plot(band=None, exclude_value=None, rgb=None, surface_reflectance=None, cutoff=None, overview=False, overview_index=0, percentile=None, **kwargs)` #

Plot the values/overviews of a given band.

The plot function uses the cleopatra as a backend to plot the raster data, for more information check ArrayGlyph.

Parameters:

Name	Type	Description	Default
`band`	`int`	The band you want to get its data. Default is 0.	`None`
`exclude_value`	`Any`	Value to exclude from the plot. Default is None.	`None`
`rgb`	`List[int]`	The indices of the red, green, and blue bands in the `Dataset`. The `rgb` parameter can be a list of three values, or a list of four values if the alpha band is also included. The `plot` method checks whether the rgb bands are defined in the `Dataset`: if all three bands (red, green, blue) are defined, the method uses them to plot the real image; if not, the rgb bands are taken as [2,1,0], the default order for Sentinel tif files.	`None`
`surface_reflectance`	`int`	Surface reflectance value for normalizing satellite data, by default None. Typically 10000 for Sentinel-2 data.	`None`
`cutoff`	`List`	clip the range of pixel values for each band. (take only the pixel values from 0 to the value of the cutoff and scale them back to between 0 and 1). Default is None.	`None`
`overview`	`bool`	True if you want to plot the overview. Default is False.	`False`
`overview_index`	`int`	Index of the overview. Default is 0.	`0`
`percentile`	`Optional[int]`	int The percentile value to be used for scaling.	`None`

kwargs:

| Parameter | Type | Description |
|-----------|------|-------------|
| points | array | 3 column array with the first column as the value to display for the point, the second as the row index, and the third as the column index in the array. The second and third columns tell the location of the point. |
| point_color | str | Color of the point. |
| point_size | Any | Size of the point. |
| pid_color | str | Color of the annotation of the point. Default is blue. |
| pid_size | Any | Size of the point annotation. |
| figsize | tuple, optional | Figure size. Default is (8, 8). |
| title | str, optional | Title of the plot. Default is 'Total Discharge'. |
| title_size | int, optional | Title size. Default is 15. |
| orientation | str, optional | Orientation of the color bar (horizontal or vertical). Default is 'vertical'. |
| rotation | number, optional | Rotation of the color bar label. Default is -90. |
| cbar_length | float, optional | Ratio to control the height of the color bar. Default is 0.75. |
| ticks_spacing | int, optional | Spacing between color bar ticks. Default is 2. |
| cbar_label_size | int, optional | Size of the color bar label. Default is 12. |
| cbar_label | str, optional | Label of the color bar. Default is 'Discharge m³/s'. |
| color_scale | int, optional | Scale mode for colors. Options: 1 = normal, 2 = power, 3 = SymLogNorm, 4 = PowerNorm, 5 = BoundaryNorm. Default is 1. |
| gamma | float, optional | Value needed for color scale option 2. Default is 1/2. |
| line_threshold | float, optional | Value needed for color scale option 3. Default is 0.0001. |
| line_scale | float, optional | Value needed for color scale option 3. Default is 0.001. |
| bounds | list, optional | Discrete bounds for color scale option 4. Default is None. |
| midpoint | float, optional | Value needed for color scale option 5. Default is 0. |
| cmap | str, optional | Color map style. Default is 'coolwarm_r'. |
| display_cell_value | bool, optional | Whether to display cell values as text. |
| num_size | int, optional | Size of numbers plotted on top of each cell. Default is 8. |
| background_color_threshold | float or int, optional | Threshold for deciding text color over cells: if value > threshold → black text; else white text. If None, max value / 2 is used. Default is None. |

Returns:

Name	Type	Description
`ArrayGlyph`	`ArrayGlyph`	ArrayGlyph object. For more details of the ArrayGlyph object check the ArrayGlyph.

Examples:

Plot a certain band:

>>> import numpy as np
>>> arr = np.random.rand(4, 10, 10)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
>>> dataset.plot(band=0)
(<Figure size 800x800 with 2 Axes>, <Axes: >)

plot using power scale.

>>> dataset.plot(band=0, color_scale="power")
(<Figure size 800x800 with 2 Axes>, <Axes: >)

plot using SymLogNorm scale.

>>> dataset.plot(band=0, color_scale="sym-lognorm")
(<Figure size 800x800 with 2 Axes>, <Axes: >)

plot using BoundaryNorm scale.

>>> dataset.plot(band=0, color_scale="boundary-norm", bounds=[0, 0.2, 0.4, 0.6, 0.8, 1])
(<Figure size 800x800 with 2 Axes>, <Axes: >)

plot using midpoint scale.

>>> dataset.plot(band=0, color_scale="midpoint")
(<Figure size 800x800 with 2 Axes>, <Axes: >)

Source code in pyramids/dataset.py

def plot(
    self,
    band: Optional[int] = None,
    exclude_value: Optional[Any] = None,
    rgb: Optional[List[int]] = None,
    surface_reflectance: Optional[int] = None,
    cutoff: Optional[List] = None,
    overview: Optional[bool] = False,
    overview_index: Optional[int] = 0,
    percentile: Optional[int] = None,
    **kwargs: Any,
) -> "ArrayGlyph":
    """Plot the values/overviews of a given band.

    The plot function uses `cleopatra` as a backend to plot the raster data, for more information check
    [ArrayGlyph](https://serapieum-of-alex.github.io/cleopatra/latest/api/array-glyph-class/#cleopatra.array_glyph.ArrayGlyph.plot).

    Args:
        band (int, optional):
            Index of the band to plot. Default is None. When None, the array is read with `band=None`; the
            band used to look up the no-data value is then `0` for datasets with fewer than three bands, and
            the first entry of `rgb` (the red band) for datasets with three bands or more.
        exclude_value (Any, optional):
            Extra value to exclude from the plot, in addition to the no-data value of the band.
            Default is None.
        rgb (List[int], optional):
            The indices of the red, green, and blue bands in the `Dataset`. The `rgb` parameter can be a list
            of three values, or a list of four values if the alpha band is also included.
            If `rgb` is not given (and `band` is None) for a dataset with three bands or more, the indices are
            looked up with `get_band_by_color` for "red", "green" and "blue"; if any of them is not defined,
            `[2, 1, 0]` is used as the default order for sentinel tif files.
        surface_reflectance (int, optional):
            Surface reflectance value for normalizing satellite data. Default is None.
            Typically 10000 for Sentinel-2 data.
        cutoff (List, optional):
            Clip the range of pixel values for each band (take only the pixel values from 0 to the value of
            the cutoff and scale them back to between 0 and 1). Default is None.
        overview (bool, optional):
            True to plot the overview (read with `read_overview_array`) instead of the full-resolution array.
            Default is False.
        overview_index (int, optional):
            Index of the overview. Default is 0.
        percentile (int, optional):
            The percentile value to be used for scaling. Default is None.
        **kwargs:
            Forwarded both to the `ArrayGlyph` constructor and to `ArrayGlyph.plot`.

            | Parameter                    | Type                   | Description |
            |------------------------------|------------------------|-------------|
            | `points`                     | array                  | 3 column array with the first column as the value to display for the point, the second as the row index, and the third as the column index in the array. The second and third columns tell the location of the point. |
            | `point_color`                | str                    | Color of the point. |
            | `point_size`                 | Any                    | Size of the point. |
            | `pid_color`                  | str                    | Color of the annotation of the point. Default is blue. |
            | `pid_size`                   | Any                    | Size of the point annotation. |
            | `figsize`                    | tuple, optional        | Figure size. Default is `(8, 8)`. |
            | `title`                      | str, optional          | Title of the plot. Default is `'Total Discharge'`. |
            | `title_size`                 | int, optional          | Title size. Default is `15`. |
            | `orientation`                | str, optional          | Orientation of the color bar (`horizontal` or `vertical`). Default is `'vertical'`. |
            | `rotation`                   | number, optional       | Rotation of the color bar label. Default is `-90`. |
            | `cbar_length`                | float, optional        | Ratio to control the height of the color bar. Default is `0.75`. |
            | `ticks_spacing`              | int, optional          | Spacing between color bar ticks. Default is `2`. |
            | `cbar_label_size`            | int, optional          | Size of the color bar label. Default is `12`. |
            | `cbar_label`                 | str, optional          | Label of the color bar. Default is `'Discharge m³/s'`. |
            | `color_scale`                | int, optional          | Scale mode for colors. Options: 1 = normal, 2 = power, 3 = SymLogNorm, 4 = PowerNorm, 5 = BoundaryNorm. Default is `1`. |
            | `gamma`                      | float, optional        | Value needed for color scale option 2. Default is `1/2`. |
            | `line_threshold`             | float, optional        | Value needed for color scale option 3. Default is `0.0001`. |
            | `line_scale`                 | float, optional        | Value needed for color scale option 3. Default is `0.001`. |
            | `bounds`                     | list, optional         | Discrete bounds for color scale option 4. Default is `None`. |
            | `midpoint`                   | float, optional        | Value needed for color scale option 5. Default is `0`. |
            | `cmap`                       | str, optional          | Color map style. Default is `'coolwarm_r'`. |
            | `display_cell_value`         | bool, optional         | Whether to display cell values as text. |
            | `num_size`                   | int, optional          | Size of numbers plotted on top of each cell. Default is `8`. |
            | `background_color_threshold` | float or int, optional | Threshold for deciding text color over cells: if value > threshold → black text; else white text. If `None`, max value / 2 is used. Default is `None`. |

            NOTE(review): the table lists integer `color_scale` options while the examples below pass string
            names (e.g. `"power"`); confirm the accepted values against the cleopatra documentation.

    Returns:
        ArrayGlyph:
            The ArrayGlyph object, after its `plot` method has been called. For more details of the ArrayGlyph
            object check the [ArrayGlyph](https://serapieum-of-alex.github.io/cleopatra/latest/api/array-glyph-class/).

    Examples:
        - Plot a certain band:

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(4, 10, 10)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
          >>> glyph = dataset.plot(band=0)

          ```

        - plot using power scale.

          ```python
          >>> glyph = dataset.plot(band=0, color_scale="power")

          ```

        - plot using SymLogNorm scale.

          ```python
          >>> glyph = dataset.plot(band=0, color_scale="sym-lognorm")

          ```

        - plot using BoundaryNorm scale (discrete `bounds`).

          ```python
          >>> glyph = dataset.plot(band=0, color_scale="boundary-norm", bounds=[0, 0.2, 0.4, 0.6, 0.8, 1])

          ```

        - plot using midpoint scale.

          ```python
          >>> glyph = dataset.plot(band=0, color_scale="midpoint")

          ```
    """
    import_cleopatra(
        "The current function uses cleopatra package to for plotting, please install it manually, for more info "
        "check https://github.com/Serapieum-of-alex/cleopatra"
    )
    from cleopatra.array_glyph import ArrayGlyph

    # Bands that have no no-data value are represented by NaN.
    band_no_data = [
        value if value is not None else np.nan for value in self.no_data_value
    ]

    # Read either the requested overview level or the full-resolution array.
    arr = (
        self.read_overview_array(band=band, overview_index=overview_index)
        if overview
        else self.read_array(band=band)
    )

    has_rgb_bands = self.band_count >= 3
    if band is None:
        if not has_rgb_bands:
            band = 0
        else:
            if rgb is None:
                rgb = [
                    self.get_band_by_color(color)
                    for color in ("red", "green", "blue")
                ]
                # Fall back to the default band order when a color is not defined.
                if None in rgb:
                    rgb = [2, 1, 0]
            # The red band decides which no-data value is excluded.
            band = rgb[0]

    excluded = [band_no_data[band]]
    if exclude_value is not None:
        excluded.append(exclude_value)

    glyph = ArrayGlyph(
        arr,
        exclude_value=excluded,
        extent=self.bbox,
        rgb=rgb,
        surface_reflectance=surface_reflectance,
        cutoff=cutoff,
        percentile=percentile,
        **kwargs,
    )
    glyph.plot(**kwargs)
    return glyph

`translate(path=None, **kwargs)` #

Translate.

The translate function can be used to:

- Convert Between Formats: Convert a raster from one format to another (e.g., from GeoTIFF to JPEG).
- Subset: Extract a subregion of a raster.
- Resample: Change the resolution of a raster.
- Reproject: Change the coordinate reference system of a raster.
- Scale Values: Scale pixel values to a new range.
- Change Data Type: Convert the data type of the raster.
- Apply Compression: Apply compression to the output raster.
- Apply No-Data Values: Define no-data values for the output raster.

Parameters #

path: str, optional, default is None.
    path to save the output, if None, the output will be saved in memory.

kwargs:

- unscale: unscale values with scale and offset metadata.
- scaleParams: list of scale parameters, each of the form [src_min,src_max] or [src_min,src_max,dst_min,dst_max]
- outputType: output type (gdalconst.GDT_Byte, etc...)
- exponents: list of exponentiation parameters
- bandList: array of band numbers (index start at 1)
- maskBand: mask band to generate or not ("none", "auto", "mask", 1, ...)
- creationOptions: list or dict of creation options
- srcWin: subwindow in pixels to extract: [left_x, top_y, width, height]
- projWin: subwindow in projected coordinates to extract: [ulx, uly, lrx, lry]
- projWinSRS: SRS in which projWin is expressed
- outputBounds: assigned output bounds: [ulx, uly, lrx, lry]
- outputGeotransform: assigned geotransform matrix (array of 6 values) (mutually exclusive with outputBounds)
- metadataOptions: list or dict of metadata options
- outputSRS: assigned output SRS
- noData: nodata value (or "none" to unset it)
- rgbExpand: Color palette expansion mode: "gray", "rgb", "rgba"
- xmp: whether to copy XMP metadata
- resampleAlg: resampling mode
- overviewLevel: To specify which overview level of source files must be used
- domainMetadataOptions: list or dict of domain-specific metadata options

Returns #

Dataset

Examples #

Scale & offset: - the translate function can be used to get rid of the scale and offset that are used to manipulate the dataset, to get the real values of the dataset.

Scale:
    - First we will create a dataset from a float32 array with values between 1 and 10, and then we will
        assign a scale of 0.1 to the dataset.

        >>> import numpy as np
        >>> arr = np.random.randint(1, 10, size=(5, 5)).astype(np.float32)
        >>> print(arr) # doctest: +SKIP
        [[5. 5. 3. 4. 2.]
         [2. 5. 5. 8. 5.]
         [7. 5. 6. 1. 2.]
         [6. 8. 1. 5. 8.]
         [2. 5. 2. 2. 9.]]
        >>> top_left_corner = (0, 0)
        >>> cell_size = 0.05
        >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
        >>> print(dataset)
        <BLANKLINE>
                    Top Left Corner: (0.0, 0.0)
                    Cell size: 0.05
                    Dimension: 5 * 5
                    EPSG: 4326
                    Number of Bands: 1
                    Band names: ['Band_1']
                    Band colors: {0: 'undefined'}
                    Band units: ['']
                    Scale: [1.0]
                    Offset: [0]
                    Mask: -9999.0
                    Data type: float32
                    File: ...
        <BLANKLINE>
        >>> dataset.scale = [0.1]

    - now lets unscale the dataset values.

        >>> unscaled_dataset = dataset.translate(unscale=True)
        >>> print(unscaled_dataset) # doctest: +SKIP
        <BLANKLINE>
                    Top Left Corner: (0.0, 0.0)
                    Cell size: 0.05
                    Dimension: 5 * 5
                    EPSG: 4326
                    Number of Bands: 1
                    Band names: ['Band_1']
                    Band colors: {0: 'undefined'}
                    Band units: ['']
                    Scale: [1.0]
                    Offset: [0]
                    Mask: -9999.0
                    Data type: float32
                    File:
        <BLANKLINE>
        >>> print(unscaled_dataset.read_array()) # doctest: +SKIP
        [[0.5 0.5 0.3 0.4 0.2]
         [0.2 0.5 0.5 0.8 0.5]
         [0.7 0.5 0.6 0.1 0.2]
         [0.6 0.8 0.1 0.5 0.8]
         [0.2 0.5 0.2 0.2 0.9]]

offset:
    - You can also unshift the values of the dataset if the dataset has an offset. To remove the offset from
        all values in the dataset, you can read the values using the `read_array` and then add the offset value
        to the array. we will create a dataset from the same array we created above (values are between 1, and 10)
        with an offset of 100.

        >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
        >>> print(dataset)
        <BLANKLINE>
                    Top Left Corner: (0.0, 0.0)
                    Cell size: 0.05
                    Dimension: 5 * 5
                    EPSG: 4326
                    Number of Bands: 1
                    Band names: ['Band_1']
                    Band colors: {0: 'undefined'}
                    Band units: ['']
                    Scale: [1.0]
                    Offset: [0]
                    Mask: -9999.0
                    Data type: float32
                    File: ...
        <BLANKLINE>

    - set the offset to 100.

        >>> dataset.offset = [100]

    - check if the offset has been set.

        >>> print(dataset.offset)
        [100.0]

    - now lets unscale the dataset values.

        >>> unscaled_dataset = dataset.translate(unscale=True)
        >>> print(unscaled_dataset.read_array()) # doctest: +SKIP
        [[105. 105. 103. 104. 102.]
         [102. 105. 105. 108. 105.]
         [107. 105. 106. 101. 102.]
         [106. 108. 101. 105. 108.]
         [102. 105. 102. 102. 109.]]

    - as you see, all the values have been shifted by 100. now if you check the offset of the dataset

        >>> print(unscaled_dataset.offset)
        [0]

Offset and Scale together:
    - we can unscale and get rid of the offset at the same time.

        >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)

    - set the offset to 100, and a scale of 0.1.

        >>> dataset.offset = [100]
        >>> dataset.scale = [0.1]

    - check if the offset has been set.

        >>> print(dataset.offset)
        [100.0]
        >>> print(dataset.scale)
        [0.1]

    - now lets unscale the dataset values.

        >>> unscaled_dataset = dataset.translate(unscale=True)
        >>> print(unscaled_dataset.read_array()) # doctest: +SKIP
        [[100.5 100.5 100.3 100.4 100.2]
         [100.2 100.5 100.5 100.8 100.5]
         [100.7 100.5 100.6 100.1 100.2]
         [100.6 100.8 100.1 100.5 100.8]
         [100.2 100.5 100.2 100.2 100.9]]

    - Now you can see that the values were multiplied first by the scale; then the offset value was added.
        `value * scale + offset`

        >>> print(unscaled_dataset.offset)
        [0]
        >>> print(unscaled_dataset.scale)
        [1.0]

Scale between two values

You can scale the values of the dataset to a new range; for example, the snippet below scales the values from the range 1–9 to the range 0–255.

>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
>>> print(dataset.stats()) # doctest: +SKIP
        min  max  mean      std
Band_1  1.0  9.0   4.0  2.19089
>>> scaled_dataset = dataset.translate(scaleParams=[[1, 9, 0, 255]], outputType=gdal.GDT_Byte)
>>> print(scaled_dataset.read_array()) # doctest: +SKIP
[[128 128  64  96  32]
 [ 32 128 128 223 128]
 [191 128 159   0  32]
 [159 223   0 128 223]
 [ 32 128  32  32 255]]

Source code in pyramids/dataset.py

def translate(self, path: Optional[str] = None, **kwargs) -> "Dataset":
    """Translate the dataset using `gdal.Translate`.

    The translate function can be used to:

    - Convert Between Formats: Convert a raster from one format to another (e.g., from GeoTIFF to JPEG) by
      passing the `format` keyword together with `path`.
    - Subset: Extract a subregion of a raster.
    - Resample: Change the resolution of a raster.
    - Reproject: Change the coordinate reference system of a raster.
    - Scale Values: Scale pixel values to a new range.
    - Change Data Type: Convert the data type of the raster.
    - Apply Compression: Apply compression to the output raster.
    - Apply No-Data Values: Define no-data values for the output raster.

    Args:
        path (str, optional):
            Path to save the output. If None, the output is created with an empty destination name and,
            unless `format` is given, the "MEM" driver. Default is None.
        **kwargs:
            Keyword arguments forwarded to `gdal.TranslateOptions`:

            - `format`: output driver name. If omitted (or None), "MEM" is used when `path` is None and
              "GTiff" otherwise.
            - `unscale`: unscale values with scale and offset metadata.
            - `scaleParams`: list of scale parameters, each of the form [src_min,src_max] or
              [src_min,src_max,dst_min,dst_max]
            - `outputType`: output type (gdalconst.GDT_Byte, etc...)
            - `exponents`: list of exponentiation parameters
            - `bandList`: array of band numbers (index start at 1)
            - `maskBand`: mask band to generate or not ("none", "auto", "mask", 1, ...)
            - `creationOptions`: list or dict of creation options
            - `srcWin`: subwindow in pixels to extract: [left_x, top_y, width, height]
            - `projWin`: subwindow in projected coordinates to extract: [ulx, uly, lrx, lry]
            - `projWinSRS`: SRS in which projWin is expressed
            - `outputBounds`: assigned output bounds: [ulx, uly, lrx, lry]
            - `outputGeotransform`: assigned geotransform matrix (array of 6 values) (mutually exclusive with
              outputBounds)
            - `metadataOptions`: list or dict of metadata options
            - `outputSRS`: assigned output SRS
            - `noData`: nodata value (or "none" to unset it)
            - `rgbExpand`: Color palette expansion mode: "gray", "rgb", "rgba"
            - `xmp`: whether to copy XMP metadata
            - `resampleAlg`: resampling mode
            - `overviewLevel`: To specify which overview level of source files must be used
            - `domainMetadataOptions`: list or dict of domain-specific metadata options

    Returns:
        Dataset:
            The translated dataset, opened with `access="write"`.

    Examples:
        Scale & offset: the translate function can be used to get rid of the scale and offset that are used to
        manipulate the dataset, to get the real values of the dataset.

        - Scale: first we will create a dataset from a float32 array with values between 1 and 10, and then we
          will assign a scale of 0.1 to the dataset.

          ```python
          >>> import numpy as np
          >>> arr = np.random.randint(1, 10, size=(5, 5)).astype(np.float32)
          >>> print(arr) # doctest: +SKIP
          [[5. 5. 3. 4. 2.]
           [2. 5. 5. 8. 5.]
           [7. 5. 6. 1. 2.]
           [6. 8. 1. 5. 8.]
           [2. 5. 2. 2. 9.]]
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
          >>> print(dataset)
          <BLANKLINE>
                      Top Left Corner: (0.0, 0.0)
                      Cell size: 0.05
                      Dimension: 5 * 5
                      EPSG: 4326
                      Number of Bands: 1
                      Band names: ['Band_1']
                      Band colors: {0: 'undefined'}
                      Band units: ['']
                      Scale: [1.0]
                      Offset: [0]
                      Mask: -9999.0
                      Data type: float32
                      File: ...
          <BLANKLINE>
          >>> dataset.scale = [0.1]

          ```

        - now let's unscale the dataset values.

          ```python
          >>> unscaled_dataset = dataset.translate(unscale=True)
          >>> print(unscaled_dataset) # doctest: +SKIP
          <BLANKLINE>
                      Top Left Corner: (0.0, 0.0)
                      Cell size: 0.05
                      Dimension: 5 * 5
                      EPSG: 4326
                      Number of Bands: 1
                      Band names: ['Band_1']
                      Band colors: {0: 'undefined'}
                      Band units: ['']
                      Scale: [1.0]
                      Offset: [0]
                      Mask: -9999.0
                      Data type: float32
                      File:
          <BLANKLINE>
          >>> print(unscaled_dataset.read_array()) # doctest: +SKIP
          [[0.5 0.5 0.3 0.4 0.2]
           [0.2 0.5 0.5 0.8 0.5]
           [0.7 0.5 0.6 0.1 0.2]
           [0.6 0.8 0.1 0.5 0.8]
           [0.2 0.5 0.2 0.2 0.9]]

          ```

        - Offset: you can also unshift the values of the dataset if the dataset has an offset. To remove the
          offset from all values in the dataset, you can read the values using the `read_array` and then add
          the offset value to the array. We will create a dataset from the same array we created above (values
          are between 1 and 10) with an offset of 100.

          ```python
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
          >>> print(dataset)
          <BLANKLINE>
                      Top Left Corner: (0.0, 0.0)
                      Cell size: 0.05
                      Dimension: 5 * 5
                      EPSG: 4326
                      Number of Bands: 1
                      Band names: ['Band_1']
                      Band colors: {0: 'undefined'}
                      Band units: ['']
                      Scale: [1.0]
                      Offset: [0]
                      Mask: -9999.0
                      Data type: float32
                      File: ...
          <BLANKLINE>

          ```

        - set the offset to 100.

          ```python
          >>> dataset.offset = [100]

          ```

        - check if the offset has been set.

          ```python
          >>> print(dataset.offset)
          [100.0]

          ```

        - now let's unscale the dataset values.

          ```python
          >>> unscaled_dataset = dataset.translate(unscale=True)
          >>> print(unscaled_dataset.read_array()) # doctest: +SKIP
          [[105. 105. 103. 104. 102.]
           [102. 105. 105. 108. 105.]
           [107. 105. 106. 101. 102.]
           [106. 108. 101. 105. 108.]
           [102. 105. 102. 102. 109.]]

          ```

        - as you see, all the values have been shifted by 100. Now if you check the offset of the translated
          dataset, it is back to 0.

          ```python
          >>> print(unscaled_dataset.offset)
          [0]

          ```

        - Offset and Scale together: we can unscale and get rid of the offset at the same time.

          ```python
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)

          ```

        - set the offset to 100, and a scale of 0.1.

          ```python
          >>> dataset.offset = [100]
          >>> dataset.scale = [0.1]

          ```

        - check if the offset and the scale have been set.

          ```python
          >>> print(dataset.offset)
          [100.0]
          >>> print(dataset.scale)
          [0.1]

          ```

        - now let's unscale the dataset values.

          ```python
          >>> unscaled_dataset = dataset.translate(unscale=True)
          >>> print(unscaled_dataset.read_array()) # doctest: +SKIP
          [[100.5 100.5 100.3 100.4 100.2]
           [100.2 100.5 100.5 100.8 100.5]
           [100.7 100.5 100.6 100.1 100.2]
           [100.6 100.8 100.1 100.5 100.8]
           [100.2 100.5 100.2 100.2 100.9]]

          ```

        - Now you can see that the values were multiplied first by the scale; then the offset value was added
          (`value * scale + offset`).

          ```python
          >>> print(unscaled_dataset.offset)
          [0]
          >>> print(unscaled_dataset.scale)
          [1.0]

          ```

        - Scale between two values: you can scale the values of the dataset to a new range; here the values
          are scaled from the range 1-9 to the range 0-255.

          ```python
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
          >>> print(dataset.stats()) # doctest: +SKIP
                  min  max  mean      std
          Band_1  1.0  9.0   4.0  2.19089
          >>> scaled_dataset = dataset.translate(scaleParams=[[1, 9, 0, 255]], outputType=gdal.GDT_Byte)
          >>> print(scaled_dataset.read_array()) # doctest: +SKIP
          [[128 128  64  96  32]
           [ 32 128 128 223 128]
           [191 128 159   0  32]
           [159 223   0 128 223]
           [ 32 128  32  32 255]]

          ```
    """
    # Take `format` out of kwargs before forwarding them: the driver used to be hard-coded here, so a
    # caller-supplied `format` collided with it ("got multiple values for keyword argument 'format'")
    # and the advertised format conversion was impossible.
    requested_driver = kwargs.pop("format", None)

    if path is None:
        default_driver = "MEM"
        path = ""
    else:
        default_driver = "GTiff"

    # Keep the previous defaults whenever the caller does not choose a driver.
    driver = requested_driver if requested_driver is not None else default_driver

    options = gdal.TranslateOptions(format=driver, **kwargs)
    dst = gdal.Translate(path, self.raster, options=options)
    return Dataset(dst, access="write")

`create(cell_size, rows, columns, dtype, bands, top_left_corner, epsg, no_data_value=None, path=None)` `classmethod` #

Create a new dataset and fill it with the no_data_value.

The new dataset will have an array filled with the no_data_value.

Parameters:

Name	Type	Description	Default
`cell_size`	`int \| float`	Cell size.	required
`rows`	`int`	Number of rows.	required
`columns`	`int`	Number of columns.	required
`dtype`	`str`	Data type. One of: None, "byte", "uint16", "int16", "uint32", "int32", "float32", "float64", "complex-int16", "complex-int32", "complex-float32", "complex-float64", "uint64", "int64", "int8", "count".	required
`bands`	`int \| None`	Number of bands to create in the output raster.	required
`top_left_corner`	`Tuple`	Coordinates of the top left corner point.	required
`epsg`	`int`	EPSG number to identify the projection of the coordinates in the created raster.	required
`no_data_value`	`float \| None`	No data value.	`None`
`path`	`str`	Path on disk; if None, the dataset is created in memory. Default is None.	`None`

Returns:

Name	Type	Description
`Dataset`	`Dataset`	A new dataset

Hint

The no_data_value will be filled in the array of the output dataset.
The coordinates of the top left corner point should be in the same projection as the epsg.
The cell size should be in the same unit as the coordinates.
The number of rows and columns should be positive integers.

Examples:

To create a dataset using the create method you need to provide all the information needed to locate the dataset in space top_left_corner and epsg, then the information needed to specify the data to be stored in the dataset like dtype, rows, columns, cell_size, bands and no_data_value.

>>> cell_size = 10
>>> rows = 5
>>> columns = 5
>>> dtype = "float32"
>>> bands = 1
>>> top_left_corner = (0, 0)
>>> epsg = 32618
>>> no_data_value = -9999
>>> path = "create-new-dataset.tif"
>>> dataset = Dataset.create(cell_size, rows, columns, dtype, bands, top_left_corner, epsg, no_data_value, path)
>>> print(dataset)
<BLANKLINE>
            Cell size: 10.0
            Dimension: 5 * 5
            EPSG: 32618
            Number of Bands: 1
            Band names: ['Band_1']
            Mask: -9999.0
            Data type: float32
            File: create-new-dataset.tif
<BLANKLINE>

If you check the value stored in the band using the read_array method, you will find that the band is full of the no_data_value value which we used here as -9999.

>>> print(dataset.read_array(band=0))
[[-9999. -9999. -9999. -9999. -9999.]
 [-9999. -9999. -9999. -9999. -9999.]
 [-9999. -9999. -9999. -9999. -9999.]
 [-9999. -9999. -9999. -9999. -9999.]
 [-9999. -9999. -9999. -9999. -9999.]]

Source code in pyramids/dataset.py

@classmethod
def create(
    cls,
    cell_size: Union[int, float],
    rows: int,
    columns: int,
    dtype: str,
    bands: int,
    top_left_corner: Tuple,
    epsg: int,
    no_data_value: Any = None,
    path: str = None,
) -> "Dataset":
    """Create a new, empty dataset and set its no_data_value.

    A raster with the requested size, data type, geotransform and projection is created; when a
    `no_data_value` is given, it is set on the new dataset (the example below shows the band filled with it).

    Args:
        cell_size (int|float):
            Cell size.
        rows (int):
            Number of rows.
        columns (int):
            Number of columns.
        dtype (str):
            Data type. One of: None, "byte", "uint16", "int16", "uint32", "int32", "float32", "float64",
            "complex-int16", "complex-int32", "complex-float32", "complex-float64", "uint64", "int64", "int8",
            "count".
        bands (int|None):
            Number of bands to create in the output raster.
        top_left_corner (Tuple):
            Coordinates of the top left corner point (x first, then y).
        epsg (int):
            EPSG number to identify the projection of the coordinates in the created raster.
        no_data_value (float|None):
            No data value. If None, no no-data value is set on the new dataset. Default is None.
        path (str, optional):
            Path on disk; if None, the dataset is created in memory. Default is None.

    Returns:
        Dataset: A new dataset, opened with `access="write"`.

    Hint:
        - The no_data_value will be filled in the array of the output dataset.
        - The coordinates of the top left corner point should be in the same projection as the epsg.
        - The cell size should be in the same unit as the coordinates.
        - The number of rows and columns should be positive integers.

    Examples:
        - To create a dataset using the `create` method you need to provide all the information needed to locate
          the dataset in space (`top_left_corner` and `epsg`), then the information needed to specify the data to
          be stored in the dataset like `dtype`, `rows`, `columns`, `cell_size`, `bands` and `no_data_value`.

          ```python
          >>> cell_size = 10
          >>> rows = 5
          >>> columns = 5
          >>> dtype = "float32"
          >>> bands = 1
          >>> top_left_corner = (0, 0)
          >>> epsg = 32618
          >>> no_data_value = -9999
          >>> path = "create-new-dataset.tif"
          >>> dataset = Dataset.create(cell_size, rows, columns, dtype, bands, top_left_corner, epsg, no_data_value, path)
          >>> print(dataset)
          <BLANKLINE>
                      Cell size: 10.0
                      Dimension: 5 * 5
                      EPSG: 32618
                      Number of Bands: 1
                      Band names: ['Band_1']
                      Mask: -9999.0
                      Data type: float32
                      File: create-new-dataset.tif
          <BLANKLINE>

          ```

        - If you check the value stored in the band using the `read_array` method, you will find that the band is
          full of the `no_data_value` value which we used here as -9999.

          ```python
          >>> print(dataset.read_array(band=0))
          [[-9999. -9999. -9999. -9999. -9999.]
           [-9999. -9999. -9999. -9999. -9999.]
           [-9999. -9999. -9999. -9999. -9999.]
           [-9999. -9999. -9999. -9999. -9999.]
           [-9999. -9999. -9999. -9999. -9999.]]

          ```
    """
    # Translate the dtype name and create the underlying raster.
    gdal_dtype = numpy_to_gdal_dtype(dtype)
    raw_raster = Dataset._create_dataset(columns, rows, bands, gdal_dtype, path=path)
    spatial_ref = Dataset._create_sr_from_epsg(epsg)

    # Geotransform layout: (x origin, pixel width, 0, y origin, 0, negative pixel height).
    origin_x = top_left_corner[0]
    origin_y = top_left_corner[1]
    raw_raster.SetGeoTransform(
        (origin_x, cell_size, 0, origin_y, 0, -1 * cell_size)
    )
    raw_raster.SetProjection(spatial_ref.ExportToWkt())

    new_dataset = cls(raw_raster, access="write")
    if no_data_value is None:
        return new_dataset

    new_dataset._set_no_data_value(no_data_value=no_data_value)
    return new_dataset

`create_from_array(arr, top_left_corner=None, cell_size=None, geo=None, epsg=4326, no_data_value=DEFAULT_NO_DATA_VALUE, driver_type='MEM', path=None)` `classmethod` #

Create a new dataset from an array.

Parameters:

Name	Type	Description	Default
`arr`	`ndarray`	Numpy array.	required
`top_left_corner`	`Tuple[float, float]`	The coordinates of the top left corner of the dataset.	`None`
`cell_size`	`int \| float`	Cell size in the same units of the coordinate reference system defined by the `epsg` parameter.	`None`
`geo`	`Tuple[float, float, float, float, float, float]`	Geotransform tuple (minimum lon/x, pixel-size, rotation, maximum lat/y, rotation, pixel-size).	`None`
`epsg`	`int`	Integer reference number to the projection (https://epsg.io/).	`4326`
`no_data_value`	`Any`	No data value to mask the cells out of the domain. The default is -9999.	`DEFAULT_NO_DATA_VALUE`
`driver_type`	`str`	Driver type ["GTiff", "MEM", "netcdf"]. Default is "MEM".	`'MEM'`
`path`	`str`	Path to save the driver.	`None`

Returns:

Name	Type	Description
`Dataset`	`Dataset`	Dataset object will be returned.

Hint

The geo parameter can replace both the cell_size and the top_left_corner parameters.
The function checks first if the geo parameter is defined; it will ignore the cell_size and the top_left_corner parameters if given.

Examples:

Create dataset using the cell_size and top_left_corner parameters.

>>> import numpy as np
>>> arr = np.random.rand(4, 10, 10)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
>>> print(dataset)
<BLANKLINE>
            Cell size: 0.05
            Dimension: 10 * 10
            EPSG: 4326
            Number of Bands: 4
            Band names: ['Band_1', 'Band_2', 'Band_3', 'Band_4']
            Mask: -9999.0
            Data type: float64
            File: ...
<BLANKLINE>

Create dataset using the geo parameter.
To create the same dataset using the geotransform parameter, we will use the dataset top_left_corner coordinates and the cell_size to create it.

>>> geotransform = (0, 0.05, 0, 0, 0, -0.05)
>>> dataset = Dataset.create_from_array(arr, geo=geotransform, epsg=4326)
>>> print(dataset)
<BLANKLINE>
            Cell size: 0.05
            Dimension: 10 * 10
            EPSG: 4326
            Number of Bands: 4
            Band names: ['Band_1', 'Band_2', 'Band_3', 'Band_4']
            Mask: -9999.0
            Data type: float64
            File: ...
<BLANKLINE>

Source code in pyramids/dataset.py

@classmethod
def create_from_array(
    cls,
    arr: np.ndarray,
    top_left_corner: Tuple[float, float] = None,
    cell_size: Union[int, float] = None,
    geo: Tuple[float, float, float, float, float, float] = None,
    epsg: Union[str, int] = 4326,
    no_data_value: Union[Any, list] = DEFAULT_NO_DATA_VALUE,
    driver_type: str = "MEM",
    path: str = None,
) -> "Dataset":
    """Build a new dataset out of a numpy array.

    The spatial reference of the array is given either by a full geotransform (`geo`) or by the pair
    `top_left_corner` + `cell_size`, from which a north-up geotransform with square cells is derived.

    Args:
        arr (np.ndarray):
            Numpy array. A 2D array is stored as a single band; otherwise the first axis is taken as the bands
            axis, followed by rows and columns.
        top_left_corner (Tuple[float, float], optional):
            The coordinates of the top left corner of the dataset.
        cell_size (int|float, optional):
            Cell size in the same units of the coordinate reference system defined by the `epsg` parameter.
        geo (Tuple[float, float, float, float, float, float], optional):
            Geotransform tuple (minimum lon/x, pixel-size, rotation, maximum lat/y, rotation, pixel-size).
        epsg (int):
            Integer reference number to the projection (https://epsg.io/).
        no_data_value (Any, optional):
            No data value to mask the cells out of the domain. The default is -9999.
        driver_type (str, optional):
            Driver type ["GTiff", "MEM", "netcdf"]. Default is "MEM".
        path (str, optional):
            Path to save the driver.

    Returns:
        Dataset:
            Dataset object will be returned.

    Raises:
        ValueError:
            If `geo` is not given and either `top_left_corner` or `cell_size` is missing.

    Hint:
        - The `geo` parameter can replace both the `cell_size` and the `top_left_corner` parameters.
        - The function checks first if the `geo` parameter is defined; it will ignore the `cell_size` and the `top_left_corner` parameters if given.

    Examples:
        - Create dataset using the `cell_size` and `top_left_corner` parameters.

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(4, 10, 10)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
          >>> print(dataset)
          <BLANKLINE>
                      Cell size: 0.05
                      Dimension: 10 * 10
                      EPSG: 4326
                      Number of Bands: 4
                      Band names: ['Band_1', 'Band_2', 'Band_3', 'Band_4']
                      Mask: -9999.0
                      Data type: float64
                      File: ...
          <BLANKLINE>

          ```

        - Create dataset using the `geo` parameter.

          - To create the same dataset using the `geotransform` parameter, we will use the dataset `top_left_corner`
            coordinates and the `cell_size` to create it.

          ```python
          >>> geotransform = (0, 0.05, 0, 0, 0, -0.05)
          >>> dataset = Dataset.create_from_array(arr, geo=geotransform, epsg=4326)
          >>> print(dataset)
          <BLANKLINE>
                      Cell size: 0.05
                      Dimension: 10 * 10
                      EPSG: 4326
                      Number of Bands: 4
                      Band names: ['Band_1', 'Band_2', 'Band_3', 'Band_4']
                      Mask: -9999.0
                      Data type: float64
                      File: ...
          <BLANKLINE>

          ```
    """
    if geo is None:
        # No explicit geotransform: both pieces of the fallback description are mandatory.
        if top_left_corner is None or cell_size is None:
            raise ValueError(
                "Either top_left_corner and cell_size or geo should be provided."
            )
        # North-up grid: no rotation terms, and the y pixel size is negative.
        geo = (top_left_corner[0], cell_size, 0, top_left_corner[1], 0, -1 * cell_size)

    # For a 2D array the spatial axes start at index 0; otherwise axis 0 holds the bands.
    shape = arr.shape
    if arr.ndim == 2:
        bands, first_spatial_axis = 1, 0
    else:
        bands, first_spatial_axis = shape[0], 1
    rows = int(shape[first_spatial_axis])
    cols = int(shape[first_spatial_axis + 1])

    return cls._create_gtiff_from_array(
        arr,
        cols,
        rows,
        bands,
        geo,
        epsg,
        no_data_value,
        driver_type=driver_type,
        path=path,
    )

`dataset_like(src, array, path=None)` `classmethod` #

Create a new dataset like another dataset.

The dataset_like method creates a Dataset from an array, using another source dataset as a template. The new dataset will have the same projection, coordinates of the top left corner, cell size, no_data_value, and number of rows and columns as the original dataset. The array and the source dataset should have the same number of rows and columns.

Parameters:

Name	Type	Description	Default
`src`	`Dataset`	source raster to get the spatial information	required
`array`	`ndarray`	data to store in the new dataset.	required
`path`	`str`	path to save the new dataset, if not given, the method will return in-memory dataset.	`None`

Returns:

Name	Type	Description
`Dataset`	`Dataset`	if the `path` is given, the method will save the new raster to the given path, else the method will return an in-memory dataset.

Hint

If the given array is 3D, the bands have to be the first dimension, the rows (y/lat) have to be the second dimension, and the columns (x/lon) have to be the third dimension of the array.

Examples:

Create a source dataset and then create another dataset like it:

>>> import numpy as np
>>> arr = np.random.rand(5, 5)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

Now let's create another dataset from the previous dataset using the dataset_like:

>>> new_arr = np.random.rand(5, 5)
>>> dataset_new = Dataset.dataset_like(dataset, new_arr)
>>> print(dataset_new)
<BLANKLINE>
            Cell size: 0.05
            Dimension: 5 * 5
            EPSG: 4326
            Number of Bands: 1
            Band names: ['Band_1']
            Mask: -9999.0
            Data type: float64
            File:...
<BLANKLINE>

Source code in pyramids/dataset.py

@classmethod
def dataset_like(
    cls,
    src: "Dataset",
    array: np.ndarray,
    path: str = None,
) -> "Dataset":
    """Create a new dataset like another dataset.

    The dataset_like method creates a Dataset from an array, using another source dataset as a template.
    The new dataset takes its `projection`, geotransform (top left corner and `cell size`),
    `no_data_value` (of the first band of the source), and number of `rows` and `columns` from `src`.
    The array and the source dataset should have the same number of rows and columns.

    Args:
        src (Dataset):
            source raster to get the spatial information
        array (ndarray):
            data to store in the new dataset. A 2D array is written to a single band; any other array is
            written band by band along its first axis.
        path (str, optional):
            path to save the new dataset, if not given, the method will return in-memory dataset.

    Returns:
        Dataset:
            if the `path` is given, the method will save the new raster to the given path, else the method will
            return an in-memory dataset.

    Raises:
        TypeError:
            If `array` is not a numpy array.

    Hint:
        - If the given array is 3D, the bands have to be the first dimension, the rows (y/lat) have to be the
          second dimension, and the columns (x/lon) have to be the third dimension of the array.

    Examples:
        - Create a source dataset and then create another dataset like it:

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(5, 5)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

          ```

        - Now let's create another `dataset` from the previous dataset using the `dataset_like`:

          ```python
          >>> new_arr = np.random.rand(5, 5)
          >>> dataset_new = Dataset.dataset_like(dataset, new_arr)
          >>> print(dataset_new)
          <BLANKLINE>
                      Cell size: 0.05
                      Dimension: 5 * 5
                      EPSG: 4326
                      Number of Bands: 1
                      Band names: ['Band_1']
                      Mask: -9999.0
                      Data type: float64
                      File:...
          <BLANKLINE>

          ```

    """
    if not isinstance(array, np.ndarray):
        raise TypeError("array should be of type numpy array")

    bands = 1 if array.ndim == 2 else array.shape[0]

    dtype = numpy_to_gdal_dtype(array)

    dst = Dataset._create_dataset(src.columns, src.rows, bands, dtype, path=path)

    dst.SetGeoTransform(src.geotransform)
    dst.SetProjection(src.crs)
    # setting the NoDataValue does not accept double precision numbers
    dst_obj = cls(dst, access="write")
    dst_obj._set_no_data_value(no_data_value=src.no_data_value[0])

    # Branch on the dimensionality rather than on the band count: an array of shape (1, rows, cols)
    # has one band but still has to be sliced to 2D before it is written to that band.
    if array.ndim == 2:
        dst_obj.raster.GetRasterBand(1).WriteArray(array)
    else:
        for band_i in range(bands):
            dst_obj.raster.GetRasterBand(band_i + 1).WriteArray(array[band_i, :, :])

    if path is not None:
        # Push the data to the file on disk.
        dst_obj.raster.FlushCache()

    return dst_obj

`write_array(array, top_left_corner=None)` #

Write an array to the dataset at the given xoff, yoff position.

Parameters:

Name	Type	Description	Default
`array`	`ndarray`	The array to write	required
`top_left_corner`	`List[float, float]`	indices [row, column]/[y_offset, x_offset] of the cell to write the array to. If None, the array will be written to the top left corner of the dataset.	`None`

Raises:

Type	Description
`Exception`	If the array is not written successfully.

Hint

The Dataset has to be opened in a write mode read_only=False.

Returns: None

Examples:

First, create a dataset on disk:

>>> import numpy as np
>>> arr = np.random.rand(5, 5)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> path = 'write_array.tif'
>>> dataset = Dataset.create_from_array(
...     arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326, path=path
... )
>>> dataset = None

In a later session you can read the dataset in a write mode and update it:

>>> dataset = Dataset.read_file(path, read_only=False)
>>> arr = np.array([[1, 2], [3, 4]])
>>> dataset.write_array(arr, top_left_corner=[1, 1])
>>> dataset.read_array()    # doctest: +SKIP
array([[0.77359738, 0.64789596, 0.37912658, 0.03673771, 0.69571106],
       [0.60804387, 1.        , 2.        , 0.501909  , 0.99597122],
       [0.83879291, 3.        , 4.        , 0.33058081, 0.59824467],
       [0.774213  , 0.94338147, 0.16443719, 0.28041457, 0.61914179],
       [0.97201104, 0.81364799, 0.35157525, 0.65554998, 0.8589739 ]])

Source code in pyramids/dataset.py

def write_array(self, array: np.ndarray, top_left_corner: Optional[List[Any]] = None):
    """Write an array to the dataset at the given xoff, yoff position.

    Args:
        array (np.ndarray):
            The array to write
        top_left_corner (List[float, float]):
            indices [row, column]/[y_offset, x_offset] of the cell to write the array to. If None, the array will
            be written to the top left corner of the dataset.

    Raises:
        Exception: If the array is not written successfully.

    Hint:
        - The `Dataset` has to be opened in a write mode `read_only=False`.

    Returns:
        None

    Examples:
        - First, create a dataset on disk:

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(5, 5)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> path = 'write_array.tif'
          >>> dataset = Dataset.create_from_array(
          ...     arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326, path=path
          ... )
          >>> dataset = None

          ```

        - In a later session you can read the dataset in a `write` mode and update it:

          ```python
          >>> dataset = Dataset.read_file(path, read_only=False)
          >>> arr = np.array([[1, 2], [3, 4]])
          >>> dataset.write_array(arr, top_left_corner=[1, 1])
          >>> dataset.read_array()    # doctest: +SKIP
          array([[0.77359738, 0.64789596, 0.37912658, 0.03673771, 0.69571106],
                 [0.60804387, 1.        , 2.        , 0.501909  , 0.99597122],
                 [0.83879291, 3.        , 4.        , 0.33058081, 0.59824467],
                 [0.774213  , 0.94338147, 0.16443719, 0.28041457, 0.61914179],
                 [0.97201104, 0.81364799, 0.35157525, 0.65554998, 0.8589739 ]])

          ```
    """
    if top_left_corner is None:
        # Documented default: write starting at the first row and first column.
        yoff, xoff = 0, 0
    else:
        # The corner is given as [row, column], i.e. [y_offset, x_offset].
        yoff, xoff = top_left_corner

    # Any failure raised by the underlying raster object propagates to the caller unchanged.
    self._raster.WriteArray(array, xoff=xoff, yoff=yoff)
    self._raster.FlushCache()

`set_crs(crs=None, epsg=None)` #

Set the Coordinate Reference System (CRS).

Set the Coordinate Reference System (CRS) of the dataset.

Parameters:

Name Type Description Default

crs

str

Optional if epsg is specified. WKT string. i.e.

'GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84", 6378137,298.257223563,AUTHORITY["EPSG","7030"],
AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",
0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],
AUTHORITY["EPSG","4326"]]'

None

epsg

int

Optional if crs is specified. EPSG code specifying the projection.

None

Source code in pyramids/dataset.py

def set_crs(self, crs: Optional[str] = None, epsg: Optional[int] = None):
    """Set the Coordinate Reference System (CRS).

    Set the Coordinate Reference System (CRS) of the dataset, either from a WKT string or from an EPSG
    code. If both are given, `crs` takes precedence and `epsg` is ignored.

    Args:
        crs (str):
            Optional if epsg is specified. WKT string. i.e.
                ```
                'GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84", 6378137,298.257223563,AUTHORITY["EPSG","7030"],
                AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",
                0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],
                AUTHORITY["EPSG","4326"]]'
                ```
        epsg (int):
            Optional if crs is specified. EPSG code specifying the projection.

    Raises:
        TypeError:
            If the dataset driver type is "ascii".
        ValueError:
            If neither `crs` nor `epsg` is given.
    """
    if self.driver_type == "ascii":
        raise TypeError(
            "Setting CRS for ASCII file is not possible, you can save the files to a geotiff and then reset the crs"
        )

    if crs is None and epsg is None:
        # Fail early with a clear message instead of handing None to the spatial-reference helper.
        raise ValueError("Either crs or epsg should be provided.")

    # first change the projection of the gdal dataset object
    # second change the epsg attribute of the Dataset object
    if crs is not None:
        self.raster.SetProjection(crs)
        self._epsg = FeatureCollection.get_epsg_from_prj(crs)
    else:
        sr = Dataset._create_sr_from_epsg(epsg)
        self.raster.SetProjection(sr.ExportToWkt())
        self._epsg = epsg

`to_crs(to_epsg, method='nearest neighbor', maintain_alignment=False, inplace=False)` #

Reproject the dataset to any projection.

(default the WGS84 web mercator projection, without resampling)

Parameters:

Name	Type	Description	Default
`to_epsg`	`int`	reference number to the new projection (https://epsg.io/). For example, 3857 is the reference number of WGS84 web mercator.	required
`method`	`str`	resampling method. Default is "nearest neighbor". See https://gisgeography.com/raster-resampling/. Allowed values: "nearest neighbor", "cubic", "bilinear".	`'nearest neighbor'`
`maintain_alignment`	`bool`	True to maintain the number of rows and columns of the raster the same after reprojection. Default is False.	`False`
`inplace`	`bool`	True to make changes inplace. Default is False.	`False`

Returns:

Name	Type	Description
`Dataset`	`Union[Dataset, None]`	Dataset object, if inplace is True, the method returns None.

Examples:

Create a dataset and reproject it:

>>> import numpy as np
>>> arr = np.random.rand(4, 5, 5)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> print(dataset)
<BLANKLINE>
            Cell size: 0.05
            Dimension: 5 * 5
            EPSG: 4326
            Number of Bands: 4
            Band names: ['Band_1', 'Band_2', 'Band_3', 'Band_4']
            Mask: -9999.0
            Data type: float64
            File:...
<BLANKLINE>
>>> print(dataset.crs)
GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
>>> print(dataset.epsg)
4326
>>> reprojected_dataset = dataset.to_crs(to_epsg=3857)
>>> print(reprojected_dataset)
<BLANKLINE>
            Cell size: 5565.983370404396
            Dimension: 5 * 5
            EPSG: 3857
            Number of Bands: 4
            Band names: ['Band_1', 'Band_2', 'Band_3', 'Band_4']
            Mask: -9999.0
            Data type: float64
            File:...
<BLANKLINE>
>>> print(reprojected_dataset.crs)
PROJCS["WGS 84 / Pseudo-Mercator",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Mercator_1SP"],PARAMETER["central_meridian",0],PARAMETER["scale_factor",1],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH],EXTENSION["PROJ4","+proj=merc +a=6378137 +b=6378137 +lat_ts=0 +lon_0=0 +x_0=0 +y_0=0 +k=1 +units=m +nadgrids=@null +wktext +no_defs"],AUTHORITY["EPSG","3857"]]
>>> print(reprojected_dataset.epsg)
3857

Source code in pyramids/dataset.py

def to_crs(
    self,
    to_epsg: int,
    method: str = "nearest neighbor",
    maintain_alignment: bool = False,
    inplace: bool = False,
) -> Union["Dataset", None]:
    """Reproject the dataset to any projection.

    The requested resampling `method` is applied on both code paths (with and without
    `maintain_alignment`).

    Args:
        to_epsg (int):
            reference number to the new projection (https://epsg.io/). For example, 3857 is the reference
            number of WGS84 web mercator.
        method (str):
            resampling method. Default is "nearest neighbor". See https://gisgeography.com/raster-resampling/.
            Allowed values: "nearest neighbor", "cubic", "bilinear".
        maintain_alignment (bool):
            True to maintain the number of rows and columns of the raster the same after reprojection.
            Default is False.
        inplace (bool):
            True to make changes inplace. Default is False.

    Returns:
        Dataset:
            Dataset object, if inplace is True, the method returns None.

    Raises:
        TypeError:
            If `to_epsg` is not an integer or `method` is not a string.
        ValueError:
            If `method` is not one of the known interpolation methods.

    Examples:
        - Create a dataset and reproject it:

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(4, 5, 5)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          >>> print(dataset)
          <BLANKLINE>
                      Cell size: 0.05
                      Dimension: 5 * 5
                      EPSG: 4326
                      Number of Bands: 4
                      Band names: ['Band_1', 'Band_2', 'Band_3', 'Band_4']
                      Mask: -9999.0
                      Data type: float64
                      File:...
          <BLANKLINE>
          >>> print(dataset.crs)
          GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]
          >>> print(dataset.epsg)
          4326
          >>> reprojected_dataset = dataset.to_crs(to_epsg=3857)
          >>> print(reprojected_dataset)
          <BLANKLINE>
                      Cell size: 5565.983370404396
                      Dimension: 5 * 5
                      EPSG: 3857
                      Number of Bands: 4
                      Band names: ['Band_1', 'Band_2', 'Band_3', 'Band_4']
                      Mask: -9999.0
                      Data type: float64
                      File:...
          <BLANKLINE>
          >>> print(reprojected_dataset.crs)
          PROJCS["WGS 84 / Pseudo-Mercator",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Mercator_1SP"],PARAMETER["central_meridian",0],PARAMETER["scale_factor",1],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH],EXTENSION["PROJ4","+proj=merc +a=6378137 +b=6378137 +lat_ts=0 +lon_0=0 +x_0=0 +y_0=0 +k=1 +units=m +nadgrids=@null +wktext +no_defs"],AUTHORITY["EPSG","3857"]]
          >>> print(reprojected_dataset.epsg)
          3857

          ```

    """
    if not isinstance(to_epsg, int):
        raise TypeError(
            "please enter correct integer number for to_epsg more information "
            f"https://epsg.io/, given {type(to_epsg)}"
        )
    if not isinstance(method, str):
        raise TypeError(
            "Please enter a correct method, for more information, see documentation "
        )
    if method not in INTERPOLATION_METHODS.keys():
        raise ValueError(
            f"The given interpolation method: {method} does not exist, existing methods are {INTERPOLATION_METHODS.keys()}"
        )

    # Translate the user-facing method name to the value stored in INTERPOLATION_METHODS.
    method = INTERPOLATION_METHODS.get(method)

    if maintain_alignment:
        dst_obj = self._reproject_with_ReprojectImage(to_epsg, method)
    else:
        # Forward the resampling method so it is honored on this path as well.
        # NOTE(review): assumes the INTERPOLATION_METHODS values are GDAL resampling identifiers that
        # gdal.Warp accepts as `resampleAlg` (the same value is handed to the ReprojectImage path) - confirm.
        dst = gdal.Warp(
            "",
            self.raster,
            dstSRS=f"EPSG:{to_epsg}",
            format="VRT",
            resampleAlg=method,
        )
        dst_obj = Dataset(dst)

    if inplace:
        # Re-initialize this object around the reprojected raster; nothing is returned.
        self.__init__(dst_obj.raster)
    else:
        return dst_obj

`count_domain_cells(band=0)` #

Count cells inside the domain.

Parameters:

Name	Type	Description	Default
`band`	`int`	Band index. Default is 0.	`0`

Returns:

Name	Type	Description
`int`	`int`	Number of cells.

Source code in pyramids/dataset.py

def count_domain_cells(self, band: int = 0) -> int:
    """Count cells inside the domain.

    A cell is inside the domain when its value is not (approximately) equal to the no data value of the
    given band. If the band has no no data value, every cell is counted.

    Args:
        band (int):
            Band index. Default is 0.

    Returns:
        int:
            Number of cells.
    """
    arr = self.read_array(band=band)
    no_data = self.no_data_value[band]
    total = int(np.size(arr))
    if no_data is None:
        # Nothing marks a cell as outside the domain.
        return total

    # Count the True entries of the boolean mask itself. Counting the non-zero *values* selected by the
    # mask misses every no-data cell when the no data value is 0.
    # equal_nan=True lets a NaN no data value match NaN cells.
    outside = np.isclose(arr, no_data, rtol=0.001, equal_nan=True)
    return total - int(np.count_nonzero(outside))

`change_no_data_value(new_value, old_value=None)` #

Change No Data Value.

- Set the no data value in all raster bands.
- Fill the whole raster with the no_data_value.
- Change the no_data_value in the array in all bands.

Parameters:

Name	Type	Description	Default
`new_value`	`numeric`	No data value to set in the raster bands.	required
`old_value`	`numeric`	Old no data value that is already in the raster bands.	`None`

Warning

The change_no_data_value method creates a new dataset in memory in order to change the no_data_value in the raster bands.

Examples:

Create a Dataset (1 band, 3 rows, 3 columns) at lon/lat (0, 0):

>>> dataset = Dataset.create(
...     cell_size=0.05, rows=3, columns=3, bands=1, top_left_corner=(0, 0),dtype="float32",
...     epsg=4326, no_data_value=-9
... )
>>> arr = dataset.read_array()
>>> print(arr)
[[-9. -9. -9.]
 [-9. -9. -9.]
 [-9. -9. -9.]]
>>> print(dataset.no_data_value) # doctest: +SKIP
[-9.0]

The dataset is full of the no_data_value. Now change it using change_no_data_value:

>>> new_dataset = dataset.change_no_data_value(-10, -9)
>>> arr = new_dataset.read_array()
>>> print(arr)
[[-10. -10. -10.]
 [-10. -10. -10.]
 [-10. -10. -10.]]
>>> print(new_dataset.no_data_value) # doctest: +SKIP
[-10.0]

Source code in pyramids/dataset.py

def change_no_data_value(self, new_value: Any, old_value: Any = None):
    """Change No Data Value.

        - Set the no data value in all raster bands.
        - Replace the old no data value in the array of every band with the new one.

    Args:
        new_value (numeric):
            No data value to set in the raster bands. A single value is applied to all bands; a list gives
            one value per band.
        old_value (numeric):
            Old no data value that is already in the raster bands. A single value is applied to all bands;
            a list gives one value per band. If None, the NaN cells are replaced instead.

    Returns:
        Dataset:
            A new in-memory dataset with the updated no data value.

    Raises:
        NoDataValueError:
            If assigning the new value to a band array fails with a TypeError.

    Warning:
        The `change_no_data_value` method creates a new dataset in memory in order to change the `no_data_value` in the raster bands.

    Examples:
        - Create a Dataset (1 band, 3 rows, 3 columns) at lon/lat (0, 0):

          ```python
          >>> dataset = Dataset.create(
          ...     cell_size=0.05, rows=3, columns=3, bands=1, top_left_corner=(0, 0),dtype="float32",
          ...     epsg=4326, no_data_value=-9
          ... )
          >>> arr = dataset.read_array()
          >>> print(arr)
          [[-9. -9. -9.]
           [-9. -9. -9.]
           [-9. -9. -9.]]
          >>> print(dataset.no_data_value) # doctest: +SKIP
          [-9.0]

          ```

        - The dataset is full of the no_data_value. Now change it using `change_no_data_value`:

          ```python
          >>> new_dataset = dataset.change_no_data_value(-10, -9)
          >>> arr = new_dataset.read_array()
          >>> print(arr)
          [[-10. -10. -10.]
           [-10. -10. -10.]
           [-10. -10. -10.]]
          >>> print(new_dataset.no_data_value) # doctest: +SKIP
          [-10.0]

          ```
    """
    # Normalize scalar inputs to one value per band.
    if not isinstance(new_value, list):
        new_value = [new_value] * self.band_count

    if old_value is not None and not isinstance(old_value, list):
        old_value = [old_value] * self.band_count

    dst = gdal.GetDriverByName("MEM").CreateCopy("", self.raster, 0)
    # create a new dataset
    new_dataset = Dataset(dst, "write")
    # the new_value could change inside the _set_no_data_value method before it is used to set the no_data_value
    # attribute in the gdal object/pyramids object and to fill the band.
    new_dataset._set_no_data_value(new_value)
    # now we have to use the no_data_value value in the no_data_value attribute in the Dataset object as it is
    # updated.
    new_value = new_dataset.no_data_value
    for band in range(self.band_count):
        arr = self.read_array(band)
        try:
            if old_value is not None:
                # Compare against this band's own old value; comparing against the whole per-band list
                # would broadcast it across the columns of the band array.
                arr[np.isclose(arr, old_value[band], rtol=0.001)] = new_value[band]
            else:
                arr[np.isnan(arr)] = new_value[band]
        except TypeError as error:
            raise NoDataValueError(
                f"The dtype of the given no_data_value: {new_value[band]} differs from the dtype of the "
                f"band: {gdal_to_numpy_dtype(self.gdal_dtype[band])}"
            ) from error
        new_dataset.raster.GetRasterBand(band + 1).WriteArray(arr)
    return new_dataset

`get_cell_coords(location='center', mask=False)` #

Get coordinates for the center/corner of cells inside the dataset domain.

Returns the coordinates of the cell centers inside the domain (only the cells that do not have nodata value)

Parameters:

Name	Type	Description	Default
`location`	`str`	Location of the coordinates. Use `center` for the center of a cell, `corner` for the corner of the cell (top-left corner).	`'center'`
`mask`	`bool`	True to exclude the cells out of the domain. Default is False.	`False`

Returns:

Type	Description
`ndarray`	np.ndarray: Array with a list of the coordinates to be interpolated, without the NaN.
`ndarray`	np.ndarray: Array with all the centers of cells in the domain of the DEM.

Examples:

Create a Dataset consisting of 1 band, 3 rows, and 3 columns, at the point lon/lat (0, 0).

>>> import numpy as np
>>> arr = np.random.randint(1,3, size=(3, 3))
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

Get the coordinates of the center of cells inside the domain.

>>> coords = dataset.get_cell_coords()
>>> print(coords)
[[ 0.025 -0.025]
 [ 0.075 -0.025]
 [ 0.125 -0.025]
 [ 0.025 -0.075]
 [ 0.075 -0.075]
 [ 0.125 -0.075]
 [ 0.025 -0.125]
 [ 0.075 -0.125]
 [ 0.125 -0.125]]

Get the coordinates of the top left corner of cells inside the domain.

>>> coords = dataset.get_cell_coords(location="corner")
>>> print(coords)
[[ 0.    0.  ]
 [ 0.05  0.  ]
 [ 0.1   0.  ]
 [ 0.   -0.05]
 [ 0.05 -0.05]
 [ 0.1  -0.05]
 [ 0.   -0.1 ]
 [ 0.05 -0.1 ]
 [ 0.1  -0.1 ]]

Source code in pyramids/dataset.py

def get_cell_coords(
    self, location: str = "center", mask: bool = False
) -> np.ndarray:
    """Get coordinates for the center/corner of cells inside the dataset domain.

    Returns the coordinates of the cells of the first band. With `mask=True` only the cells that do not
    have the no data value are returned.

    Args:
        location (str):
            Location of the coordinates. Use `center` for the center of a cell, `corner` for the corner of the
            cell (top-left corner).
        mask (bool):
            True to exclude the cells out of the domain. Default is False.

    Returns:
        np.ndarray:
            Array of (x, y) coordinate pairs, one row per cell.

    Raises:
        ValueError:
            If `location` is neither "center" nor "corner".

    Examples:
        - Create a `Dataset` consisting of 1 band, 3 rows, and 3 columns, at the point lon/lat (0, 0).

          ```python
          >>> import numpy as np
          >>> arr = np.random.randint(1,3, size=(3, 3))
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

          ```

        - Get the coordinates of the center of cells inside the domain.

          ```python
          >>> coords = dataset.get_cell_coords()
          >>> print(coords)
          [[ 0.025 -0.025]
           [ 0.075 -0.025]
           [ 0.125 -0.025]
           [ 0.025 -0.075]
           [ 0.075 -0.075]
           [ 0.125 -0.075]
           [ 0.025 -0.125]
           [ 0.075 -0.125]
           [ 0.125 -0.125]]

          ```

        - Get the coordinates of the top left corner of cells inside the domain.

          ```python
          >>> coords = dataset.get_cell_coords(location="corner")
          >>> print(coords)
          [[ 0.    0.  ]
           [ 0.05  0.  ]
           [ 0.1   0.  ]
           [ 0.   -0.05]
           [ 0.05 -0.05]
           [ 0.1  -0.05]
           [ 0.   -0.1 ]
           [ 0.05 -0.1 ]
           [ 0.1  -0.1 ]]

          ```
    """
    # check the location parameter
    location = location.lower()
    if location not in ["center", "corner"]:
        raise ValueError(
            "The location parameter can have one of these values: 'center', 'corner', "
            f"but the value: {location} is given."
        )

    # Adding 0.5*cell size to get the center; corners need no shift.
    add_value = 0.5 if location == "center" else 0

    # Getting data for the whole grid
    (
        x_init,
        cell_size_x,
        xy_span,
        y_init,
        yy_span,
        cell_size_y,
    ) = self.geotransform
    # Only the magnitudes are compared: the two cell sizes may legitimately differ in sign.
    if np.abs(cell_size_x) != np.abs(cell_size_y):
        self.logger.warning(
            f"The given raster does not have a square cells, the cell size is {cell_size_x}*{cell_size_y} "
        )

    # data in the array
    no_val = self.no_data_value[0] if self.no_data_value[0] is not None else np.nan
    arr = self.read_array(band=0)

    if mask:
        # NaN never compares equal to itself, so a membership test cannot find it.
        if isinstance(no_val, (float, np.floating)) and np.isnan(no_val):
            has_no_data = bool(np.isnan(arr).any())
        else:
            has_no_data = no_val in arr
        # Warn only when masking was actually requested but cannot have any effect.
        if not has_no_data:
            self.logger.warning(
                "The no data value does not exist in the band, so all the cells will be considered, and the "
                "mask will not be considered."
            )
        mask_values = [no_val]
    else:
        mask_values = None
    indices = get_indices2(arr, mask=mask_values)

    # Each index is (row, column): the column drives x and the row drives y.
    coords = np.array(
        [
            (
                x_init + cell_size_x * (index[1] + add_value),
                y_init + cell_size_y * (index[0] + add_value),
            )
            for index in indices
        ]
    )

    return coords

`get_cell_polygons(mask=False)` #

Get a polygon shapely geometry for the raster cells.

Parameters:

Name	Type	Description	Default
`mask`	`bool`	True to get the polygons of the cells inside the domain.	`False`

Returns:

Name	Type	Description
`GeoDataFrame`	`GeoDataFrame`	With two columns, geometry, and id.

Examples:

Create a Dataset consisting of 1 band, 3 rows, and 3 columns, at the point lon/lat (0, 0).

>>> import numpy as np
>>> arr = np.random.randint(1,3, size=(3, 3))
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

Get the polygons of the cells in the dataset.

>>> gdf = dataset.get_cell_polygons()
>>> print(gdf)
                                       geometry  id
0  POLYGON ((0 0, 0.05 0, 0.05 -0.05, 0 -0.05, 0 0))   0
1  POLYGON ((0.05 0, 0.1 0, 0.1 -0.05, 0.05 -0.05...   1
2  POLYGON ((0.1 0, 0.15 0, 0.15 -0.05, 0.1 -0.05...   2
3  POLYGON ((0 -0.05, 0.05 -0.05, 0.05 -0.1, 0 -0...   3
4  POLYGON ((0.05 -0.05, 0.1 -0.05, 0.1 -0.1, 0.0...   4
5  POLYGON ((0.1 -0.05, 0.15 -0.05, 0.15 -0.1, 0....   5
6  POLYGON ((0 -0.1, 0.05 -0.1, 0.05 -0.15, 0 -0....   6
7  POLYGON ((0.05 -0.1, 0.1 -0.1, 0.1 -0.15, 0.05...   7
8  POLYGON ((0.1 -0.1, 0.15 -0.1, 0.15 -0.15, 0.1...   8
>>> fig, ax = dataset.plot()
>>> gdf.plot(ax=ax, facecolor='none', edgecolor="gray", linewidth=2)
<Axes: >

get_cell_polygons

Source code in pyramids/dataset.py

def get_cell_polygons(self, mask: bool = False) -> GeoDataFrame:
    """Get a polygon shapely geometry for the raster cells.

    Each polygon is traced from the top-left corner of a cell through its top-right, bottom-right and
    bottom-left corners. The horizontal extent of a cell is `geotransform[1]` and the vertical extent is
    `geotransform[5]`, so rasters whose cells are not square are outlined with their own cell height.

    Args:
        mask (bool):
            True to get the polygons of the cells inside the domain. The flag is forwarded to
            `get_cell_coords`. Default is False.

    Returns:
        GeoDataFrame:
            With two columns, geometry, and id.

    Examples:
        - Create `Dataset` consists of 1 band, 3 rows, 3 columns, at the point lon/lat (0, 0).

          ```python
          >>> import numpy as np
          >>> arr = np.random.randint(1,3, size=(3, 3))
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

          ```

        - Get the polygons of the cells.

          ```python
          >>> gdf = dataset.get_cell_polygons()
          >>> print(gdf)
                                                 geometry  id
          0  POLYGON ((0 0, 0.05 0, 0.05 -0.05, 0 -0.05, 0 0))   0
          1  POLYGON ((0.05 0, 0.1 0, 0.1 -0.05, 0.05 -0.05...   1
          2  POLYGON ((0.1 0, 0.15 0, 0.15 -0.05, 0.1 -0.05...   2
          3  POLYGON ((0 -0.05, 0.05 -0.05, 0.05 -0.1, 0 -0...   3
          4  POLYGON ((0.05 -0.05, 0.1 -0.05, 0.1 -0.1, 0.0...   4
          5  POLYGON ((0.1 -0.05, 0.15 -0.05, 0.15 -0.1, 0....   5
          6  POLYGON ((0 -0.1, 0.05 -0.1, 0.05 -0.15, 0 -0....   6
          7  POLYGON ((0.05 -0.1, 0.1 -0.1, 0.1 -0.15, 0.05...   7
          8  POLYGON ((0.1 -0.1, 0.15 -0.1, 0.15 -0.15, 0.1...   8
          >>> fig, ax = dataset.plot()
          >>> gdf.plot(ax=ax, facecolor='none', edgecolor="gray", linewidth=2)
          <Axes: >

          ```

    ![get_cell_polygons](./../_images/dataset/get_cell_polygons.png)
    """
    coords = self.get_cell_coords(location="corner", mask=mask)
    geotransform = self.geotransform
    cell_width = geotransform[1]
    # GDAL keeps the row step in geotransform[5] as a signed value (negative for north-up rasters),
    # so adding it to the top edge gives the opposite edge without assuming square cells.
    cell_height = geotransform[5]
    epsg = self._get_epsg()

    # corner coordinates as float arrays, one entry per cell
    left = np.asarray(coords[:, 0], dtype=float)
    top = np.asarray(coords[:, 1], dtype=float)
    right = left + cell_width
    bottom = top + cell_height

    # vertex order: top left, top right, bottom right, bottom left
    polys_coords = [
        ((x0, y0), (x1, y0), (x1, y1), (x0, y1))
        for x0, y0, x1, y1 in zip(left, top, right, bottom)
    ]
    polygons = list(map(FeatureCollection.create_polygon, polys_coords))
    gdf = gpd.GeoDataFrame(geometry=polygons)
    gdf.set_crs(epsg=epsg, inplace=True)
    gdf["id"] = gdf.index
    return gdf

`get_cell_points(location='center', mask=False)` #

Get a point shapely geometry for the raster cells center point.

Parameters:

Name	Type	Description	Default
`location`	`str`	Location of the point, ["corner", "center"]. Default is "center".	`'center'`
`mask`	`bool`	True to get the points of the cells inside the domain.	`False`

Returns:

Name	Type	Description
`GeoDataFrame`	`GeoDataFrame`	With two columns, geometry, and id.

Examples:

Create Dataset consists of 1 band, 3 rows, 3 columns, at the point lon/lat (0, 0).

>>> import numpy as np
>>> arr = np.random.randint(1,3, size=(3, 3))
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

Get the coordinates of the center of cells inside the domain.

>>> gdf = dataset.get_cell_points()
>>> print(gdf)
               geometry  id
0  POINT (0.025 -0.025)   0
1  POINT (0.075 -0.025)   1
2  POINT (0.125 -0.025)   2
3  POINT (0.025 -0.075)   3
4  POINT (0.075 -0.075)   4
5  POINT (0.125 -0.075)   5
6  POINT (0.025 -0.125)   6
7  POINT (0.075 -0.125)   7
8  POINT (0.125 -0.125)   8
>>> fig, ax = dataset.plot()
>>> gdf.plot(ax=ax, facecolor='black', linewidth=2)
<Axes: >

get_cell_points

Get the coordinates of the top left corner of cells inside the domain.

>>> gdf = dataset.get_cell_points(location="corner")
>>> print(gdf)
            geometry  id
0         POINT (0 0)   0
1      POINT (0.05 0)   1
2       POINT (0.1 0)   2
3     POINT (0 -0.05)   3
4  POINT (0.05 -0.05)   4
5   POINT (0.1 -0.05)   5
6      POINT (0 -0.1)   6
7   POINT (0.05 -0.1)   7
8    POINT (0.1 -0.1)   8
>>> fig, ax = dataset.plot()
>>> gdf.plot(ax=ax, facecolor='black', linewidth=4)
<Axes: >

get_cell_points-corner

Source code in pyramids/dataset.py

def get_cell_points(self, location: str = "center", mask=False) -> GeoDataFrame:
    """Create a shapely point for every raster cell.

    The point coordinates are obtained from `get_cell_coords` and the resulting frame is tagged with
    the EPSG code returned by `_get_epsg`.

    Args:
        location (str):
            Location of the point, ["corner", "center"]. Default is "center".
        mask (bool):
            True to get the points of the cells inside the domain. Default is False.

    Returns:
        GeoDataFrame:
            With two columns, geometry, and id.

    Examples:
        - Create `Dataset` consists of 1 band, 3 rows, 3 columns, at the point lon/lat (0, 0).

          ```python
          >>> import numpy as np
          >>> arr = np.random.randint(1,3, size=(3, 3))
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

          ```

        - Get the coordinates of the center of cells inside the domain.

          ```python
          >>> gdf = dataset.get_cell_points()
          >>> print(gdf)
                         geometry  id
          0  POINT (0.025 -0.025)   0
          1  POINT (0.075 -0.025)   1
          2  POINT (0.125 -0.025)   2
          3  POINT (0.025 -0.075)   3
          4  POINT (0.075 -0.075)   4
          5  POINT (0.125 -0.075)   5
          6  POINT (0.025 -0.125)   6
          7  POINT (0.075 -0.125)   7
          8  POINT (0.125 -0.125)   8
          >>> fig, ax = dataset.plot()
          >>> gdf.plot(ax=ax, facecolor='black', linewidth=2)
          <Axes: >

          ```

        ![get_cell_points](./../_images/dataset/get_cell_points.png)

        - Get the coordinates of the top left corner of cells inside the domain.

          ```python
          >>> gdf = dataset.get_cell_points(location="corner")
          >>> print(gdf)
                      geometry  id
          0         POINT (0 0)   0
          1      POINT (0.05 0)   1
          2       POINT (0.1 0)   2
          3     POINT (0 -0.05)   3
          4  POINT (0.05 -0.05)   4
          5   POINT (0.1 -0.05)   5
          6      POINT (0 -0.1)   6
          7   POINT (0.05 -0.1)   7
          8    POINT (0.1 -0.1)   8
          >>> fig, ax = dataset.plot()
          >>> gdf.plot(ax=ax, facecolor='black', linewidth=4)
          <Axes: >

          ```

        ![get_cell_points-corner](./../_images/dataset/get_cell_points-corner.png)
    """
    cell_xy = self.get_cell_coords(location=location, mask=mask)
    crs_code = self._get_epsg()

    # pair up the x column with the y column, one (x, y) tuple per cell
    xy_pairs = [(px, py) for px, py in zip(cell_xy[:, 0], cell_xy[:, 1])]
    frame = gpd.GeoDataFrame(geometry=FeatureCollection.create_point(xy_pairs))
    frame.set_crs(epsg=crs_code, inplace=True)
    # the id column mirrors the positional index of the frame
    frame["id"] = frame.index
    return frame

`to_file(path, band=0, tile_length=None, creation_options=None)` #

Save dataset to tiff file.

`to_file` saves a raster to disk; the driver type (geotiff/netcdf/ascii) is inferred from the
extension at the end of the given path.

Parameters:

Name	Type	Description	Default
`path`	`str`	A path including the name of the dataset.	required
`band`	`int`	Band index, needed only in case of ascii drivers. Default is 0.	`0`
`tile_length`	`int`	Length of the tiles in the driver. Default is None, in which case no tiling options are passed.	`None`
`creation_options`	`Optional[List[str]]`	List[str], Default is None List of strings that will be passed to the GDAL driver during the creation of the dataset. i.e., ['PREDICTOR=2']	`None`

Examples:

Create a Dataset with 4 bands, 5 rows, 5 columns, at the point lon/lat (0, 0):

>>> import numpy as np
>>> arr = np.random.rand(4, 5, 5)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> print(dataset.file_name)
<BLANKLINE>

Now save the dataset as a geotiff file:

>>> dataset.to_file("my-dataset.tif")
>>> print(dataset.file_name)
my-dataset.tif

Source code in pyramids/dataset.py

def to_file(
    self,
    path: str,
    band: int = 0,
    tile_length: Optional[int] = None,
    creation_options: Optional[List[str]] = None,
) -> None:
    """Save dataset to a file on disk.

    The driver (geotiff/netcdf/ascii) is looked up from the text after the last "." in `path`.
    For the ascii driver a single band is written through `_io.to_ascii`; for every other driver the
    dataset is copied with GDAL `CreateCopy` and this object is re-initialised from the copy with
    "write" access.

    Args:
        path (str):
            A path including the name of the dataset.
        band (int):
            Band index, needed only in case of ascii drivers. Default is 0.
        tile_length (int, optional):
            Length of the tiles in the driver. Default is None, in which case no tiling options are
            added. When given, `TILED=YES` and `TILE_LENGTH=<tile_length>` are appended to the options.
        creation_options: List[str], Default is None
            List of strings that will be passed to the GDAL driver during the creation of the dataset.
            i.e., ['PREDICTOR=2']

    Raises:
        TypeError:
            If `path` is not a string.
        FailedToSaveError:
            If GDAL raises a RuntimeError and no file exists at `path` afterwards.

    Examples:
        - Create a Dataset with 4 bands, 5 rows, 5 columns, at the point lon/lat (0, 0):

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(4, 5, 5)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          >>> print(dataset.file_name)
          <BLANKLINE>

          ```

        - Now save the dataset as a geotiff file:

          ```python
          >>> dataset.to_file("my-dataset.tif")
          >>> print(dataset.file_name)
          my-dataset.tif

          ```
    """
    if not isinstance(path, str):
        raise TypeError("path input should be string type")

    suffix = path.split(".")[-1]
    driver = CATALOG.get_driver_name_by_extension(suffix)
    driver_name = CATALOG.get_gdal_name(driver)

    if driver == "ascii":
        # the ascii writer takes one band plus the lower-left corner of the bounding box
        band_values = self.read_array(band=band)
        band_no_data = self.no_data_value[band]
        xmin, ymin, _, _ = self.bbox
        _io.to_ascii(band_values, self.cell_size, xmin, ymin, band_no_data, path)
        return

    # saving rasters with color table fails with a runtime error
    options = ["COMPRESS=DEFLATE"]
    if tile_length is not None:
        # NOTE(review): TILE_LENGTH does not appear among the documented GTiff creation options
        # (BLOCKXSIZE/BLOCKYSIZE are) - confirm the driver actually honours it.
        options.extend(["TILED=YES", f"TILE_LENGTH={tile_length}"])
    if self._block_size is not None and self._block_size != []:
        # only the block size of the first band is forwarded
        block_x, block_y = self._block_size[0][0], self._block_size[0][1]
        options.extend([f"BLOCKXSIZE={block_x}", f"BLOCKYSIZE={block_y}"])
    if creation_options is not None:
        options.extend(creation_options)

    try:
        gdal_driver = gdal.GetDriverByName(driver_name)
        dst = gdal_driver.CreateCopy(path, self.raster, 0, options=options)
        self.__init__(dst, "write")
        # flush the data to the dataset on disk.
        dst.FlushCache()
    except RuntimeError:
        # a RuntimeError is tolerated as long as the output file was actually produced
        file_written = os.path.exists(path)
        if not file_written:
            raise FailedToSaveError(
                f"Failed to save the {driver_name} raster to the path: {path}"
            )

`convert_longitude(inplace=False)` #

Convert Longitude.

Convert the longitude from 0-360 to -180 - 180.
Currently the function works correctly only if the raster covers the whole world, i.e. the columns in the raster cover longitudes from 0 to 360.

Parameters:

Name	Type	Description	Default
`inplace`	`bool`	True to make the changes in place.	`False`

Returns:

Name	Type	Description
`Dataset`	`Optional[Dataset]`	The converted dataset if inplace is False; otherwise None.

Source code in pyramids/dataset.py

def convert_longitude(self, inplace: bool = False) -> Optional["Dataset"]:
    """Convert Longitude.

    - convert the longitude from 0-360 to -180 - 180.
    - currently the function works correctly if the raster covers the whole world, it means that the columns
        in the raster cover longitudes from 0 to 360.
    - the columns whose longitude is greater than 180 are moved in front of the remaining columns in every
        band, and the x origin of the geotransform is shifted by -180.

    Args:
        inplace (bool):
            True to make the changes in place.

    Returns:
        Dataset:
            The converted dataset if inplace is False; otherwise None.

    Raises:
        ValueError:
            If the last longitude is not greater than 180. The check runs before any copy of the raster
            is made.
    """
    # dst = gdal.Warp(
    #     "",
    #     self.raster,
    #     dstSRS="+proj=longlat +ellps=WGS84 +datum=WGS84 +lon_0=0 +over",
    #     format="VRT",
    # )
    lon = self.lon
    # validate before allocating the in-memory copy, so invalid input costs nothing
    if lon[-1] <= 180:
        raise ValueError("The raster should cover the whole globe")

    # index of the first column that has to move to the western side
    first_to_translate = np.where(lon > 180)[0][0]
    # new column order: the columns with lon > 180 first, then the remaining columns
    column_order = list(range(first_to_translate, len(lon))) + list(
        range(first_to_translate)
    )

    # create a copy
    src = self.raster
    dst = gdal.GetDriverByName("MEM").CreateCopy("", src, 0)

    for band in range(self.band_count):
        arr = self.read_array(band=band)
        dst.GetRasterBand(band + 1).WriteArray(arr[:, column_order])

    # correct the geotransform: shift the x origin by -180
    gt = list(self.geotransform)
    gt[0] = self.top_left_corner[0] - 180
    dst.SetGeoTransform(gt)

    if inplace:
        self.__init__(dst)
        return None
    return Dataset(dst)

`to_feature_collection(vector_mask=None, add_geometry=None, tile=False, tile_size=256, touch=True)` #

Convert a dataset to a vector.

The function does the following

Flatten the array in each band in the raster then mask the values if a vector_mask file is given otherwise it will flatten all values.
Put the values for each band in a column in a dataframe under the name of the raster band, but if no meta-data in the raster band exists, an index number will be used [1, 2, 3, ...]
The function has an add_geometry parameter with two possible values ["point", "polygon"], which you can specify the type of shapely geometry you want to create from each cell,
- If point is chosen, the created point will be at the center of each cell
- If a polygon is chosen, a square polygon will be created that covers the entire cell.

Parameters:

Name	Type	Description	Default
`vector_mask`	`GeoDataFrame`	GeoDataFrame for the vector_mask. If given, it will be used to clip the raster.	`None`
`add_geometry`	`str`	"Polygon" or "Point" if you want to add a polygon geometry of the cells as column in dataframe. Default is None.	`None`
`tile`	`bool`	True to use tiles in extracting the values from the raster. Default is False.	`False`
`tile_size`	`int`	Tile size. Default is 256.	`256`
`touch`	`bool`	Include the cells that touch the polygon not only those that lie entirely inside the polygon mask. Default is True.	`True`

Returns:

Type	Description
`Union[DataFrame, GeoDataFrame]`	DataFrame \| GeoDataFrame: The resulting frame will have the band value under the name of the band (if the raster file has metadata; if not, the bands will be indexed from 1 to the number of bands).

Examples:

Create a dataset from array with 2 bands and 3*3 array each:

>>> import numpy as np
>>> arr = np.random.rand(2, 3, 3)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> print(dataset.read_array(band=0)) # doctest: +SKIP
[[0.88625832 0.81804328 0.99372706]
 [0.85333054 0.35448201 0.78079262]
 [0.43887136 0.68166208 0.53170966]]
>>> print(dataset.read_array(band=1)) # doctest: +SKIP
[[0.07051872 0.67650833 0.17625027]
 [0.41258071 0.38327938 0.18783139]
 [0.83741314 0.70446373 0.64913575]]

Convert the dataset to dataframe by calling the to_feature_collection method:

>>> df = dataset.to_feature_collection()
>>> print(df) # doctest: +SKIP
     Band_1    Band_2
0  0.886258  0.070519
1  0.818043  0.676508
2  0.993727  0.176250
3  0.853331  0.412581
4  0.354482  0.383279
5  0.780793  0.187831
6  0.438871  0.837413
7  0.681662  0.704464
8  0.531710  0.649136

Convert the dataset into geodataframe with either a polygon or a point geometry that represents each cell. To specify the geometry type use the parameter add_geometry:

>>> gdf = dataset.to_feature_collection(add_geometry="point")
>>> print(gdf) # doctest: +SKIP
     Band_1    Band_2                  geometry
0  0.886258  0.070519  POINT (0.02500 -0.02500)
1  0.818043  0.676508  POINT (0.07500 -0.02500)
2  0.993727  0.176250  POINT (0.12500 -0.02500)
3  0.853331  0.412581  POINT (0.02500 -0.07500)
4  0.354482  0.383279  POINT (0.07500 -0.07500)
5  0.780793  0.187831  POINT (0.12500 -0.07500)
6  0.438871  0.837413  POINT (0.02500 -0.12500)
7  0.681662  0.704464  POINT (0.07500 -0.12500)
8  0.531710  0.649136  POINT (0.12500 -0.12500)
>>> gdf = dataset.to_feature_collection(add_geometry="polygon")
>>> print(gdf) # doctest: +SKIP
     Band_1    Band_2                                           geometry
0  0.886258  0.070519  POLYGON ((0.00000 0.00000, 0.05000 0.00000, 0....
1  0.818043  0.676508  POLYGON ((0.05000 0.00000, 0.10000 0.00000, 0....
2  0.993727  0.176250  POLYGON ((0.10000 0.00000, 0.15000 0.00000, 0....
3  0.853331  0.412581  POLYGON ((0.00000 -0.05000, 0.05000 -0.05000, ...
4  0.354482  0.383279  POLYGON ((0.05000 -0.05000, 0.10000 -0.05000, ...
5  0.780793  0.187831  POLYGON ((0.10000 -0.05000, 0.15000 -0.05000, ...
6  0.438871  0.837413  POLYGON ((0.00000 -0.10000, 0.05000 -0.10000, ...
7  0.681662  0.704464  POLYGON ((0.05000 -0.10000, 0.10000 -0.10000, ...
8  0.531710  0.649136  POLYGON ((0.10000 -0.10000, 0.15000 -0.10000, ...

Use a mask to crop part of the dataset, and then convert the cropped part to a dataframe/geodataframe:

Create a mask that covers only the cell in the middle of the dataset.

>>> import geopandas as gpd
>>> from shapely.geometry import Polygon
>>> poly = gpd.GeoDataFrame(
...             geometry=[Polygon([(0.05, -0.05), (0.05, -0.1), (0.1, -0.1), (0.1, -0.05)])], crs=4326
... )
>>> df = dataset.to_feature_collection(vector_mask=poly)
>>> print(df) # doctest: +SKIP
     Band_1    Band_2
0  0.354482  0.383279

If you have a big dataset, and you want to convert it to dataframe in tiles (do not read the whole dataset at once but in tiles), you can use the tile and the tile_size parameters. The values will be the same as above; the difference is reading in chunks:

>>> gdf = dataset.to_feature_collection(tile=True, tile_size=1)
>>> print(gdf) # doctest: +SKIP
     Band_1    Band_2
0  0.886258  0.070519
1  0.818043  0.676508
2  0.993727  0.176250
3  0.853331  0.412581
4  0.354482  0.383279
5  0.780793  0.187831
6  0.438871  0.837413
7  0.681662  0.704464
8  0.531710  0.649136

Source code in pyramids/dataset.py

def to_feature_collection(
    self,
    vector_mask: Optional[GeoDataFrame] = None,
    add_geometry: Optional[str] = None,
    tile: bool = False,
    tile_size: int = 256,
    touch: bool = True,
) -> Union[DataFrame, GeoDataFrame]:
    """Convert a dataset to a vector.

    The function does the following:
        - Flatten the array in each band in the raster then mask the values if a vector_mask file is given
            otherwise it will flatten all values.
        - Put the values for each band in a column in a dataframe under the name of the raster band,
            but if no meta-data in the raster band exists, an index number will be used [1, 2, 3, ...]
        - The function has an add_geometry parameter with two possible values ["point", "polygon"], which you can
            specify the type of shapely geometry you want to create from each cell,

            - If point is chosen, the created point will be at the center of each cell
            - If a polygon is chosen, a square polygon will be created that covers the entire cell.

    Args:
        vector_mask (GeoDataFrame, optional):
            GeoDataFrame for the vector_mask. If given, it will be used to clip the raster. The clipped
            raster is used both when reading the whole array and when reading in tiles.
        add_geometry (str):
            "Polygon" or "Point" if you want to add a polygon geometry of the cells as column in dataframe.
            The comparison is case-insensitive; any non-empty value other than "point" yields polygons.
            Default is None.
        tile (bool):
            True to use tiles in extracting the values from the raster. Default is False.
            In tile mode the no-data values are not removed from the frame.
        tile_size (int):
            Tile size. Default is 256.
        touch (bool):
            Include the cells that touch the polygon not only those that lie entirely inside the polygon mask.
            Default is True.

    Returns:
        DataFrame | GeoDataFrame:
            The resulting frame will have the band value under the name of the band (if the raster file has
            metadata; if not, the bands will be indexed from 1 to the number of bands). When `add_geometry`
            is given, the frame is a GeoDataFrame carrying the CRS of the generated cell geometries.

    Examples:
        - Create a dataset from array with 2 bands and 3*3 array each:

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(2, 3, 3)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          >>> print(dataset.read_array(band=0)) # doctest: +SKIP
          [[0.88625832 0.81804328 0.99372706]
           [0.85333054 0.35448201 0.78079262]
           [0.43887136 0.68166208 0.53170966]]
          >>> print(dataset.read_array(band=1)) # doctest: +SKIP
          [[0.07051872 0.67650833 0.17625027]
           [0.41258071 0.38327938 0.18783139]
           [0.83741314 0.70446373 0.64913575]]

          ```

        - Convert the dataset to dataframe by calling the `to_feature_collection` method:

          ```python
          >>> df = dataset.to_feature_collection()
          >>> print(df) # doctest: +SKIP
               Band_1    Band_2
          0  0.886258  0.070519
          1  0.818043  0.676508
          2  0.993727  0.176250
          3  0.853331  0.412581
          4  0.354482  0.383279
          5  0.780793  0.187831
          6  0.438871  0.837413
          7  0.681662  0.704464
          8  0.531710  0.649136

          ```

        - Convert the dataset into geodataframe with either a polygon or a point geometry that represents each cell.
            To specify the geometry type use the parameter `add_geometry`:

              ```python
              >>> gdf = dataset.to_feature_collection(add_geometry="point")
              >>> print(gdf) # doctest: +SKIP
                   Band_1    Band_2                  geometry
              0  0.886258  0.070519  POINT (0.02500 -0.02500)
              1  0.818043  0.676508  POINT (0.07500 -0.02500)
              2  0.993727  0.176250  POINT (0.12500 -0.02500)
              3  0.853331  0.412581  POINT (0.02500 -0.07500)
              4  0.354482  0.383279  POINT (0.07500 -0.07500)
              5  0.780793  0.187831  POINT (0.12500 -0.07500)
              6  0.438871  0.837413  POINT (0.02500 -0.12500)
              7  0.681662  0.704464  POINT (0.07500 -0.12500)
              8  0.531710  0.649136  POINT (0.12500 -0.12500)
              >>> gdf = dataset.to_feature_collection(add_geometry="polygon")
              >>> print(gdf) # doctest: +SKIP
                   Band_1    Band_2                                           geometry
              0  0.886258  0.070519  POLYGON ((0.00000 0.00000, 0.05000 0.00000, 0....
              1  0.818043  0.676508  POLYGON ((0.05000 0.00000, 0.10000 0.00000, 0....
              2  0.993727  0.176250  POLYGON ((0.10000 0.00000, 0.15000 0.00000, 0....
              3  0.853331  0.412581  POLYGON ((0.00000 -0.05000, 0.05000 -0.05000, ...
              4  0.354482  0.383279  POLYGON ((0.05000 -0.05000, 0.10000 -0.05000, ...
              5  0.780793  0.187831  POLYGON ((0.10000 -0.05000, 0.15000 -0.05000, ...
              6  0.438871  0.837413  POLYGON ((0.00000 -0.10000, 0.05000 -0.10000, ...
              7  0.681662  0.704464  POLYGON ((0.05000 -0.10000, 0.10000 -0.10000, ...
              8  0.531710  0.649136  POLYGON ((0.10000 -0.10000, 0.15000 -0.10000, ...

              ```

        - Use a mask to crop part of the dataset, and then convert the cropped part to a dataframe/geodataframe:

          - Create a mask that covers only the cell in the middle of the dataset.

              ```python
              >>> import geopandas as gpd
              >>> from shapely.geometry import Polygon
              >>> poly = gpd.GeoDataFrame(
              ...             geometry=[Polygon([(0.05, -0.05), (0.05, -0.1), (0.1, -0.1), (0.1, -0.05)])], crs=4326
              ... )
              >>> df = dataset.to_feature_collection(vector_mask=poly)
              >>> print(df) # doctest: +SKIP
                   Band_1    Band_2
              0  0.354482  0.383279

              ```

        - If you have a big dataset, and you want to convert it to dataframe in tiles (do not read the whole dataset
            at once but in tiles), you can use the `tile` and the `tile_size` parameters. The values will be the
            same as above; the difference is reading in chunks:

              ```python
              >>> gdf = dataset.to_feature_collection(tile=True, tile_size=1)
              >>> print(gdf) # doctest: +SKIP
                   Band_1    Band_2
              0  0.886258  0.070519
              1  0.818043  0.676508
              2  0.993727  0.176250
              3  0.853331  0.412581
              4  0.354482  0.383279
              5  0.780793  0.187831
              6  0.438871  0.837413
              7  0.681662  0.704464
              8  0.531710  0.649136

              ```

    """
    # Get raster band names.
    band_names = self.band_names

    # Crop to the pixels touched by the vector_mask; every later read goes through `src`.
    if vector_mask is not None:
        src = self.crop(mask=vector_mask, touch=touch)
    else:
        src = self

    if tile:
        df_list = []  # DataFrames of each tile.
        # iterate over the (possibly cropped) source so that the vector_mask is honoured in tile mode
        for arr in src.get_tile(tile_size):
            # Assume multi-band
            idx = (1, 2)
            if arr.ndim == 2:
                # Handle single band rasters
                idx = (0, 1)

            mask_arr = np.ones((arr.shape[idx[0]], arr.shape[idx[1]]))
            pixels = get_pixels(arr, mask_arr).transpose()
            df_list.append(pd.DataFrame(pixels, columns=band_names))

        # Merge all the tiles under one continuous index, like the non-tiled branch produces.
        df = pd.concat(df_list, ignore_index=True)
    else:
        arr = src.read_array()

        if src.band_count == 1:
            pixels = arr.flatten()
        else:
            # one row per cell, one column per band
            pixels = (
                arr.flatten()
                .reshape(src.band_count, src.columns * src.rows)
                .transpose()
            )
        df = pd.DataFrame(pixels, columns=band_names)
        # mask no data values (the no-data value of the first band is applied to every column).
        if src.no_data_value[0] is not None:
            df.replace(src.no_data_value[0], np.nan, inplace=True)
        df.dropna(axis=0, inplace=True, ignore_index=True)

    df.drop(columns=["burn_value", "geometry"], errors="ignore", inplace=True)
    if add_geometry:
        if add_geometry.lower() == "point":
            coords = src.get_cell_points(mask=True)
        else:
            coords = src.get_cell_polygons(mask=True)
        # pass the CRS to the constructor: `set_crs` without `inplace=True` returns a new frame,
        # so calling it and dropping the result leaves the frame without a CRS.
        df = gpd.GeoDataFrame(
            df, geometry=coords["geometry"].to_list(), crs=coords.crs
        )

    return df

`apply(func, band=0)` #

Apply a function to all domain cells.

apply method executes a mathematical operation on the raster array.
The apply method executes the function only on one cell at a time.

Parameters:

Name	Type	Description	Default
`func`	`function`	Defined function that takes one input (the cell value).	required
`band`	`int`	Band number.	`0`

Returns:

Name	Type	Description
`Dataset`	`Dataset`	Dataset object.

Examples:

Create a dataset from an array filled with values between -1 and 1:

>>> import numpy as np
>>> arr = np.random.uniform(-1, 1, size=(5, 5))
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> print(dataset.read_array()) # doctest: +SKIP
[[ 0.94997539 -0.80083622 -0.30948769 -0.77439961 -0.83836424]
 [-0.36810158 -0.23979251  0.88051216 -0.46882913  0.64511056]
 [ 0.50585374 -0.46905902  0.67856589  0.2779605   0.05589759]
 [ 0.63382852 -0.49259597  0.18471423 -0.49308984 -0.52840286]
 [-0.34076174 -0.53073014 -0.18485789 -0.40033474 -0.38962938]]

Apply the absolute function to the dataset:

>>> abs_dataset = dataset.apply(np.abs)
>>> print(abs_dataset.read_array()) # doctest: +SKIP
[[0.94997539 0.80083622 0.30948769 0.77439961 0.83836424]
 [0.36810158 0.23979251 0.88051216 0.46882913 0.64511056]
 [0.50585374 0.46905902 0.67856589 0.2779605  0.05589759]
 [0.63382852 0.49259597 0.18471423 0.49308984 0.52840286]
 [0.34076174 0.53073014 0.18485789 0.40033474 0.38962938]]

Source code in pyramids/dataset.py

def apply(self, func, band: int = 0) -> "Dataset":
    """Apply a function to all domain cells.

    - apply method executes a mathematical operation on the raster array.
    - The apply method executes the function only on one cell at a time.
    - Cells whose value is close to the no-data value of the band (relative tolerance 0.001) are not
        passed to the function and keep the no-data value in the result.
    - The result is a new in-memory dataset with a single band; the source dataset is not modified.

    Args:
        func (function):
            Defined function that takes one input (the cell value).
        band (int):
            Index of the source band to read. Default is 0.

    Returns:
        Dataset:
            Dataset object with one band holding the transformed values.

    Raises:
        TypeError:
            If `func` is not callable.

    Examples:
        - Create a dataset from an array filled with values between -1 and 1:

          ```python
          >>> import numpy as np
          >>> arr = np.random.uniform(-1, 1, size=(5, 5))
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          >>> print(dataset.read_array()) # doctest: +SKIP
          [[ 0.94997539 -0.80083622 -0.30948769 -0.77439961 -0.83836424]
           [-0.36810158 -0.23979251  0.88051216 -0.46882913  0.64511056]
           [ 0.50585374 -0.46905902  0.67856589  0.2779605   0.05589759]
           [ 0.63382852 -0.49259597  0.18471423 -0.49308984 -0.52840286]
           [-0.34076174 -0.53073014 -0.18485789 -0.40033474 -0.38962938]]

          ```

        - Apply the absolute function to the dataset:

          ```python
          >>> abs_dataset = dataset.apply(np.abs)
          >>> print(abs_dataset.read_array()) # doctest: +SKIP
          [[0.94997539 0.80083622 0.30948769 0.77439961 0.83836424]
           [0.36810158 0.23979251 0.88051216 0.46882913 0.64511056]
           [0.50585374 0.46905902 0.67856589 0.2779605  0.05589759]
           [0.63382852 0.49259597 0.18471423 0.49308984 0.52840286]
           [0.34076174 0.53073014 0.18485789 0.40033474 0.38962938]]

          ```
    """
    if not callable(func):
        raise TypeError("The second argument should be a function")

    no_data_value = self.no_data_value[band]
    src_array = self.read_array(band)
    dtype = self.gdal_dtype[band]

    # fill the new array with the nodata value
    new_array = np.ones((self.rows, self.columns)) * no_data_value
    # compute the domain mask once for the whole array instead of once per cell
    in_domain = ~np.isclose(src_array, no_data_value, rtol=0.001)
    # execute the function on each domain cell, in row-major order
    for i, j in zip(*np.nonzero(in_domain)):
        new_array[i, j] = func(src_array[i, j])

    # create the output raster (a single band)
    dst = Dataset._create_dataset(self.columns, self.rows, 1, dtype, driver="MEM")
    # set the geotransform
    dst.SetGeoTransform(self.geotransform)
    # set the projection
    dst.SetProjection(self.crs)
    dst_obj = Dataset(dst)
    dst_obj._set_no_data_value(no_data_value=no_data_value)
    # the output has exactly one band, so the result always goes to band 1 regardless of the
    # index of the source band.
    dst_obj.raster.GetRasterBand(1).WriteArray(new_array)

    return dst_obj

`fill(value, inplace=False, path=None)` #

Fill the domain cells with a certain value.

Fill takes a raster and fills it with one value

Parameters:

Name	Type	Description	Default
`value`	`float \| int`	Numeric value to fill.	required
`inplace`	`bool`	If True, the original dataset will be modified. If False, a new dataset will be created. Default is False.	`False`
`path`	`str`	Path including the extension (.tif).	`None`

Returns:

Name	Type	Description
`Dataset`	`Union[Dataset, None]`	The resulting dataset if inplace is False; otherwise None.

Examples:

Create a Dataset with 1 band, 5 rows, 5 columns, at the point lon/lat (0, 0):

>>> import numpy as np
>>> arr = np.random.randint(1, 5, size=(5, 5))
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> print(dataset.read_array()) # doctest: +SKIP
[[1 1 3 1 2]
 [2 2 2 1 2]
 [2 2 3 1 3]
 [3 4 3 3 4]
 [4 4 2 1 1]]
>>> new_dataset = dataset.fill(10)
>>> print(new_dataset.read_array())
[[10 10 10 10 10]
 [10 10 10 10 10]
 [10 10 10 10 10]
 [10 10 10 10 10]
 [10 10 10 10 10]]

Source code in pyramids/dataset.py

def fill(
    self, value: Union[float, int], inplace: bool = False, path: str = None
) -> Union["Dataset", None]:
    """Overwrite every in-domain cell of the raster with one value.

    Cells that hold the no-data value are left untouched; every other cell is set to `value`. The no-data value of
    the first band (`self.no_data_value[0]`) is the one used to identify the out-of-domain cells.

    Args:
        value (float | int):
            Numeric value to fill.
        inplace (bool):
            If True, the original dataset will be modified. If False, a new dataset will be created. Default is False.
        path (str):
            Path including the extension (.tif). It is forwarded to `Dataset.dataset_like`.

    Returns:
        Dataset | None:
            The resulting dataset if inplace is False; otherwise None.

    Examples:
        - Create a Dataset with 1 band, 5 rows, 5 columns, at the point lon/lat (0, 0):

          ```python
          >>> import numpy as np
          >>> arr = np.random.randint(1, 5, size=(5, 5))
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          >>> print(dataset.read_array()) # doctest: +SKIP
          [[1 1 3 1 2]
           [2 2 2 1 2]
           [2 2 3 1 3]
           [3 4 3 3 4]
           [4 4 2 1 1]]
          >>> new_dataset = dataset.fill(10)
          >>> print(new_dataset.read_array())
          [[10 10 10 10 10]
           [10 10 10 10 10]
           [10 10 10 10 10]
           [10 10 10 10 10]
           [10 10 10 10 10]]

          ```
    """
    nodata = self.no_data_value[0]
    data = self.raster.ReadAsArray()

    # without a declared no-data value, NaN cells are the ones treated as out of domain
    if nodata is None:
        nodata = np.nan

    # NaN never compares equal/close to itself, so it needs its own test
    if np.isnan(nodata):
        in_domain = ~np.isnan(data)
    else:
        in_domain = ~np.isclose(data, nodata, rtol=0.000001)
    data[in_domain] = value

    filled = Dataset.dataset_like(self, data, path=path)
    if not inplace:
        return filled
    # re-initialise the current object on top of the newly created raster
    self.__init__(filled.raster)

`resample(cell_size, method='nearest neighbor')` #

resample.

The resample method changes the cell size of the raster while keeping its coordinate reference system and top-left corner; the number of rows and columns is recomputed from the new cell size. The function returns a new Dataset object.

Parameters:

Name	Type	Description	Default
`cell_size`	`int \| float`	New cell size of the resampled raster, in the same units as the dataset's geotransform. Must be a positive number.	required
`method`	`str`	Resampling method: "nearest neighbor", "cubic", or "bilinear". Default is "nearest neighbor".	`'nearest neighbor'`

Returns:

Name	Type	Description
`Dataset`	`Dataset`	Dataset object.

Examples:

Create a Dataset with 4 bands, 10 rows, 10 columns, at lon/lat (0, 0):

>>> import numpy as np
>>> arr = np.random.rand(4, 10, 10)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> print(dataset)
<BLANKLINE>
            Cell size: 0.05
            Dimension: 10 * 10
            EPSG: 4326
            Number of Bands: 4
            Band names: ['Band_1', 'Band_2', 'Band_3', 'Band_4']
            Mask: -9999.0
            Data type: float64
            File: ...
<BLANKLINE>
>>> dataset.plot(band=0)
(<Figure size 800x800 with 2 Axes>, <Axes: >)

Resample the raster to a new cell size of 0.1:

>>> new_dataset = dataset.resample(cell_size=0.1)
>>> print(new_dataset)
<BLANKLINE>
            Cell size: 0.1
            Dimension: 5 * 5
            EPSG: 4326
            Number of Bands: 4
            Band names: ['Band_1', 'Band_2', 'Band_3', 'Band_4']
            Mask: -9999.0
            Data type: float64
            File:...
<BLANKLINE>
>>> new_dataset.plot(band=0)
(<Figure size 800x800 with 2 Axes>, <Axes: >)

Resampling the dataset from cell_size 0.05 to 0.1 degrees reduced the number of cells to 5 in each dimension instead of 10.

Source code in pyramids/dataset.py

def resample(
    self, cell_size: Union[int, float], method: str = "nearest neighbor"
) -> "Dataset":
    """Resample the dataset to a new cell size.

    The coordinate reference system and the top-left corner of the raster are kept; only the cell size changes, and
    the number of rows and columns is recomputed from the extent of the raster and the new cell size. The values
    are transferred with `gdal.ReprojectImage`, using the same CRS for the source and the destination.

    Args:
        cell_size (int | float):
            New cell size of the resampled raster, in the same units as the dataset's geotransform. Must be a
            positive number.
        method (str):
            Resampling method: "nearest neighbor", "cubic", or "bilinear". Default is "nearest neighbor".

    Returns:
        Dataset:
            Dataset object.

    Raises:
        TypeError:
            If `method` is not a string.
        ValueError:
            If `method` is not one of the keys of `INTERPOLATION_METHODS`, or if `cell_size` is not positive.

    Examples:
        - Create a Dataset with 4 bands, 10 rows, 10 columns, at lon/lat (0, 0):

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(4, 10, 10)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          >>> print(dataset)
          <BLANKLINE>
                      Cell size: 0.05
                      Dimension: 10 * 10
                      EPSG: 4326
                      Number of Bands: 4
                      Band names: ['Band_1', 'Band_2', 'Band_3', 'Band_4']
                      Mask: -9999.0
                      Data type: float64
                      File: ...
          <BLANKLINE>
          >>> dataset.plot(band=0)
          (<Figure size 800x800 with 2 Axes>, <Axes: >)

          ```
          ![resample-source](./../_images/dataset/resample-source.png)

        - Resample the raster to a new cell size of 0.1:

          ```python
          >>> new_dataset = dataset.resample(cell_size=0.1)
          >>> print(new_dataset)
          <BLANKLINE>
                      Cell size: 0.1
                      Dimension: 5 * 5
                      EPSG: 4326
                      Number of Bands: 4
                      Band names: ['Band_1', 'Band_2', 'Band_3', 'Band_4']
                      Mask: -9999.0
                      Data type: float64
                      File:...
          <BLANKLINE>
          >>> new_dataset.plot(band=0)
          (<Figure size 800x800 with 2 Axes>, <Axes: >)

          ```
          ![resample-new](./../_images/dataset/resample-new.png)

        - Resampling the dataset from cell_size 0.05 to 0.1 degrees reduced the number of cells to 5 in each dimension instead of 10.
    """
    if not isinstance(method, str):
        raise TypeError(
            "Please enter a correct method, for more information, see documentation"
        )
    if method not in INTERPOLATION_METHODS:
        raise ValueError(
            f"The given interpolation method does not exist, existing methods are {INTERPOLATION_METHODS.keys()}"
        )
    # a zero cell size divides by zero below and a negative one yields negative raster dimensions
    if cell_size <= 0:
        raise ValueError(f"cell_size must be a positive number, given {cell_size}")

    resampling_alg = INTERPOLATION_METHODS.get(method)

    # source and destination share the same CRS, so the WKT is exported once
    src_wkt = osr.SpatialReference(wkt=self.crs).ExportToWkt()

    geotransform = self.geotransform
    # upper-left corner
    ulx = geotransform[0]
    uly = geotransform[3]
    # lower-right corner
    lrx = ulx + geotransform[1] * self.columns
    lry = uly + geotransform[5] * self.rows

    # new geotransform: same origin and rotation terms, new cell size
    new_geo = (
        ulx,
        cell_size,
        geotransform[2],
        uly,
        geotransform[4],
        -1 * cell_size,
    )
    # dimensions of the new raster covering the same extent
    cols = int(np.round(abs(lrx - ulx) / cell_size))
    rows = int(np.round(abs(uly - lry) / cell_size))
    dtype = self.gdal_dtype[0]
    bands = self.band_count

    dst = Dataset._create_dataset(cols, rows, bands, dtype)
    # set the geotransform
    dst.SetGeoTransform(new_geo)
    # set the projection
    dst.SetProjection(src_wkt)
    dst_obj = Dataset(dst, "write")
    # set the no data value
    dst_obj._set_no_data_value(self.no_data_value)
    # perform the resampling (source and destination CRS are identical)
    gdal.ReprojectImage(
        self.raster,
        dst_obj.raster,
        src_wkt,
        src_wkt,
        resampling_alg,
    )

    return dst_obj

`fill_gaps(mask, src_array)` #

Fill gaps in src_array using nearest neighbors where mask indicates valid cells.

Parameters:

Name	Type	Description	Default
`mask`	`Dataset \| ndarray`	Mask dataset or array used to determine valid cells.	required
`src_array`	`ndarray`	Source array whose gaps will be filled.	required

Returns:

Type	Description
`ndarray`	np.ndarray: The source array with gaps filled where applicable.

Source code in pyramids/dataset.py

def fill_gaps(self, mask, src_array: np.ndarray) -> np.ndarray:
    """Fill cells that are no-data in src_array but valid in the mask.

    The number of valid (non no-data) cells is counted in both `src_array` and the mask array. When the mask has
    more valid cells than `src_array`, the cells that are no-data in `src_array` and valid in the mask are located
    and handed to `Dataset._nearest_neighbour` to be filled.

    Args:
        mask (Dataset):
            Mask dataset used to determine valid cells. The method reads `mask.read_array()` and
            `mask.no_data_value`, so a plain `np.ndarray` is not supported.
        src_array (np.ndarray):
            Source array whose gaps will be filled. It is compared element-wise with the mask array, so both are
            expected to be 2D arrays with the same shape.

    Returns:
        np.ndarray: The source array with gaps filled where applicable; the array is returned unchanged when
        there is nothing to fill.
    """
    # align function only equate the no of rows and columns only
    # match no_data_value inserts no_data_value in src raster to all places like mask
    # still places that has no_data_value in the src raster, but it is not no_data_value in the mask
    # and now has to be filled with values
    mask_array = mask.read_array()
    mask_noval = mask.no_data_value[0]

    if isinstance(mask, Dataset) and isinstance(self, Dataset):
        src_noval = self.no_data_value[0]
        src_is_nodata = np.isclose(src_array, src_noval, rtol=0.001)
        mask_is_nodata = np.isclose(mask_array, mask_noval, rtol=0.001)

        # count the valid cells from the boolean masks; counting the non-zero *values* of the selected cells
        # would miss every no-data cell when the no-data value itself is 0
        elem_src = src_array.size - np.count_nonzero(src_is_nodata)
        elem_mask = mask_array.size - np.count_nonzero(mask_is_nodata)

        # the mask has more valid cells than the source: locate and fill the cells that don't match
        if elem_mask > elem_src:
            # np.nonzero walks the array in row-major order, the same order as a nested row/column loop
            gap_rows, gap_cols = np.nonzero(src_is_nodata & ~mask_is_nodata)
            src_array = Dataset._nearest_neighbour(
                src_array, src_noval, gap_rows.tolist(), gap_cols.tolist()
            )
    return src_array

`align(alignment_src)` #

Align the current dataset (rows and columns) to match a given dataset.

Copies spatial properties from alignment_src to the current raster

The coordinate system
The number of rows and columns
Cell size

Then resamples values from the current dataset using the nearest neighbor interpolation.

Parameters:

Name	Type	Description	Default
`alignment_src`	`Dataset`	Spatial information source raster to get the spatial information (coordinate system, number of rows and columns). The data values of the current dataset are resampled to this alignment.	required

Returns:

Name	Type	Description
`Dataset`	`Dataset`	The aligned dataset.

Examples:

The source dataset has a top_left_corner at (0, 0) with a 5*5 alignment, and a 0.05 degree cell size.

>>> import numpy as np
>>> arr = np.random.rand(5, 5)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> print(dataset)
<BLANKLINE>
            Cell size: 0.05
            Dimension: 5 * 5
            EPSG: 4326
            Number of Bands: 1
            Band names: ['Band_1']
            Mask: -9999.0
            Data type: float64
            File:...
<BLANKLINE>

The dataset to be aligned has a top_left_corner at (-0.1, 0.1) (i.e., it has two more rows on top of the dataset, and two columns on the left of the dataset).

>>> arr = np.random.rand(10, 10)
>>> top_left_corner = (-0.1, 0.1)
>>> cell_size = 0.07
>>> dataset_target = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,
... epsg=4326)
>>> print(dataset_target)
<BLANKLINE>
            Cell size: 0.07
            Dimension: 10 * 10
            EPSG: 4326
            Number of Bands: 1
            Band names: ['Band_1']
            Mask: -9999.0
            Data type: float64
            File:...
<BLANKLINE>

align-source-target

Now call the align method and use the dataset as the alignment source.

>>> aligned_dataset = dataset_target.align(dataset)
>>> print(aligned_dataset)
<BLANKLINE>
            Cell size: 0.05
            Dimension: 5 * 5
            EPSG: 4326
            Number of Bands: 1
            Band names: ['Band_1']
            Mask: -9999.0
            Data type: float64
            File:...
<BLANKLINE>

align-result

Source code in pyramids/dataset.py

def align(
    self,
    alignment_src: "Dataset",
) -> "Dataset":
    """Align the current dataset (rows and columns) to match a given dataset.

    Copies spatial properties from alignment_src to the current raster:
        - The coordinate system
        - The number of rows and columns
        - Cell size
    Then resamples values from the current dataset using the nearest neighbor interpolation.

    Args:
        alignment_src (Dataset):
            Spatial information source raster to get the spatial information (coordinate system, number of rows and
            columns). The data values of the current dataset are resampled to this alignment.

    Returns:
        Dataset: The aligned dataset.

    Raises:
        TypeError:
            If `alignment_src` is not a `Dataset` object.

    Examples:
        - The source dataset has a `top_left_corner` at (0, 0) with a 5*5 alignment, and a 0.05 degree cell size.

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(5, 5)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          >>> print(dataset)
          <BLANKLINE>
                      Cell size: 0.05
                      Dimension: 5 * 5
                      EPSG: 4326
                      Number of Bands: 1
                      Band names: ['Band_1']
                      Mask: -9999.0
                      Data type: float64
                      File:...
          <BLANKLINE>

          ```

        - The dataset to be aligned has a top_left_corner at (-0.1, 0.1) (i.e., it has two more rows on top of the
          dataset, and two columns on the left of the dataset).

          ```python
          >>> arr = np.random.rand(10, 10)
          >>> top_left_corner = (-0.1, 0.1)
          >>> cell_size = 0.07
          >>> dataset_target = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,
          ... epsg=4326)
          >>> print(dataset_target)
          <BLANKLINE>
                      Cell size: 0.07
                      Dimension: 10 * 10
                      EPSG: 4326
                      Number of Bands: 1
                      Band names: ['Band_1']
                      Mask: -9999.0
                      Data type: float64
                      File:...
          <BLANKLINE>

          ```

        ![align-source-target](./../_images/dataset/align-source-target.png)

        - Now call the `align` method and use the dataset as the alignment source.

          ```python
          >>> aligned_dataset = dataset_target.align(dataset)
          >>> print(aligned_dataset)
          <BLANKLINE>
                      Cell size: 0.05
                      Dimension: 5 * 5
                      EPSG: 4326
                      Number of Bands: 1
                      Band names: ['Band_1']
                      Mask: -9999.0
                      Data type: float64
                      File:...
          <BLANKLINE>

          ```

        ![align-result](./../_images/dataset/align-result.png)
    """
    # only Dataset objects are accepted; paths are not opened here
    if not isinstance(alignment_src, Dataset):
        raise TypeError(
            "The alignment_src parameter should be a Dataset object, "
            f"given {type(alignment_src)}"
        )
    src = alignment_src

    # reproject the raster to match the projection of alignment_src
    if self.epsg == src.epsg:
        reprojected = self
    else:
        reprojected = self.to_crs(src.epsg)

    # create a new raster with the grid of alignment_src and the band count of the current dataset
    # NOTE(review): the band data type is taken from alignment_src (src.gdal_dtype[0]) although the values and
    # the no-data value come from the current dataset - confirm this is intended.
    dst = Dataset._create_dataset(
        src.columns, src.rows, self.band_count, src.gdal_dtype[0], driver="MEM"
    )
    # set the geotransform
    dst.SetGeoTransform(src.geotransform)
    # set the projection
    dst.SetProjection(src.crs)
    # set the no data value
    dst_obj = Dataset(dst)
    dst_obj._set_no_data_value(self.no_data_value)
    # resample the (possibly reprojected) raster onto the new grid; both sides use the CRS of alignment_src
    method = gdal.GRA_NearestNeighbour
    gdal.ReprojectImage(
        reprojected.raster,
        dst_obj.raster,
        src.crs,
        src.crs,
        method,
    )

    return dst_obj

`correct_wrap_cutline_error(src)` `staticmethod` #

Correct wrap cutline error.

https://github.com/Serapieum-of-alex/pyramids/issues/74

Source code in pyramids/dataset.py

@staticmethod
def correct_wrap_cutline_error(src: "Dataset"):
    """Correct wrap cutline error.

    https://github.com/Serapieum-of-alex/pyramids/issues/74

    Removes the rows and columns that are entirely filled with the no-data value of the first band
    (`src.no_data_value[0]`) and builds a new dataset from the remaining cells, with a geotransform whose origin
    is moved to the first remaining row/column.

    Args:
        src (Dataset):
            Dataset to trim. Its array (from `src.read_array()`) must be 2D or 3D.

    Returns:
        The object returned by `src.create_from_array` for the trimmed array.

    Raises:
        ValueError:
            If the array is neither 2D nor 3D.
    """
    data = src.read_array()
    fill_value = src.no_data_value[0]
    # Remove rows and columns that are all filled with a certain value from the array.
    is_fill = data == fill_value
    if data.ndim == 2:
        empty_rows = np.all(is_fill, axis=1)
        empty_cols = np.all(is_fill, axis=0)
        # boolean indexing: drop the rows first, then the columns
        trimmed = data[~empty_rows][:, ~empty_cols]
    elif data.ndim == 3:
        # a row/column is dropped only when it is empty in every band
        empty_rows = np.all(is_fill, axis=(0, 2))
        empty_cols = np.all(is_fill, axis=(0, 1))
        trimmed = data[:, ~empty_rows, :][:, :, ~empty_cols]
        kept_rows = np.count_nonzero(~empty_rows)
        kept_cols = np.count_nonzero(~empty_cols)
        trimmed = trimmed.reshape((src.band_count, kept_rows, kept_cols))
    else:
        raise ValueError("Array must be 2D or 3D")

    # index of the first row / column that survives the trimming
    first_row = np.where(~empty_rows)[0][0]
    first_col = np.where(~empty_cols)[0][0]
    # src.x / src.y are shifted by half a cell to obtain the top-left corner of the first kept cell
    origin_x = src.x[first_col] - src.cell_size / 2
    origin_y = src.y[first_row] + src.cell_size / 2
    trimmed_gt = (origin_x, src.cell_size, 0, origin_y, 0, -src.cell_size)
    return src.create_from_array(
        trimmed, geo=trimmed_gt, epsg=src.epsg, no_data_value=src.no_data_value
    )

`crop(mask, touch=True, inplace=False)` #

Crop dataset using dataset/feature collection.

Crop/Clip the Dataset object using a polygon/raster.

Parameters:

Name	Type	Description	Default
`mask`	`GeoDataFrame \| Dataset`	GeoDataFrame with a polygon geometry, or a Dataset object.	required
`touch`	`bool`	Include the cells that touch the polygon, not only those that lie entirely inside the polygon mask. Default is True.	`True`
`inplace`	`bool`	If True, apply changes in place. Default is False.	`False`

Returns:

Type	Description
`Union[Dataset, None]`	Dataset \| None: The cropped raster. If inplace is True, the method will change the raster in place and return None.

Hint

If the mask is a dataset with multi-bands, the crop method will use the first band as the mask.

Examples:

Crop the raster using a polygon mask.
The polygon covers 4 cells in the 3rd and 4th rows and 3rd and 4th column arr[2:4, 2:4], so the result dataset will have the same number of bands 4, 2 rows and 2 columns.
First, create the dataset to have 4 bands, 10 rows and 10 columns; the dataset has a cell size of 0.05 degree, the top left corner of the dataset is (0, 0).

>>> import numpy as np
>>> import geopandas as gpd
>>> from shapely.geometry import Polygon
>>> arr = np.random.rand(4, 10, 10)
>>> cell_size = 0.05
>>> top_left_corner = (0, 0)
>>> dataset = Dataset.create_from_array(
...         arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326
... )

- Second, create the polygon using shapely polygon, and use the xmin, ymin, xmax, ymax = [0.1, -0.2, 0.2, -0.1] to cover the 4 cells.

```python
>>> mask = gpd.GeoDataFrame(geometry=[Polygon([(0.1, -0.1), (0.1, -0.2), (0.2, -0.2), (0.2, -0.1)])], crs=4326)

```

Pass the geodataframe to the crop method using the mask parameter.

>>> cropped_dataset = dataset.crop(mask=mask)

- Check the cropped dataset:

>>> print(cropped_dataset.shape)
(4, 2, 2)
>>> print(cropped_dataset.geotransform)
(0.1, 0.05, 0.0, -0.1, 0.0, -0.05)
>>> print(cropped_dataset.read_array(band=0))# doctest: +SKIP
[[0.00921161 0.90841171]
 [0.355636   0.18650262]]
>>> print(arr[0, 2:4, 2:4])# doctest: +SKIP
[[0.00921161 0.90841171]
 [0.355636   0.18650262]]

- Crop a raster using another raster mask:

Create a mask dataset with the same extent of the polygon we used in the previous example.

>>> geotransform = (0.1, 0.05, 0.0, -0.1, 0.0, -0.05)
>>> mask_dataset = Dataset.create_from_array(np.random.rand(2, 2), geo=geotransform, epsg=4326)

- Then use the mask dataset to crop the dataset.

>>> cropped_dataset_2 = dataset.crop(mask=mask_dataset)
>>> print(cropped_dataset_2.shape)
(4, 2, 2)

- Check the cropped dataset:

>>> print(cropped_dataset_2.geotransform)
(0.1, 0.05, 0.0, -0.1, 0.0, -0.05)
>>> print(cropped_dataset_2.read_array(band=0))# doctest: +SKIP
[[0.00921161 0.90841171]
 [0.355636   0.18650262]]
>>> print(arr[0, 2:4, 2:4])# doctest: +SKIP
 [[0.00921161 0.90841171]
 [0.355636   0.18650262]]

Source code in pyramids/dataset.py

def crop(
    self,
    mask: Union[GeoDataFrame, "Dataset"],
    touch: bool = True,
    inplace: bool = False,
) -> Union["Dataset", None]:
    """Crop dataset using dataset/feature collection.

    Crop/Clip the Dataset object using a polygon/raster.

    Args:
        mask (GeoDataFrame | Dataset):
            GeoDataFrame with a polygon geometry, or a Dataset object.
        touch (bool):
            Include the cells that touch the polygon, not only those that lie entirely inside the polygon mask.
            Only used when the mask is a GeoDataFrame. Default is True.
        inplace (bool):
            If True, apply changes in place. Default is False.

    Returns:
        Dataset | None:
            The cropped raster. If inplace is True, the method will change the raster in place and return None.

    Raises:
        TypeError:
            If `mask` is neither a GeoDataFrame nor a Dataset.

    Hint:
        - If the mask is a dataset with multi-bands, the `crop` method will use the first band as the mask.

    Examples:
        - Crop the raster using a polygon mask.

          - The polygon covers 4 cells in the 3rd and 4th rows and 3rd and 4th column `arr[2:4, 2:4]`, so the result
            dataset will have the same number of bands `4`, 2 rows and 2 columns.
          - First, create the dataset to have 4 bands, 10 rows and 10 columns; the dataset has a cell size of 0.05
            degree, the top left corner of the dataset is (0, 0).

          ```python
          >>> import numpy as np
          >>> import geopandas as gpd
          >>> from shapely.geometry import Polygon
          >>> arr = np.random.rand(4, 10, 10)
          >>> cell_size = 0.05
          >>> top_left_corner = (0, 0)
          >>> dataset = Dataset.create_from_array(
          ...         arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326
          ... )

          ```
        - Second, create the polygon using shapely polygon, and use the
          xmin, ymin, xmax, ymax = [0.1, -0.2, 0.2, -0.1] to cover the 4 cells.

          ```python
          >>> mask = gpd.GeoDataFrame(geometry=[Polygon([(0.1, -0.1), (0.1, -0.2), (0.2, -0.2), (0.2, -0.1)])], crs=4326)

          ```
        - Pass the `geodataframe` to the crop method using the `mask` parameter.

          ```python
          >>> cropped_dataset = dataset.crop(mask=mask)

          ```
        - Check the cropped dataset:

          ```python
          >>> print(cropped_dataset.shape)
          (4, 2, 2)
          >>> print(cropped_dataset.geotransform)
          (0.1, 0.05, 0.0, -0.1, 0.0, -0.05)
          >>> print(cropped_dataset.read_array(band=0))# doctest: +SKIP
          [[0.00921161 0.90841171]
           [0.355636   0.18650262]]
          >>> print(arr[0, 2:4, 2:4])# doctest: +SKIP
          [[0.00921161 0.90841171]
           [0.355636   0.18650262]]

          ```
        - Crop a raster using another raster mask:

          - Create a mask dataset with the same extent of the polygon we used in the previous example.

          ```python
          >>> geotransform = (0.1, 0.05, 0.0, -0.1, 0.0, -0.05)
          >>> mask_dataset = Dataset.create_from_array(np.random.rand(2, 2), geo=geotransform, epsg=4326)

          ```
        - Then use the mask dataset to crop the dataset.

          ```python
          >>> cropped_dataset_2 = dataset.crop(mask=mask_dataset)
          >>> print(cropped_dataset_2.shape)
          (4, 2, 2)

          ```
        - Check the cropped dataset:

          ```python
          >>> print(cropped_dataset_2.geotransform)
          (0.1, 0.05, 0.0, -0.1, 0.0, -0.05)
          >>> print(cropped_dataset_2.read_array(band=0))# doctest: +SKIP
          [[0.00921161 0.90841171]
           [0.355636   0.18650262]]
          >>> print(arr[0, 2:4, 2:4])# doctest: +SKIP
           [[0.00921161 0.90841171]
           [0.355636   0.18650262]]

          ```

    """
    # dispatch on the mask type: polygons are cropped by warping, rasters by a raster mask
    if isinstance(mask, GeoDataFrame):
        cropped = self._crop_with_polygon_warp(mask, touch=touch)
    elif isinstance(mask, Dataset):
        cropped = self._crop_with_raster(mask)
    else:
        raise TypeError(
            "The second parameter: mask could be either GeoDataFrame or Dataset object"
        )

    if not inplace:
        return cropped
    # re-initialise the current object on top of the cropped raster
    self.__init__(cropped.raster)

`map_to_array_coordinates(points)` #

Convert coordinates of points to array indices.

map_to_array_coordinates locates a point with real coordinates (x, y) or (lon, lat) on the array by finding the cell indices (row, column) of the nearest cell in the raster.
The point coordinate system of the raster has to be projected to be able to calculate the distance.

Parameters:

Name	Type	Description	Default
`points`	`GeoDataFrame \| DataFrame \| FeatureCollection`	GeoDataFrame: GeoDataFrame with POINT geometry. DataFrame: DataFrame with x, y columns.	required

Returns:

Type	Description
`ndarray`	np.ndarray: Array with shape (N, 2) containing the row and column indices in the array.

Examples:

Create Dataset consisting of 2 bands, 10 rows, 10 columns, at the point lon/lat (0, 0).

>>> import numpy as np
>>> import pandas as pd
>>> arr = np.random.randint(1, 3, size=(2, 10, 10))
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

- DataFrame with x, y columns:

We can give the function a DataFrame with x, y columns to get the array indices of the points that are located within the dataset domain.

>>> points = pd.DataFrame({"x": [0.025, 0.175, 0.375], "y": [0.025, 0.225, 0.125]})
>>> indices = dataset.map_to_array_coordinates(points)
>>> print(indices)
[[0 0]
 [0 3]
 [0 7]]

- GeoDataFrame with POINT geometry:

We can give the function a GeoDataFrame with POINT geometry to get the array indices of the points that are located within the dataset domain.

>>> from shapely.geometry import Point
>>> from geopandas import GeoDataFrame
>>> points = GeoDataFrame({"geometry": [Point(0.025, 0.025), Point(0.175, 0.225), Point(0.375, 0.125)]})
>>> indices = dataset.map_to_array_coordinates(points)
>>> print(indices)
[[0 0]
 [0 3]
 [0 7]]

Source code in pyramids/dataset.py

def map_to_array_coordinates(
    self,
    points: Union[GeoDataFrame, FeatureCollection, DataFrame],
) -> np.ndarray:
    """Convert coordinates of points to array indices.

    - map_to_array_coordinates locates a point with real coordinates (x, y) or (lon, lat) on the array by finding
        the cell indices (row, column) of the nearest cell in the raster.
    - The point coordinate system of the raster has to be projected to be able to calculate the distance.

    Args:
        points (GeoDataFrame | pandas.DataFrame | FeatureCollection):
            - GeoDataFrame: GeoDataFrame with POINT geometry.
            - DataFrame: DataFrame with x, y columns.

    Returns:
        np.ndarray:
            Array with shape (N, 2) containing the row and column indices in the array.

    Raises:
        ValueError:
            If `points` is a DataFrame that lacks an `x` or a `y` column.
        TypeError:
            If `points` is not a GeoDataFrame, DataFrame or FeatureCollection.

    Examples:
        - Create `Dataset` consisting of 2 bands, 10 rows, 10 columns, at the point lon/lat (0, 0).

          ```python
          >>> import numpy as np
          >>> import pandas as pd
          >>> arr = np.random.randint(1, 3, size=(2, 10, 10))
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

          ```
        - DataFrame with x, y columns:

          - We can give the function a DataFrame with x, y columns to get the array indices of the points that are located within the dataset domain.

          ```python
          >>> points = pd.DataFrame({"x": [0.025, 0.175, 0.375], "y": [0.025, 0.225, 0.125]})
          >>> indices = dataset.map_to_array_coordinates(points)
          >>> print(indices)
          [[0 0]
           [0 3]
           [0 7]]

          ```
        - GeoDataFrame with POINT geometry:

          - We can give the function a GeoDataFrame with POINT geometry to get the array indices of the points that are located within the dataset domain.

          ```python
          >>> from shapely.geometry import Point
          >>> from geopandas import GeoDataFrame
          >>> points = GeoDataFrame({"geometry": [Point(0.025, 0.025), Point(0.175, 0.225), Point(0.375, 0.125)]})
          >>> indices = dataset.map_to_array_coordinates(points)
          >>> print(indices)
          [[0 0]
           [0 3]
           [0 7]]

          ```
    """
    # GeoDataFrame is tested before DataFrame so that it is wrapped in a FeatureCollection
    if isinstance(points, GeoDataFrame):
        points = FeatureCollection(points)
    elif isinstance(points, DataFrame):
        # both columns are required: reject the input when either of them is missing
        if any(elem not in points.columns for elem in ["x", "y"]):
            raise ValueError(
                "If the input is a DataFrame, it should have two columns x, and y"
            )
    elif not isinstance(points, FeatureCollection):
        raise TypeError(
            "please check points input it should be GeoDataFrame/DataFrame/FeatureCollection - given"
            f" {type(points)}"
        )

    if isinstance(points, DataFrame):
        points = points.loc[:, ["x", "y"]].values
    else:
        # get the x, y coordinates.
        points.xy()
        points = points.feature.loc[:, ["x", "y"]].values

    # since the first row is x-coords so the first column in the indices is the column index
    indices = locate_values(points, self.x, self.y)
    # rearrange the columns to make the row index first
    indices = indices[:, [1, 0]]
    return indices

`array_to_map_coordinates(rows_index, column_index, center=False)` #

Convert array indices to map coordinates.

array_to_map_coordinates converts the array indices (rows, cols) to real coordinates (x, y) or (lon, lat).

Parameters:

Name	Type	Description	Default
`rows_index`	`List[Number] \| ndarray`	The row indices of the cells in the raster array.	required
`column_index`	`List[Number] \| ndarray`	The column indices of the cells in the raster array.	required
`center`	`bool`	If True, the coordinates will be the center of the cell. Default is False.	`False`

Returns:

Type	Description
`Tuple[List[Number], List[Number]]`	Tuple[List[Number], List[Number]]: A tuple of two lists: the x coordinates and the y coordinates of the cells.

Examples:

Create Dataset consisting of 1 band, 10 rows, 10 columns, at the point lon/lat (0, 0):

>>> import numpy as np
>>> import pandas as pd
>>> arr = np.random.randint(1, 3, size=(10, 10))
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

Now call the function with two lists of row and column indices:

>>> rows_index = [1, 3, 5]
>>> column_index = [2, 4, 6]
>>> coords = dataset.array_to_map_coordinates(rows_index, column_index)
>>> print(coords) # doctest: +SKIP
([0.1, 0.2, 0.3], [-0.05, -0.15, -0.25])

Source code in pyramids/dataset.py

def array_to_map_coordinates(
    self,
    rows_index: Union[List[Number], np.ndarray],
    column_index: Union[List[Number], np.ndarray],
    center: bool = False,
) -> Tuple[List[Number], List[Number]]:
    """Translate array indices into map coordinates.

    The row/column indices of raster cells are converted to real-world coordinates (x, y) or (lon, lat) using
    the dataset's top-left corner and cell size.

    Args:
        rows_index (List[Number] | np.ndarray):
            Row indices of the cells in the raster array.
        column_index (List[Number] | np.ndarray):
            Column indices of the cells in the raster array.
        center (bool):
            If True, the returned coordinates refer to the center of each cell; otherwise they refer to the
            top-left corner of each cell. Default is False.

    Returns:
        Tuple[List[Number], List[Number]]:
            A tuple of two lists: the x coordinates (one per entry in `column_index`) and the y coordinates
            (one per entry in `rows_index`).

    Examples:
        - Create `Dataset` consisting of 1 band, 10 rows, 10 columns, at the point lon/lat (0, 0):

          ```python
          >>> import numpy as np
          >>> import pandas as pd
          >>> arr = np.random.randint(1, 3, size=(10, 10))
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

          ```

        - Now call the function with two lists of row and column indices:

          ```python
          >>> rows_index = [1, 3, 5]
          >>> column_index = [2, 4, 6]
          >>> coords = dataset.array_to_map_coordinates(rows_index, column_index)
          >>> print(coords) # doctest: +SKIP
          ([0.1, 0.2, 0.3], [-0.05, -0.15, -0.25])

          ```
    """
    origin_x, origin_y = self.top_left_corner
    step = self.cell_size
    if center:
        # move the reference point half a cell right and half a cell down, so every result lands on a cell center
        # instead of a cell's top-left corner.
        origin_x += step / 2
        origin_y -= step / 2

    # x increases with the column index, y decreases as the row index increases.
    x_coords = [origin_x + col * step for col in column_index]
    y_coords = [origin_y - row * step for row in rows_index]

    return x_coords, y_coords

`extract(band=None, exclude_value=None, feature=None)` #

Extract.

Extract method gets all the values in a raster, and excludes the values in the exclude_value parameter.
If the feature parameter is given, the values are extracted from the cells at the locations of the point geometries in the given feature.

Parameters:

Name	Type	Description	Default
`band`	`int`	Band index. Default is None.	`None`
`exclude_value`	`Numeric`	Values to exclude from extracted values. If the dataset is multi-band, the values in `exclude_value` will be filtered out from the first band only.	`None`
`feature`	`FeatureCollection \| GeoDataFrame`	Vector data containing point geometries at which to extract the values. Default is None.	`None`

Returns:

Type	Description
`ndarray`	np.ndarray: The extracted values from each band in the dataset will be in one row in the returned array.

Examples:

Extract all values from the dataset:

First, create a dataset with 2 bands, 4 rows and 4 columns:

>>> import numpy as np
>>> arr = np.random.randint(1, 5, size=(2, 4, 4))
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> print(dataset)
<BLANKLINE>
            Cell size: 0.05
            Dimension: 4 * 4
            EPSG: 4326
            Number of Bands: 2
            Band names: ['Band_1', 'Band_2']
            Mask: -9999.0
            Data type: int32
            File:...
<BLANKLINE>
>>> print(dataset.read_array()) # doctest: +SKIP
[[[1 3 3 4]
  [1 4 2 4]
  [2 4 2 1]
  [1 3 2 3]]
 [[3 2 1 3]
  [4 3 2 2]
  [2 2 3 4]
  [1 4 1 4]]]

Now, extract the values in the dataset:

>>> values = dataset.extract()
>>> print(values) # doctest: +SKIP
[[1 3 3 4 1 4 2 4 2 4 2 1 1 3 2 3]
 [3 2 1 3 4 3 2 2 2 2 3 4 1 4 1 4]]

Extract all the values except 2:

>>> values = dataset.extract(exclude_value=2)
>>> print(values) # doctest: +SKIP

Extract values at the location of the given point geometries:

>>> import geopandas as gpd
>>> from shapely.geometry import Point

Create the points using shapely and GeoPandas to cover the 4 cells with xmin, ymin, xmax, ymax = [0.1, -0.2, 0.2, -0.1]:

>>> points = gpd.GeoDataFrame(geometry=[Point(0.1, -0.1), Point(0.1, -0.2), Point(0.2, -0.2), Point(0.2, -0.1)],crs=4326)
>>> values = dataset.extract(feature=points)
>>> print(values) # doctest: +SKIP
[[4 3 3 4]
 [3 4 4 2]]

Source code in pyramids/dataset.py

def extract(
    self,
    band: int = None,
    exclude_value: Any = None,
    feature: Union[FeatureCollection, GeoDataFrame] = None,
) -> np.ndarray:
    """Extract values from the dataset.

    - Without a `feature`, the array is read and passed to `get_pixels2` together with the values to mask out:
      the no-data value of the first band and, when given, `exclude_value`.
    - With a `feature`, the point geometries are converted to array indices (through
      `map_to_array_coordinates`) and the values stored in those cells are returned; `exclude_value` is not
      applied in this case.

    Args:
        band (int, optional):
            Band index. Default is None.
        exclude_value (Numeric, optional):
            Values to exclude from extracted values. If the dataset is multi-band, the values in `exclude_value`
            will be filtered out from the first band only.
        feature (FeatureCollection | GeoDataFrame, optional):
            Vector data containing point geometries at which to extract the values. Default is None.

    Returns:
        np.ndarray:
            The extracted values from each band in the dataset will be in one row in the returned array.

    Examples:
        - Extract all values from the dataset:

          - First, create a dataset with 2 bands, 4 rows and 4 columns:

            ```python
            >>> import numpy as np
            >>> arr = np.random.randint(1, 5, size=(2, 4, 4))
            >>> top_left_corner = (0, 0)
            >>> cell_size = 0.05
            >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
            >>> print(dataset)
            <BLANKLINE>
                        Cell size: 0.05
                        Dimension: 4 * 4
                        EPSG: 4326
                        Number of Bands: 2
                        Band names: ['Band_1', 'Band_2']
                        Mask: -9999.0
                        Data type: int32
                        File:...
            <BLANKLINE>
            >>> print(dataset.read_array()) # doctest: +SKIP
            [[[1 3 3 4]
              [1 4 2 4]
              [2 4 2 1]
              [1 3 2 3]]
             [[3 2 1 3]
              [4 3 2 2]
              [2 2 3 4]
              [1 4 1 4]]]

            ```

          - Now, extract the values in the dataset:

            ```python
            >>> values = dataset.extract()
            >>> print(values) # doctest: +SKIP
            [[1 3 3 4 1 4 2 4 2 4 2 1 1 3 2 3]
             [3 2 1 3 4 3 2 2 2 2 3 4 1 4 1 4]]

            ```

          - Extract all the values except 2:

            ```python
            >>> values = dataset.extract(exclude_value=2)
            >>> print(values) # doctest: +SKIP

            ```

        - Extract values at the location of the given point geometries:

          ```python
          >>> import geopandas as gpd
          >>> from shapely.geometry import Point
          ```

          - Create the points using shapely and GeoPandas to cover the 4 cells with xmin, ymin, xmax, ymax = [0.1, -0.2, 0.2, -0.1]:

            ```python
            >>> points = gpd.GeoDataFrame(geometry=[Point(0.1, -0.1), Point(0.1, -0.2), Point(0.2, -0.2), Point(0.2, -0.1)],crs=4326)
            >>> values = dataset.extract(feature=points)
            >>> print(values) # doctest: +SKIP
            [[4 3 3 4]
             [3 4 4 2]]

            ```
    """
    # Optimize: the whole raster is read here even when only a handful of cells (those under the feature) are
    #  needed.
    data = self.read_array(band=band)

    # the no-data value of the first band is the reference; a missing value is represented by NaN.
    first_band_no_data = self.no_data_value[0]
    if first_band_no_data is None:
        first_band_no_data = np.nan

    if feature is not None:
        # point lookup: each row of `cells` is a (row, column) pair.
        cells = self.map_to_array_coordinates(feature)
        row_idx = cells[:, 0]
        col_idx = cells[:, 1]
        if data.ndim > 2:
            # keep the leading (band) axis and pick the cells on the last two axes.
            return data[:, row_idx, col_idx]
        return data[row_idx, col_idx]

    excluded = [first_band_no_data]
    if exclude_value is not None:
        excluded.append(exclude_value)

    return get_pixels2(data, excluded)

`overlay(classes_map, band=0, exclude_value=None)` #

Overlay.

Overlay method extracts all the values in the dataset for each class in the given class map.

Parameters:

Name	Type	Description	Default
`classes_map`	`Dataset`	Dataset object for the raster that has classes you want to overlay with the raster.	required
`band`	`int`	If the raster is multi-band, choose the band you want to overlay with the classes map. Default is 0.	`0`
`exclude_value`	`Numeric`	Values you want to exclude from extracted values. Default is None.	`None`

Returns:

Name	Type	Description
`Dict`	`Dict[List[float], List[float]]`	Dictionary with class values as keys (from the class map), and for each key a list of all the intersected values in the base map.

Examples:

Read the dataset:

>>> dataset = Dataset.read_file("examples/data/geotiff/raster-folder/MSWEP_1979.01.01.tif")
>>> dataset.plot(figsize=(6, 8)) # doctest: +SKIP

rhine-rainfall

Read the classes dataset:

>>> classes = Dataset.read_file("examples/data/geotiff/rhine-classes.tif")
>>> classes.plot(figsize=(6, 8), color_scale=4, bounds=[1,2,3,4,5,6]) # doctest: +SKIP

rhine-classes

Overlay the dataset with the classes dataset:

>>> classes_dict = dataset.overlay(classes)
>>> print(classes_dict.keys()) # doctest: +SKIP
dict_keys([1, 2, 3, 4, 5])

You can use the key 1 to get the values that overlay class 1.

Source code in pyramids/dataset.py

def overlay(
    self,
    classes_map,
    band: int = 0,
    exclude_value: Union[float, int] = None,
) -> Dict[float, List[float]]:
    """Overlay.

    Overlay method extracts all the values in the dataset for each class in the given class map.

    Args:
        classes_map (Dataset):
            Dataset object for the raster that has classes you want to overlay with the raster. It has to be
            aligned with the current dataset.
        band (int):
            If the raster is multi-band, choose the band you want to overlay with the classes map. Default is 0.
        exclude_value (Numeric, optional):
            Values you want to exclude from extracted values. Default is None.

    Returns:
        Dict:
            Dictionary with class values as keys (from the class map), and for each key a list of all the intersected
            values in the base map.

    Raises:
        AlignmentError:
            If `classes_map` is not aligned with the current dataset.

    Examples:
        - Read the dataset:

          ```python
          >>> dataset = Dataset.read_file("examples/data/geotiff/raster-folder/MSWEP_1979.01.01.tif")
          >>> dataset.plot(figsize=(6, 8)) # doctest: +SKIP

          ```

          ![rhine-rainfall](./../_images/dataset/rhine-rainfall.png)

        - Read the classes dataset:

          ```python
          >>> classes = Dataset.read_file("examples/data/geotiff/rhine-classes.tif")
          >>> classes.plot(figsize=(6, 8), color_scale=4, bounds=[1,2,3,4,5,6]) # doctest: +SKIP

          ```

          ![rhine-classes](./../_images/dataset/rhine-classes.png)

        - Overlay the dataset with the classes dataset:

          ```python
          >>> classes_dict = dataset.overlay(classes)
          >>> print(classes_dict.keys()) # doctest: +SKIP
          dict_keys([1, 2, 3, 4, 5])

          ```

        - You can use the key `1` to get the values that overlay class 1.
    """
    if not self._check_alignment(classes_map):
        raise AlignmentError(
            "The class Dataset is not aligned with the current raster, please use the method "
            "'align' to align both rasters."
        )
    arr = self.read_array(band=band)
    # use the no-data value of the band that was actually read (not always the first band); a missing
    # no-data value is represented by NaN.
    band_no_data = self.no_data_value[band]
    no_data_value = band_no_data if band_no_data is not None else np.nan
    mask = (
        [no_data_value, exclude_value]
        if exclude_value is not None
        else [no_data_value]
    )
    # indices of the cells that are not masked out.
    ind = get_indices2(arr, mask)
    classes = classes_map.read_array()
    values = {}

    # group the values of the base map by the class found at the same cell in the classes map.
    for ind_i in ind:
        key = classes[ind_i[0], ind_i[1]]
        # setdefault creates the list the first time a class is met, without scanning the existing keys.
        values.setdefault(key, []).append(arr[ind_i[0], ind_i[1]])

    return values

`get_mask(band=0)` #

Get the mask array.

Parameters:

Name	Type	Description	Default
`band`	`int`	Band index. Default is 0.	`0`

Returns:

Type	Description
`ndarray`	np.ndarray: Array of the mask. 0 value for cells out of the domain, and 255 for cells in the domain.

Source code in pyramids/dataset.py

def get_mask(self, band: int = 0) -> np.ndarray:
    """Read the mask band of a given band as an array.

    Args:
        band (int):
            Band index. Default is 0.

    Returns:
        np.ndarray:
            Array of the mask. 0 value for cells out of the domain, and 255 for cells in the domain.
    """
    # TODO: there is a CreateMaskBand method in the gdal.Dataset class, it creates a mask band for the dataset
    #   either internally or externally.
    selected_band = self._iloc(band)
    mask_band = selected_band.GetMaskBand()
    return mask_band.ReadAsArray()

`footprint(band=0, exclude_values=None)` #

Extract the real coverage of the values in a certain band.

Parameters:

Name	Type	Description	Default
`band`	`int`	Band index. Default is 0.	`0`
`exclude_values`	`Optional[List[Any]]`	List of values to be treated as no-data, i.e., the cells holding any of these values are excluded from the footprint. Example of exclude_values usage: `>>> exclude_values = [0]` This parameter is introduced particularly for rasters where the value stored in the `no_data_value` property does not match the no-data value actually stored in the band, so this option can correct this behavior.	`None`

Returns:

Name	Type	Description
`GeoDataFrame`	`Union[GeoDataFrame, None]`	geodataframe containing the polygon representing the extent of the raster. the extent column should contain a value of 2 only. if the dataset had separate polygons, each polygon will be in a separate row.

Examples:

The following raster dataset has flood depth stored in its values, and the non-flooded cells are filled with zero, so to extract the flood extent, we need to exclude the zero flood depth cells.

>>> dataset = Dataset.read_file("examples/data/geotiff/rhine-flood.tif")
>>> dataset.plot()
(<Figure size 800x800 with 2 Axes>, <Axes: >)

dataset-footprint-rhine-flood

Now, to extract the footprint of the dataset band, we need to specify the exclude_values parameter with the value of the non-flooded cells.

>>> extent = dataset.footprint(band=0, exclude_values=[0])
>>> print(extent)
   Band_1                                           geometry
0     2.0  POLYGON ((4070974.182 3181069.473, 4070974.182...
1     2.0  POLYGON ((4077674.182 3181169.473, 4077674.182...
2     2.0  POLYGON ((4091174.182 3169169.473, 4091174.182...
3     2.0  POLYGON ((4088574.182 3176269.473, 4088574.182...
4     2.0  POLYGON ((4082974.182 3167869.473, 4082974.182...
5     2.0  POLYGON ((4092274.182 3168269.473, 4092274.182...
6     2.0  POLYGON ((4072474.182 3181169.473, 4072474.182...

>>> extent.plot()
<Axes: >

dataset-footprint-rhine-flood-extent

Source code in pyramids/dataset.py

def footprint(
    self,
    band: int = 0,
    exclude_values: Optional[List[Any]] = None,
) -> Union[GeoDataFrame, None]:
    """Extract the real coverage of the values in a certain band.

    Args:
        band (int):
            Band index. Default is 0.
        exclude_values (Optional[List[Any]]):
            List of values to be treated as no-data, i.e., the cells holding any of these values are excluded
            from the footprint.

            - Example of exclude_values usage:

              ```python
              >>> exclude_values = [0]

              ```

            - This parameter is introduced particularly for rasters where the value stored in the
              `no_data_value` property does not match the no-data value actually stored in the band, so this
              option can correct this behavior.

    Returns:
        GeoDataFrame:
            - GeoDataFrame containing the polygon representing the extent of the raster. The column named after
              the band should contain a value of 2 only.
            - If the dataset had separate polygons, each polygon will be in a separate row.
            - None is returned if the whole band is full of no-data values.

    Examples:
        - The following raster dataset has flood depth stored in its values, and the non-flooded cells are filled with
          zero, so to extract the flood extent, we need to exclude the zero flood depth cells.

          ```python
          >>> dataset = Dataset.read_file("examples/data/geotiff/rhine-flood.tif")
          >>> dataset.plot()
          (<Figure size 800x800 with 2 Axes>, <Axes: >)

          ```

        ![dataset-footprint-rhine-flood](./../_images/dataset/dataset-footprint-rhine-flood.png)

        - Now, to extract the footprint of the dataset band, we need to specify the `exclude_values` parameter with the
          value of the non-flooded cells.

          ```python
          >>> extent = dataset.footprint(band=0, exclude_values=[0])
          >>> print(extent)
             Band_1                                           geometry
          0     2.0  POLYGON ((4070974.182 3181069.473, 4070974.182...
          1     2.0  POLYGON ((4077674.182 3181169.473, 4077674.182...
          2     2.0  POLYGON ((4091174.182 3169169.473, 4091174.182...
          3     2.0  POLYGON ((4088574.182 3176269.473, 4088574.182...
          4     2.0  POLYGON ((4082974.182 3167869.473, 4082974.182...
          5     2.0  POLYGON ((4092274.182 3168269.473, 4092274.182...
          6     2.0  POLYGON ((4072474.182 3181169.473, 4072474.182...

          >>> extent.plot()
          <Axes: >

          ```

        ![dataset-footprint-rhine-flood-extent](./../_images/dataset/dataset-footprint-rhine-flood-extent.png)

    """
    arr = self.read_array(band=band)
    no_data_val = self.no_data_value[band]

    # warn when the band does not contain a single no-data cell; a missing no-data value is looked up as NaN.
    if no_data_val is None:
        if not (np.isnan(arr)).any():
            self.logger.warning(
                "The nodata value stored in the raster does not exist in the raster "
                "so either the raster extent is all full of data, or the no_data_value stored in the raster is"
                " not correct"
            )
    else:
        if not (np.isclose(arr, no_data_val, rtol=0.00001)).any():
            self.logger.warning(
                "the nodata value stored in the raster does not exist in the raster "
                "so either the raster extent is all full of data, or the no_data_value stored in the raster is"
                " not correct"
            )
    # turn every excluded value into the no-data value
    if exclude_values:
        for val in exclude_values:
            try:
                # if no_data_val is None and the array is of an integer type, the assignment raises a TypeError
                # because None cannot be stored in an integer array.
                arr[np.isclose(arr, val)] = no_data_val
            except TypeError:
                # fall back to a float array, which can hold the missing value.
                arr = arr.astype(np.float32)
                arr[np.isclose(arr, val)] = no_data_val

    # replace all the valid values with 2
    if no_data_val is None:
        # check if the whole raster is full of no_data_value
        if (np.isnan(arr)).all():
            self.logger.warning("the raster is full of no_data_value")
            return None

        arr[~np.isnan(arr)] = 2
    else:
        # check if the whole raster is full of no_data_value
        if (np.isclose(arr, no_data_val, rtol=0.00001)).all():
            self.logger.warning("the raster is full of no_data_value")
            return None

        arr[~np.isclose(arr, no_data_val, rtol=0.00001)] = 2

    # `arr` holds only the selected band, so the temporary dataset has a single band: it gets the no-data value
    # of the selected band, and it is addressed with band index 0 (not the index of the band in this dataset).
    new_dataset = self.create_from_array(
        arr, geo=self.geotransform, epsg=self.epsg, no_data_value=[no_data_val]
    )
    # then convert the raster into polygon
    gdf = new_dataset.cluster2(band=0)
    # the temporary dataset names its only band "Band_1"; expose it under the name of the original band.
    gdf.rename(columns={"Band_1": self.band_names[band]}, inplace=True)

    return gdf

`normalize(array)` `staticmethod` #

Normalize numpy arrays into scale 0.0–1.0.

Parameters:

Name	Type	Description	Default
`array`	`ndarray`	Numpy array to normalize.	required

Returns:

Type	Description
`ndarray`	np.ndarray: Normalized array.

Source code in pyramids/dataset.py

@staticmethod
def normalize(array: np.ndarray) -> np.ndarray:
    """Normalize numpy arrays into scale 0.0–1.0.

    The array is rescaled with a min-max normalization: `(array - min) / (max - min)`.

    Args:
        array (np.ndarray): Numpy array to normalize.

    Returns:
        np.ndarray: Normalized array. If all the values in the array are equal (the range is zero), an array of
            zeros is returned instead of dividing by zero.
    """
    array_min = array.min()
    array_max = array.max()
    value_range = array_max - array_min
    if value_range == 0:
        # constant array: 0/0 would fill the result with NaN (and raise a RuntimeWarning), so divide by 1 and
        # get zeros instead.
        value_range = 1
    val = (array - array_min) / value_range
    return val

`get_tile(size=256)` #

Get tile.

Parameters:

Name	Type	Description	Default
`size`	`int`	Size of the window in pixels. One value is required which is used for both the x and y size. e.g., 256 means a 256x256 window. Default is 256.	`256`

Yields:

Type	Description
`ndarray`	np.ndarray: Dataset array with a shape `[band, y, x]`.

Examples:

First, we will create a dataset with 3 rows and 5 columns.

>>> import numpy as np
>>> arr = np.random.rand(3, 5)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> print(dataset)
<BLANKLINE>
            Cell size: 0.05
            Dimension: 3 * 5
            EPSG: 4326
            Number of Bands: 1
            Band names: ['Band_1']
            Mask: -9999.0
            Data type: float64
            File:...
<BLANKLINE>

>>> print(dataset.read_array())   # doctest: +SKIP
[[0.55332314 0.48364841 0.67794589 0.6901816  0.70516817]
 [0.82518332 0.75657103 0.45693945 0.44331782 0.74677865]
 [0.22231314 0.96283065 0.15201337 0.03522544 0.44616888]]

- The get_tile method splits the domain into tiles of the specified size using the _window function.

>>> tile_dimensions = list(dataset._window(2))
>>> print(tile_dimensions)
[(0, 0, 2, 2), (2, 0, 2, 2), (4, 0, 1, 2), (0, 2, 2, 1), (2, 2, 2, 1), (4, 2, 1, 1)]

So the first two chunks are 2\*2 chunks, followed by a 2\*1 chunk, then two 1\*2 chunks, and the last chunk is 1\*1.
The get_tile method returns a generator object that can be used to iterate over the smaller chunks of the data.

>>> tiles_generator = dataset.get_tile(size=2)
>>> print(tiles_generator)  # doctest: +SKIP
<generator object Dataset.get_tile at 0x00000145AA39E680>
>>> print(list(tiles_generator))  # doctest: +SKIP
[
    array([[0.55332314, 0.48364841],
           [0.82518332, 0.75657103]]),
    array([[0.67794589, 0.6901816 ],
           [0.45693945, 0.44331782]]),
    array([[0.70516817], [0.74677865]]),
    array([[0.22231314, 0.96283065]]),
    array([[0.15201337, 0.03522544]]),
    array([[0.44616888]])
]

Source code in pyramids/dataset.py

def get_tile(self, size=256) -> Generator[np.ndarray, None, None]:
    """Iterate over the dataset tile by tile.

    Args:
        size (int):
            Size of the window in pixels. One value is required which is used for both the x and y size. e.g., 256
            means a 256x256 window. Default is 256.

    Yields:
        np.ndarray:
            Dataset array with a shape `[band, y, x]`.

    Examples:
        - First, we will create a dataset with 3 rows and 5 columns.

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(3, 5)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          >>> print(dataset)
          <BLANKLINE>
                      Cell size: 0.05
                      Dimension: 3 * 5
                      EPSG: 4326
                      Number of Bands: 1
                      Band names: ['Band_1']
                      Mask: -9999.0
                      Data type: float64
                      File:...
          <BLANKLINE>

          >>> print(dataset.read_array())   # doctest: +SKIP
          [[0.55332314 0.48364841 0.67794589 0.6901816  0.70516817]
           [0.82518332 0.75657103 0.45693945 0.44331782 0.74677865]
           [0.22231314 0.96283065 0.15201337 0.03522544 0.44616888]]

          ```
        - The `get_tile` method splits the domain into tiles of the specified `size` using the `_window` function.

          ```python
          >>> tile_dimensions = list(dataset._window(2))
          >>> print(tile_dimensions)
          [(0, 0, 2, 2), (2, 0, 2, 2), (4, 0, 1, 2), (0, 2, 2, 1), (2, 2, 2, 1), (4, 2, 1, 1)]

          ```
          ![get_tile](./../_images/dataset/get_tile.png)

        - So the first two chunks are 2*2 chunks, followed by a 2*1 chunk, then two 1*2 chunks, and the last chunk
          is 1*1.
        - The `get_tile` method returns a generator object that can be used to iterate over the smaller chunks of
            the data.

          ```python
          >>> tiles_generator = dataset.get_tile(size=2)
          >>> print(tiles_generator)  # doctest: +SKIP
          <generator object Dataset.get_tile at 0x00000145AA39E680>
          >>> print(list(tiles_generator))  # doctest: +SKIP
          [
              array([[0.55332314, 0.48364841],
                     [0.82518332, 0.75657103]]),
              array([[0.67794589, 0.6901816 ],
                     [0.45693945, 0.44331782]]),
              array([[0.70516817], [0.74677865]]),
              array([[0.22231314, 0.96283065]]),
              array([[0.15201337, 0.03522544]]),
              array([[0.44616888]])
          ]

          ```
    """
    for window in self._window(size=size):
        # every window is an (x offset, y offset, x size, y size) tuple
        x_offset, y_offset, width, height = window
        # read only the cells covered by the current window
        tile = self.raster.ReadAsArray(
            xoff=x_offset, yoff=y_offset, xsize=width, ysize=height
        )
        yield tile

`cluster(lower_bound, upper_bound)` #

Group all the connected values between two bounds.

Parameters:

Name	Type	Description	Default
`lower_bound`	`Number`	Lower bound of the cluster.	required
`upper_bound`	`Number`	Upper bound of the cluster.	required

Returns:

Type	Description
`Tuple[ndarray, int, list, list]`	tuple[np.ndarray, int, list, list]: - cluster (np.ndarray): Array with integers representing the cluster number per cell. - count (int): Number of clusters in the array. - position (list[list[int, int]]): List of [row, col] indices for the position of each value. - values (list[Number]): Values stored in each cell in the cluster.

Examples:

First, we will create a dataset with 5 rows and 5 columns.

>>> import numpy as np
>>> np.random.seed(10)
>>> arr = np.random.randint(1, 5, size=(5, 5))
>>> print(arr) # doctest: +SKIP
[[2 3 3 2 3]
 [3 4 1 1 1]
 [1 3 3 2 2]
 [4 1 1 3 2]
 [2 4 2 3 2]]
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> dataset.plot(
...     color_scale=4, bounds=[1, 1.9, 4.1, 5], display_cell_value=True, num_size=12,
...     background_color_threshold=5
... )  # doctest: +SKIP

Now let's cluster the values in the dataset that are between 2 and 4.

>>> lower_value = 2
>>> upper_value = 4
>>> cluster_array, count, position, values = dataset.cluster(lower_value, upper_value)

- The first returned output is a binary array with 1 indicating that the cell value is inside the cluster, and 0 is outside.

>>> print(cluster_array)  # doctest: +SKIP
[[1. 1. 1. 1. 1.]
 [1. 1. 0. 0. 0.]
 [0. 1. 1. 1. 1.]
 [1. 0. 0. 1. 1.]
 [1. 1. 1. 1. 1.]]

- The second returned value is the number of connected clusters.

>>> print(count) # doctest: +SKIP
2

- The third returned value is the indices of the cells that belong to the cluster.

>>> print(position) # doctest: +SKIP
[[1, 0], [2, 1], [2, 2], [3, 3], [4, 3], [4, 4], [3, 4], [2, 4], [2, 3], [4, 2], [4, 1], [3, 0], [4, 0], [1, 1], [0, 2], [0, 3], [0, 4], [0, 1], [0, 0]]

- The fourth returned value is a list of the values that are in the cluster (extracted from these cells).

>>> print(values) # doctest: +SKIP
[3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 4, 4, 2, 4, 3, 2, 3, 3, 2]

Source code in pyramids/dataset.py

def cluster(
    self, lower_bound: Any, upper_bound: Any
) -> Tuple[np.ndarray, int, list, list]:
    """Group all the connected values between two bounds.

    The array is scanned row by row; every cell whose value lies within `[lower_bound, upper_bound]` and that
    has no label yet starts a new cluster, which is grown through `_group_neighbours`.

    Args:
        lower_bound (Number):
            Lower bound of the cluster.
        upper_bound (Number):
            Upper bound of the cluster.

    Returns:
        tuple[np.ndarray, int, list, list]:
            - cluster (np.ndarray):
                Array with the cluster number per cell (0 for cells that do not belong to any cluster).
            - count (int):
                The cluster counter: it starts at 1 and is incremented after every cluster, so it equals the
                number of clusters found plus one.
            - position (list[list[int, int]]):
                List of [row, col] indices for the position of each value.
            - values (list[Number]):
                Values stored in each cell in the cluster.

    Examples:
        - First, we will create a dataset with 5 rows and 5 columns.

          ```python
          >>> import numpy as np
          >>> np.random.seed(10)
          >>> arr = np.random.randint(1, 5, size=(5, 5))
          >>> print(arr) # doctest: +SKIP
          [[2 3 3 2 3]
           [3 4 1 1 1]
           [1 3 3 2 2]
           [4 1 1 3 2]
           [2 4 2 3 2]]
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          >>> dataset.plot(
          ...     color_scale=4, bounds=[1, 1.9, 4.1, 5], display_cell_value=True, num_size=12,
          ...     background_color_threshold=5
          ... )  # doctest: +SKIP

          ```
          ![cluster](./../_images/dataset/cluster.png)

        - Now let's cluster the values in the dataset that are between 2 and 4.

          ```python
          >>> lower_value = 2
          >>> upper_value = 4
          >>> cluster_array, count, position, values = dataset.cluster(lower_value, upper_value)

          ```
        - The first returned output is an array with the cluster number of each cell; here all the cells within
          the bounds belong to cluster 1, and 0 marks the cells outside the cluster.

          ```python
          >>> print(cluster_array)  # doctest: +SKIP
          [[1. 1. 1. 1. 1.]
           [1. 1. 0. 0. 0.]
           [0. 1. 1. 1. 1.]
           [1. 0. 0. 1. 1.]
           [1. 1. 1. 1. 1.]]

          ```
        - The second returned value is the cluster counter (number of connected clusters plus one).

          ```python
          >>> print(count) # doctest: +SKIP
          2

          ```
        - The third returned value is the indices of the cells that belong to the cluster.

          ```python
          >>> print(position) # doctest: +SKIP
          [[1, 0], [2, 1], [2, 2], [3, 3], [4, 3], [4, 4], [3, 4], [2, 4], [2, 3], [4, 2], [4, 1], [3, 0], [4, 0], [1, 1], [0, 2], [0, 3], [0, 4], [0, 1], [0, 0]]

          ```
        - The fourth returned value is a list of the values that are in the cluster (extracted from these cells).

          ```python
          >>> print(values) # doctest: +SKIP
          [3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 4, 4, 2, 4, 3, 2, 3, 3, 2]

          ```

    """
    data = self.read_array()
    n_rows, n_cols = data.shape[0], data.shape[1]
    labels = np.zeros(shape=(n_rows, n_cols))
    position = []
    values = []
    count = 1

    for row in range(n_rows):
        for col in range(n_cols):
            # skip the cells outside the bounds
            if not (lower_bound <= data[row, col] <= upper_bound):
                continue
            # skip the cells that already carry a cluster number
            if labels[row, col] != 0:
                continue

            # grow the cluster from this seed cell; the helper receives (and may update in place) the position
            # list, the values list and the labels array.
            self._group_neighbours(
                data,
                row,
                col,
                lower_bound,
                upper_bound,
                position,
                values,
                count,
                labels,
            )
            # register the seed cell itself in case the helper left it unlabelled
            if labels[row, col] == 0:
                position.append([row, col])
                values.append(data[row, col])
                labels[row, col] = count
            count += 1

    return labels, count, position, values

`cluster2(band=None)` #

Cluster the connected equal cells into polygons.

Creates vector polygons for all connected regions of pixels in the raster sharing a common pixel value (group neighboring cells with the same value into one polygon).

Parameters:

Name	Type	Description	Default
`band`	`int \| List[int] \| None`	Band index 0, 1, 2, 3, …	`None`

Returns:

Name	Type	Description
`GeoDataFrame`	`GeoDataFrame`	GeoDataFrame containing polygon geometries for all connected regions.

Examples:

First, we will create a 10*10 dataset full of random integers between 1 and 4 (`np.random.randint(1, 5)` excludes the upper bound).

>>> import numpy as np
>>> np.random.seed(200)
>>> arr = np.random.randint(1, 5, size=(10, 10))
>>> print(arr)  # doctest: +SKIP
[[3 2 1 1 3 4 1 4 2 3]
 [4 2 2 4 3 3 1 2 4 4]
 [4 2 4 2 3 4 2 1 4 3]
 [3 2 1 4 3 3 4 1 1 4]
 [1 2 4 2 2 1 3 2 3 1]
 [1 4 4 4 1 1 4 2 1 1]
 [1 3 2 3 3 4 1 3 1 3]
 [4 1 3 3 3 4 1 4 1 1]
 [2 1 3 3 4 2 2 1 3 4]
 [2 3 2 2 4 2 1 3 2 2]]
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

Now, let's cluster the connected equal cells into polygons.

>>> gdf = dataset.cluster2()
>>> print(gdf)  # doctest: +SKIP
    Band_1                                           geometry
0        3  POLYGON ((0 0, 0 -0.05, 0.05 -0.05, 0.05 0, 0 0))
1        1  POLYGON ((0.1 0, 0.1 -0.05, 0.2 -0.05, 0.2 0, ...
2        4  POLYGON ((0.25 0, 0.25 -0.05, 0.3 -0.05, 0.3 0...
3        4  POLYGON ((0.35 0, 0.35 -0.05, 0.4 -0.05, 0.4 0...
4        2  POLYGON ((0.4 0, 0.4 -0.05, 0.45 -0.05, 0.45 0...
5        3  POLYGON ((0.45 0, 0.45 -0.05, 0.5 -0.05, 0.5 0...
6        1  POLYGON ((0.3 0, 0.3 -0.1, 0.35 -0.1, 0.35 0, ...
7        4  POLYGON ((0.15 -0.05, 0.15 -0.1, 0.2 -0.1, 0.2...
8        2  POLYGON ((0.35 -0.05, 0.35 -0.1, 0.4 -0.1, 0.4...
9        4  POLYGON ((0 -0.05, 0 -0.15, 0.05 -0.15, 0.05 -...
10       4  POLYGON ((0.4 -0.05, 0.4 -0.15, 0.45 -0.15, 0....
11       4  POLYGON ((0.1 -0.1, 0.1 -0.15, 0.15 -0.15, 0.1...

Source code in pyramids/dataset.py

def cluster2(
    self,
    band: Union[int, List[int]] = None,
) -> GeoDataFrame:
    """Polygonize a band by grouping connected cells that share the same value.

    - Every connected region of pixels holding a common value is turned into one polygon
        (neighboring cells with the same value are merged into a single geometry).

    Args:
        band (int | List[int] | None):
            Zero-based band index (0, 1, 2, 3, …). When None, the first band (index 0) is used.

    Returns:
        GeoDataFrame:
            GeoDataFrame containing polygon geometries for all connected regions.

    Examples:
        - First, we will create a 10*10 dataset full of random integers between 1 and 4 (the upper bound of
          `np.random.randint` is exclusive).

          ```python
          >>> import numpy as np
          >>> np.random.seed(200)
          >>> arr = np.random.randint(1, 5, size=(10, 10))
          >>> print(arr)  # doctest: +SKIP
          [[3 2 1 1 3 4 1 4 2 3]
           [4 2 2 4 3 3 1 2 4 4]
           [4 2 4 2 3 4 2 1 4 3]
           [3 2 1 4 3 3 4 1 1 4]
           [1 2 4 2 2 1 3 2 3 1]
           [1 4 4 4 1 1 4 2 1 1]
           [1 3 2 3 3 4 1 3 1 3]
           [4 1 3 3 3 4 1 4 1 1]
           [2 1 3 3 4 2 2 1 3 4]
           [2 3 2 2 4 2 1 3 2 2]]
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

          ```

        - Now, let's cluster the connected equal cells into polygons.

          ```python
          >>> gdf = dataset.cluster2()
          >>> print(gdf)  # doctest: +SKIP
              Band_1                                           geometry
          0        3  POLYGON ((0 0, 0 -0.05, 0.05 -0.05, 0.05 0, 0 0))
          1        1  POLYGON ((0.1 0, 0.1 -0.05, 0.2 -0.05, 0.2 0, ...
          2        4  POLYGON ((0.25 0, 0.25 -0.05, 0.3 -0.05, 0.3 0...
          3        4  POLYGON ((0.35 0, 0.35 -0.05, 0.4 -0.05, 0.4 0...
          4        2  POLYGON ((0.4 0, 0.4 -0.05, 0.45 -0.05, 0.45 0...
          5        3  POLYGON ((0.45 0, 0.45 -0.05, 0.5 -0.05, 0.5 0...
          6        1  POLYGON ((0.3 0, 0.3 -0.1, 0.35 -0.1, 0.35 0, ...
          7        4  POLYGON ((0.15 -0.05, 0.15 -0.1, 0.2 -0.1, 0.2...
          8        2  POLYGON ((0.35 -0.05, 0.35 -0.1, 0.4 -0.1, 0.4...
          9        4  POLYGON ((0 -0.05, 0 -0.15, 0.05 -0.15, 0.05 -...
          10       4  POLYGON ((0.4 -0.05, 0.4 -0.15, 0.45 -0.15, 0....
          11       4  POLYGON ((0.1 -0.1, 0.1 -0.15, 0.15 -0.15, 0.1...

          ```

    """
    # No band requested: fall back to the first band.
    band_index = 0 if band is None else band

    # NOTE(review): the annotation allows a list of indices, but the value is used
    # directly to index `band_names` — confirm whether list input is really supported.
    column_name = self.band_names[band_index]

    # The band name is passed along with the index to the polygonizing helper.
    return self._band_to_polygon(band_index, column_name)

`create_overviews(resampling_method='nearest', overview_levels=None)` #

Create overviews for the dataset.

Parameters:

Name	Type	Description	Default
`resampling_method`	`str`	The resampling method used to create the overviews. Possible values are "NEAREST", "CUBIC", "AVERAGE", "GAUSS", "CUBICSPLINE", "LANCZOS", "MODE", "AVERAGE_MAGPHASE", "RMS", "BILINEAR". Defaults to "nearest".	`'nearest'`
`overview_levels`	`list`	The overview levels. Restricted to typical power-of-two reduction factors. Defaults to [2, 4, 8, 16, 32].	`None`

Returns:

Name	Type	Description
`None`	`None`	Creates internal or external overviews depending on the dataset access mode. See Notes.

Notes

External (.ovr file): If the dataset is read with read_only=True then the overviews file will be created as an external .ovr file in the same directory of the dataset.
Internal: If the dataset is read with read_only=False then the overviews will be created internally in the dataset, and the dataset needs to be saved/flushed to persist the changes to disk.
You can check the count per band via the overview_count property.

Examples:

Create a Dataset with 4 bands, 10 rows, 10 columns, at the point lon/lat (0, 0):

>>> import numpy as np
>>> arr = np.random.rand(4, 10, 10)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

- Now, create overviews using the default parameters:

>>> dataset.create_overviews()
>>> print(dataset.overview_count)  # doctest: +SKIP
[4, 4, 4, 4]

- For each band, there are 4 overview levels you can use to plot the bands:

>>> dataset.plot(band=0, overview=True, overview_index=0) # doctest: +SKIP

However, the dataset originally is 10*10, but the first overview level (2) displays half of the cells by aggregating all the cells using the nearest neighbor. The second level displays only 3 cells in each:

>>> dataset.plot(band=0, overview=True, overview_index=1)   # doctest: +SKIP

For the third overview level:

>>> dataset.plot(band=0, overview=True, overview_index=2)       # doctest: +SKIP

See Also

Dataset.recreate_overviews: Recreate the dataset overviews if they exist
Dataset.get_overview: Get an overview of a band
Dataset.overview_count: Number of overviews
Dataset.read_overview_array: Read overview values
Dataset.plot: Plot a band

Source code in pyramids/dataset.py

def create_overviews(
    self, resampling_method: str = "nearest", overview_levels: list = None
) -> None:
    """Build overviews (reduced-resolution copies) for the dataset.

    Args:
        resampling_method (str):
            The resampling method used to create the overviews. Possible values are
            "NEAREST", "CUBIC", "AVERAGE", "GAUSS", "CUBICSPLINE", "LANCZOS", "MODE",
            "AVERAGE_MAGPHASE", "RMS", "BILINEAR". The check is case-insensitive. Defaults to "nearest".
        overview_levels (list, optional):
            The overview levels (reduction factors), e.g., 2 means the overview has half the resolution of the
            original dataset. Restricted to typical power-of-two reduction factors. Defaults to
            [2, 4, 8, 16, 32].

    Returns:
        None:
            Creates internal or external overviews depending on the dataset access mode. See Notes.

    Raises:
        TypeError:
            If `overview_levels` is given but is not a list.
        ValueError:
            If `overview_levels` contains a level outside the allowed reduction factors, or if
            `resampling_method` is not one of the allowed values.

    Notes:
        - External (.ovr file): If the dataset is read with `read_only=True` then the overviews file will be created
          as an external .ovr file in the same directory of the dataset.
        - Internal: If the dataset is read with `read_only=False` then the overviews will be created internally in
          the dataset, and the dataset needs to be saved/flushed to persist the changes to disk.
        - You can check the count per band via the `overview_count` property.

    Examples:
        - Create a Dataset with 4 bands, 10 rows, 10 columns, at the point lon/lat (0, 0):

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(4, 10, 10)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

          ```
        - Now, create overviews using the default parameters:

          ```python
          >>> dataset.create_overviews()
          >>> print(dataset.overview_count)  # doctest: +SKIP
          [4, 4, 4, 4]

          ```
        - For each band, there are 4 overview levels you can use to plot the bands:

          ```python
          >>> dataset.plot(band=0, overview=True, overview_index=0) # doctest: +SKIP

          ```
          ![overviews-level-0](./../_images/dataset/overviews-level-0.png)

        - However, the dataset originally is 10*10, but the first overview level (2) displays half of the cells by
          aggregating all the cells using the nearest neighbor. The second level displays only 3 cells in each:

          ```python
          >>> dataset.plot(band=0, overview=True, overview_index=1)   # doctest: +SKIP

          ```
          ![overviews-level-1](./../_images/dataset/overviews-level-1.png)

        - For the third overview level:

          ```python
          >>> dataset.plot(band=0, overview=True, overview_index=2)       # doctest: +SKIP

          ```
          ![overviews-level-2](./../_images/dataset/overviews-level-2.png)

    See Also:
        - Dataset.recreate_overviews: Recreate the dataset overviews if they exist
        - Dataset.get_overview: Get an overview of a band
        - Dataset.overview_count: Number of overviews
        - Dataset.read_overview_array: Read overview values
        - Dataset.plot: Plot a band
    """
    # Resolve and validate the reduction factors first.
    if overview_levels is None:
        overview_levels = OVERVIEW_LEVELS
    elif not isinstance(overview_levels, list):
        raise TypeError("overview_levels should be a list")
    elif any(level not in OVERVIEW_LEVELS for level in overview_levels):
        # Only the predefined reduction factors are accepted.
        raise ValueError(
            "overview_levels are restricted to the typical power-of-two reduction factors "
            "(like 2, 4, 8, 16, etc.)"
        )

    # The method name is compared case-insensitively against the allowed list.
    method_is_known = resampling_method.upper() in RESAMPLING_METHODS
    if not method_is_known:
        raise ValueError(f"resampling_method should be one of {RESAMPLING_METHODS}")

    # Build the overviews with the requested resampling method and reduction factors;
    # the method string is forwarded exactly as the caller supplied it.
    self.raster.BuildOverviews(resampling_method, overview_levels)

`recreate_overviews(resampling_method='nearest')` #

Recreate overviews for the dataset.

Parameters:

Name	Type	Description	Default
`resampling_method`	`str`	Resampling method used to recreate overviews. Possible values are "NEAREST", "CUBIC", "AVERAGE", "GAUSS", "CUBICSPLINE", "LANCZOS", "MODE", "AVERAGE_MAGPHASE", "RMS", "BILINEAR". Defaults to "nearest".	`'nearest'`

Raises:

Type	Description
`ValueError`	If resampling_method is not one of the allowed values above.
`ReadOnlyError`	If overviews are internal and the dataset is opened read-only. Read with read_only=False.

See Also

Dataset.create_overviews: Create the dataset overviews.
Dataset.get_overview: Get an overview of a band.
Dataset.overview_count: Number of overviews.
Dataset.read_overview_array: Read overview values.
Dataset.plot: Plot a band.

Source code in pyramids/dataset.py

def recreate_overviews(self, resampling_method: str = "nearest"):
    """Recreate (regenerate) the existing overviews of every band in the dataset.

    Args:
        resampling_method (str): Resampling method used to recreate overviews. Possible values are
            "NEAREST", "CUBIC", "AVERAGE", "GAUSS", "CUBICSPLINE", "LANCZOS", "MODE",
            "AVERAGE_MAGPHASE", "RMS", "BILINEAR". The check is case-insensitive. Defaults to "nearest".

    Raises:
        ValueError:
            If resampling_method is not one of the allowed values above.
        ReadOnlyError:
            If regenerating an overview fails with a RuntimeError, which is treated as the dataset being
            opened read-only. Read with read_only=False. The original RuntimeError is attached as the
            cause of the raised error.

    See Also:
        - Dataset.create_overviews: Create the dataset overviews.
        - Dataset.get_overview: Get an overview of a band.
        - Dataset.overview_count: Number of overviews.
        - Dataset.read_overview_array: Read overview values.
        - Dataset.plot: Plot a band.
    """
    if resampling_method.upper() not in RESAMPLING_METHODS:
        raise ValueError(f"resampling_method should be one of {RESAMPLING_METHODS}")

    # Regenerate every existing overview of every band with the requested resampling method.
    try:
        for band_index in range(self.band_count):
            band = self._iloc(band_index)
            for level_index in range(self.overview_count[band_index]):
                ovr = self.get_overview(band_index, level_index)
                # TODO: if this method takes a long time, we can use the gdal.RegenerateOverviews() method
                #  which is faster but it does not give the option to choose the resampling method. and the
                #  overviews has to be given to the function as a list.
                #  overviews = [band.GetOverview(i) for i in range(band.GetOverviewCount())]
                #  band.RegenerateOverviews(overviews) or gdal.RegenerateOverviews(overviews)
                gdal.RegenerateOverview(band, ovr, resampling_method)
    except RuntimeError as error:
        # Chain explicitly so the underlying GDAL failure stays visible as the cause.
        raise ReadOnlyError(
            "The Dataset is opened with a read only. Please read the dataset using read_only=False"
        ) from error

`get_overview(band=0, overview_index=0)` #

Get an overview of a band.

Parameters:

Name	Type	Description	Default
`band`	`int`	The band index. Defaults to 0.	`0`
`overview_index`	`int`	Index of the overview. Defaults to 0.	`0`

Returns:

Type	Description
`Band`	gdal.Band: GDAL band object.

Examples:

Create Dataset consisting of 4 bands, 10 rows, 10 columns, at lon/lat (0, 0):

>>> import numpy as np
>>> arr = np.random.randint(1, 10, size=(4, 10, 10))
>>> print(arr[0, :, :]) # doctest: +SKIP
array([[6, 3, 3, 7, 4, 8, 4, 3, 8, 7],
       [6, 7, 3, 7, 8, 6, 3, 4, 3, 8],
       [5, 8, 9, 6, 7, 7, 5, 4, 6, 4],
       [2, 9, 9, 5, 8, 4, 9, 6, 8, 7],
       [5, 8, 3, 9, 1, 5, 7, 9, 5, 9],
       [8, 3, 7, 2, 2, 5, 2, 8, 7, 7],
       [1, 1, 4, 2, 2, 2, 6, 5, 9, 2],
       [6, 3, 2, 9, 8, 8, 1, 9, 7, 7],
       [4, 1, 3, 1, 6, 7, 5, 4, 8, 7],
       [9, 7, 2, 1, 4, 6, 1, 2, 3, 3]], dtype=int32)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

Now, create overviews using the default parameters and inspect them:

>>> dataset.create_overviews()
>>> print(dataset.overview_count)  # doctest: +SKIP
[4, 4, 4, 4]

>>> ovr = dataset.get_overview(band=0, overview_index=0)
>>> print(ovr)  # doctest: +SKIP
<osgeo.gdal.Band; proxy of <Swig Object of type 'GDALRasterBandShadow *' at 0x0000017E2B5AF1B0> >
>>> ovr.ReadAsArray()  # doctest: +SKIP
array([[6, 3, 4, 4, 8],
       [5, 9, 7, 5, 6],
       [5, 3, 1, 7, 5],
       [1, 4, 2, 6, 9],
       [4, 3, 6, 5, 8]], dtype=int32)
>>> ovr = dataset.get_overview(band=0, overview_index=1)
>>> ovr.ReadAsArray()  # doctest: +SKIP
array([[6, 7, 3],
       [2, 5, 6],
       [6, 9, 9]], dtype=int32)
>>> ovr = dataset.get_overview(band=0, overview_index=2)
>>> ovr.ReadAsArray()  # doctest: +SKIP
array([[6, 8],
       [8, 5]], dtype=int32)
>>> ovr = dataset.get_overview(band=0, overview_index=3)
>>> ovr.ReadAsArray()  # doctest: +SKIP
array([[6]], dtype=int32)

See Also

Dataset.create_overviews: Create the dataset overviews.
Dataset.recreate_overviews: Recreate the dataset overviews if they exist.
Dataset.overview_count: Number of overviews.
Dataset.read_overview_array: Read overview values.
Dataset.plot: Plot a band.

Source code in pyramids/dataset.py

def get_overview(self, band: int = 0, overview_index: int = 0) -> gdal.Band:
    """Get an overview of a band.

    Args:
        band (int):
            The band index. Defaults to 0.
        overview_index (int):
            Zero-based index of the overview. Must be between 0 and the number of overviews of the
            band minus one. Defaults to 0.

    Returns:
        gdal.Band:
            GDAL band object.

    Raises:
        ValueError:
            If the band has no overviews, or if `overview_index` is negative or not smaller than the
            number of overviews of the band.

    Examples:
        - Create `Dataset` consisting of 4 bands, 10 rows, 10 columns, at lon/lat (0, 0):

          ```python
          >>> import numpy as np
          >>> arr = np.random.randint(1, 10, size=(4, 10, 10))
          >>> print(arr[0, :, :]) # doctest: +SKIP
          array([[6, 3, 3, 7, 4, 8, 4, 3, 8, 7],
                 [6, 7, 3, 7, 8, 6, 3, 4, 3, 8],
                 [5, 8, 9, 6, 7, 7, 5, 4, 6, 4],
                 [2, 9, 9, 5, 8, 4, 9, 6, 8, 7],
                 [5, 8, 3, 9, 1, 5, 7, 9, 5, 9],
                 [8, 3, 7, 2, 2, 5, 2, 8, 7, 7],
                 [1, 1, 4, 2, 2, 2, 6, 5, 9, 2],
                 [6, 3, 2, 9, 8, 8, 1, 9, 7, 7],
                 [4, 1, 3, 1, 6, 7, 5, 4, 8, 7],
                 [9, 7, 2, 1, 4, 6, 1, 2, 3, 3]], dtype=int32)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

          ```

        - Now, create overviews using the default parameters and inspect them:

          ```python
          >>> dataset.create_overviews()
          >>> print(dataset.overview_count)  # doctest: +SKIP
          [4, 4, 4, 4]

          >>> ovr = dataset.get_overview(band=0, overview_index=0)
          >>> print(ovr)  # doctest: +SKIP
          <osgeo.gdal.Band; proxy of <Swig Object of type 'GDALRasterBandShadow *' at 0x0000017E2B5AF1B0> >
          >>> ovr.ReadAsArray()  # doctest: +SKIP
          array([[6, 3, 4, 4, 8],
                 [5, 9, 7, 5, 6],
                 [5, 3, 1, 7, 5],
                 [1, 4, 2, 6, 9],
                 [4, 3, 6, 5, 8]], dtype=int32)
          >>> ovr = dataset.get_overview(band=0, overview_index=1)
          >>> ovr.ReadAsArray()  # doctest: +SKIP
          array([[6, 7, 3],
                 [2, 5, 6],
                 [6, 9, 9]], dtype=int32)
          >>> ovr = dataset.get_overview(band=0, overview_index=2)
          >>> ovr.ReadAsArray()  # doctest: +SKIP
          array([[6, 8],
                 [8, 5]], dtype=int32)
          >>> ovr = dataset.get_overview(band=0, overview_index=3)
          >>> ovr.ReadAsArray()  # doctest: +SKIP
          array([[6]], dtype=int32)

          ```

    See Also:
        - Dataset.create_overviews: Create the dataset overviews.
        - Dataset.recreate_overviews: Recreate the dataset overviews if they exist.
        - Dataset.overview_count: Number of overviews.
        - Dataset.read_overview_array: Read overview values.
        - Dataset.plot: Plot a band.
    """
    # Keep the integer parameter intact; use a separate name for the band object.
    gdal_band = self._iloc(band)
    n_views = gdal_band.GetOverviewCount()
    if n_views == 0:
        raise ValueError(
            "The band has no overviews, please use the `create_overviews` method to build the overviews"
        )

    # A negative index would otherwise be forwarded to GDAL, which is expected to
    # return no overview instead of failing, so reject it explicitly here.
    if overview_index < 0:
        raise ValueError(
            f"overview_index should be between 0 and {n_views - 1}, given: {overview_index}"
        )

    if overview_index >= n_views:
        raise ValueError(f"overview_level should be less than {n_views}")

    # TODO:find away to create a Dataset object from the overview band and to return the Dataset object instead
    #  of the gdal band.
    return gdal_band.GetOverview(overview_index)

`read_overview_array(band=None, overview_index=0)` #

Read overview values.

- Read the values stored in a given band or overview.

Parameters:

Name	Type	Description	Default
`band`	`int \| None`	The band to read. If None and multiple bands exist, reads all bands at the given overview.	`None`
`overview_index`	`int`	Index of the overview. Defaults to 0.	`0`

Returns:

Type	Description
`ndarray`	np.ndarray: Array with the values in the raster.

Examples:

Create Dataset consisting of 4 bands, 10 rows, 10 columns, at lon/lat (0, 0):

>>> import numpy as np
>>> arr = np.random.randint(1, 10, size=(4, 10, 10))
>>> print(arr[0, :, :])     # doctest: +SKIP
array([[6, 3, 3, 7, 4, 8, 4, 3, 8, 7],
       [6, 7, 3, 7, 8, 6, 3, 4, 3, 8],
       [5, 8, 9, 6, 7, 7, 5, 4, 6, 4],
       [2, 9, 9, 5, 8, 4, 9, 6, 8, 7],
       [5, 8, 3, 9, 1, 5, 7, 9, 5, 9],
       [8, 3, 7, 2, 2, 5, 2, 8, 7, 7],
       [1, 1, 4, 2, 2, 2, 6, 5, 9, 2],
       [6, 3, 2, 9, 8, 8, 1, 9, 7, 7],
       [4, 1, 3, 1, 6, 7, 5, 4, 8, 7],
       [9, 7, 2, 1, 4, 6, 1, 2, 3, 3]], dtype=int32)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

Create overviews using the default parameters and read overview arrays:

>>> dataset.create_overviews()
>>> print(dataset.overview_count)  # doctest: +SKIP
[4, 4, 4, 4]

>>> arr = dataset.read_overview_array(band=0, overview_index=0)
>>> print(arr)  # doctest: +SKIP
array([[6, 3, 4, 4, 8],
       [5, 9, 7, 5, 6],
       [5, 3, 1, 7, 5],
       [1, 4, 2, 6, 9],
       [4, 3, 6, 5, 8]], dtype=int32)
>>> arr = dataset.read_overview_array(band=0, overview_index=1)
>>> print(arr)  # doctest: +SKIP
array([[6, 7, 3],
       [2, 5, 6],
       [6, 9, 9]], dtype=int32)
>>> arr = dataset.read_overview_array(band=0, overview_index=2)
>>> print(arr)  # doctest: +SKIP
array([[6, 8],
       [8, 5]], dtype=int32)
>>> arr = dataset.read_overview_array(band=0, overview_index=3)
>>> print(arr)  # doctest: +SKIP
array([[6]], dtype=int32)

See Also

Dataset.create_overviews: Create the dataset overviews.
Dataset.recreate_overviews: Recreate the dataset overviews if they exist.
Dataset.get_overview: Get an overview of a band.
Dataset.overview_count: Number of overviews.
Dataset.plot: Plot a band.

Source code in pyramids/dataset.py

def read_overview_array(
    self, band: int = None, overview_index: int = 0
) -> np.ndarray:
    """Read overview values.

        - Read the values stored in a given overview of one band, or of all bands.

    Args:
        band (int | None):
            The band to read. If None and multiple bands exist, reads all bands at the given overview.
            If None and the dataset has a single band, that band (index 0) is read.
        overview_index (int):
            Index of the overview. Defaults to 0.

    Returns:
        np.ndarray:
            Array with the values in the overview. Two-dimensional (rows, columns) for a single band, or
            three-dimensional (bands, rows, columns) when all bands are read.

    Raises:
        ValueError:
            If `band` is larger than the last band index, if the requested band (or, when reading all
            bands, any band) has no overviews, or if the overview lookup itself rejects the request.

    Examples:
        - Create `Dataset` consisting of 4 bands, 10 rows, 10 columns, at lon/lat (0, 0):

          ```python
          >>> import numpy as np
          >>> arr = np.random.randint(1, 10, size=(4, 10, 10))
          >>> print(arr[0, :, :])     # doctest: +SKIP
          array([[6, 3, 3, 7, 4, 8, 4, 3, 8, 7],
                 [6, 7, 3, 7, 8, 6, 3, 4, 3, 8],
                 [5, 8, 9, 6, 7, 7, 5, 4, 6, 4],
                 [2, 9, 9, 5, 8, 4, 9, 6, 8, 7],
                 [5, 8, 3, 9, 1, 5, 7, 9, 5, 9],
                 [8, 3, 7, 2, 2, 5, 2, 8, 7, 7],
                 [1, 1, 4, 2, 2, 2, 6, 5, 9, 2],
                 [6, 3, 2, 9, 8, 8, 1, 9, 7, 7],
                 [4, 1, 3, 1, 6, 7, 5, 4, 8, 7],
                 [9, 7, 2, 1, 4, 6, 1, 2, 3, 3]], dtype=int32)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

          ```

        - Create overviews using the default parameters and read overview arrays:

          ```python
          >>> dataset.create_overviews()
          >>> print(dataset.overview_count)  # doctest: +SKIP
          [4, 4, 4, 4]

          >>> arr = dataset.read_overview_array(band=0, overview_index=0)
          >>> print(arr)  # doctest: +SKIP
          array([[6, 3, 4, 4, 8],
                 [5, 9, 7, 5, 6],
                 [5, 3, 1, 7, 5],
                 [1, 4, 2, 6, 9],
                 [4, 3, 6, 5, 8]], dtype=int32)
          >>> arr = dataset.read_overview_array(band=0, overview_index=1)
          >>> print(arr)  # doctest: +SKIP
          array([[6, 7, 3],
                 [2, 5, 6],
                 [6, 9, 9]], dtype=int32)
          >>> arr = dataset.read_overview_array(band=0, overview_index=2)
          >>> print(arr)  # doctest: +SKIP
          array([[6, 8],
                 [8, 5]], dtype=int32)
          >>> arr = dataset.read_overview_array(band=0, overview_index=3)
          >>> print(arr)  # doctest: +SKIP
          array([[6]], dtype=int32)

          ```

    See Also:
        - Dataset.create_overviews: Create the dataset overviews.
        - Dataset.recreate_overviews: Recreate the dataset overviews if they exist.
        - Dataset.get_overview: Get an overview of a band.
        - Dataset.overview_count: Number of overviews.
        - Dataset.plot: Plot a band.
    """
    if band is None and self.band_count > 1:
        if any(elem == 0 for elem in self.overview_count):
            raise ValueError(
                "Some bands do not have overviews, please create overviews first"
            )
        # Size the output from the REQUESTED overview level of the first band. Sizing it
        # from level 0 makes the per-band assignment fail for any other overview_index,
        # because each level has a different shape.
        first_band_values = self.get_overview(0, overview_index).ReadAsArray()
        arr = np.empty(
            (
                self.band_count,
                first_band_values.shape[0],
                first_band_values.shape[1],
            ),
            dtype=self.numpy_dtype[0],
        )
        # The first band has already been read; reuse it instead of reading it again.
        arr[0, :, :] = first_band_values
        for i in range(1, self.band_count):
            arr[i, :, :] = self.get_overview(i, overview_index).ReadAsArray()
    else:
        if band is None:
            # Single-band dataset: read its only band.
            band = 0
        else:
            if band > self.band_count - 1:
                raise ValueError(
                    f"band index should be between 0 and {self.band_count - 1}"
                )
            if self.overview_count[band] == 0:
                raise ValueError(
                    f"band {band} has no overviews, please create overviews first"
                )
        arr = self.get_overview(band, overview_index).ReadAsArray()

    return arr

`get_band_by_color(color_name)` #

Get the band associated with a given color.

Parameters:

Name	Type	Description	Default
`color_name`	`str`	One of ['undefined', 'gray_index', 'palette_index', 'red', 'green', 'blue', 'alpha', 'hue', 'saturation', 'lightness', 'cyan', 'magenta', 'yellow', 'black', 'YCbCr_YBand', 'YCbCr_CbBand', 'YCbCr_CrBand'].	required

Returns:

Name	Type	Description
`int`	`int`	Band index, or None if no band is associated with the given color.

Examples:

Create Dataset consisting of 3 bands and assign RGB colors:

>>> arr = np.random.randint(1, 3, size=(3, 10, 10))
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> dataset.band_color = {0: 'red', 1: 'green', 2: 'blue'}

Now use get_band_by_color to know which band is the red band, for example:

>>> band_index = dataset.get_band_by_color('red')
>>> print(band_index)
0

Source code in pyramids/dataset.py

def get_band_by_color(self, color_name: str) -> Union[int, None]:
    """Get the band associated with a given color.

    Args:
        color_name (str):
            One of ['undefined', 'gray_index', 'palette_index', 'red', 'green', 'blue', 'alpha', 'hue',
            'saturation', 'lightness', 'cyan', 'magenta', 'yellow', 'black', 'YCbCr_YBand', 'YCbCr_CbBand',
            'YCbCr_CrBand'].

    Returns:
        int | None:
            Index of the first band whose color equals `color_name`, or None when no band has that color.

    Examples:
        - Create `Dataset` consisting of 3 bands and assign RGB colors:

          ```python
          >>> arr = np.random.randint(1, 3, size=(3, 10, 10))
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          >>> dataset.band_color = {0: 'red', 1: 'green', 2: 'blue'}

          ```

        - Now use `get_band_by_color` to know which band is the red band, for example:

          ```python
          >>> band_index = dataset.get_band_by_color('red')
          >>> print(band_index)
          0

          ```
    """
    # The returned index is the position of the color within the `band_color` values.
    # NOTE(review): this assumes `band_color` lists the bands in band order — confirm.
    colors = list(self.band_color.values())
    try:
        return colors.index(color_name)
    except ValueError:
        # No band carries the requested color.
        return None

`get_histogram(band=0, bins=6, min_value=None, max_value=None, include_out_of_range=False, approx_ok=False)` #

Get histogram.

Parameters:

Name	Type	Description	Default
`band`	`int`	Band index. Default is 0.	`0`
`bins`	`int`	Number of bins. Default is 6.	`6`
`min_value`	`float`	Minimum value. Default is None.	`None`
`max_value`	`float`	Maximum value. Default is None.	`None`
`include_out_of_range`	`bool`	If True, add out-of-range values into the first and last buckets. Default is False.	`False`
`approx_ok`	`bool`	If True, compute an approximate histogram by using subsampling or overviews. Default is False.	`False`

Returns:

Type	Description
`tuple[list, list[tuple[Any, Any]]]`	tuple[list, list[tuple[Any, Any]]]: Histogram values and bin edges.

Hint

The value of the histogram will be stored in an xml file by the name of the raster file with the extension of .aux.xml.

The content of the file will be like the following:

    <PAMDataset>
      <PAMRasterBand band="1">
        <Description>Band_1</Description>
        <Histograms>
          <HistItem>
            <HistMin>0</HistMin>
            <HistMax>88</HistMax>
            <BucketCount>6</BucketCount>
            <IncludeOutOfRange>0</IncludeOutOfRange>
            <Approximate>0</Approximate>
            <HistCounts>75|6|0|4|2|1</HistCounts>
          </HistItem>
        </Histograms>
      </PAMRasterBand>
    </PAMDataset>

Examples:

Create a single-band Dataset with 10 rows and 10 columns, at the point lon/lat (0, 0).

>>> import numpy as np
>>> arr = np.random.randint(1, 12, size=(10, 10))
>>> print(arr)  # doctest: +SKIP
[[ 4  1  1  2  6  9  2  5  1  8]
 [ 1 11  5  6  2  5  4  6  6  7]
 [ 5  2 10  4  8 11  4 11 11  1]
 [ 2  3  6  3  1  5 11 10 10  7]
 [ 8  2 11  3  1  3  5  4 10 10]
 [ 1  2  1  6 10  3  6  4  2  8]
 [ 9  5  7  9  7  8  1 11  4  4]
 [ 7  7  2  2  5  3  7  2  9  9]
 [ 2 10  3  2  1 11  5  9  8 11]
 [ 1  5  6 11  3  3  8  1  2  1]]
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

Now, let's get the histogram of the first band using the get_histogram method with the default parameters:

>>> hist, ranges = dataset.get_histogram(band=0)
>>> print(hist)  # doctest: +SKIP
[28, 17, 10, 15, 13, 7]
>>> print(ranges)   # doctest: +SKIP
[(1.0, 2.67), (2.67, 4.34), (4.34, 6.0), (6.0, 7.67), (7.67, 9.34), (9.34, 11.0)]

we can also exclude values from the histogram by using the min_value and max_value:

>>> hist, ranges = dataset.get_histogram(band=0, min_value=5, max_value=10)
>>> print(hist)  # doctest: +SKIP
[10, 8, 7, 7, 6, 0]
>>> print(ranges)   # doctest: +SKIP
[(1.0, 1.835), (1.835, 2.67), (2.67, 3.5), (3.5, 4.34), (4.34, 5.167), (5.167, 6.0)]

For datasets with big dimensions, computing the histogram can take some time; approximating the histogram can save a lot of computation time. When the parameter approx_ok is True, the histogram is calculated from a resampled version of the band, or from the overviews if they exist.

>>> hist, ranges = dataset.get_histogram(band=0, approx_ok=True)
>>> print(hist)  # doctest: +SKIP
[28, 17, 10, 15, 13, 7]
>>> print(ranges)   # doctest: +SKIP
[(1.0, 2.67), (2.67, 4.34), (4.34, 6.0), (6.0, 7.67), (7.67, 9.34), (9.34, 11.0)]

As you can see, for small datasets the approximate histogram is the same as the exact one.

Source code in pyramids/dataset.py

def get_histogram(
    self,
    band: int = 0,
    bins: int = 6,
    min_value: Optional[float] = None,
    max_value: Optional[float] = None,
    include_out_of_range: bool = False,
    approx_ok: bool = False,
) -> tuple[list, list[tuple[Any, Any]]]:
    """Compute the histogram of one band.

    Args:
        band (int, optional):
            Band index (zero-based). Default is 0.
        bins (int, optional):
            Number of bins (buckets). Must be at least 1. Default is 6.
        min_value (float, optional):
            Lower bound of the histogram. If None, the minimum value computed from the band is used.
            Default is None.
        max_value (float, optional):
            Upper bound of the histogram. If None, the maximum value computed from the band is used.
            Default is None.
        include_out_of_range (bool, optional):
            If True, add out-of-range values into the first and last buckets. Default is False.
        approx_ok (bool, optional):
            If True, compute an approximate histogram by using subsampling or overviews. Default is False.

    Returns:
        tuple[list, list[tuple[Any, Any]]]:
            - The histogram counts, one value per bin.
            - The `(lower, upper)` edges of each bin. The edges span the interval from `min_value` to
              `max_value` (or the band minimum/maximum when these are not given), split into `bins`
              equal-width intervals.

    Raises:
        ValueError:
            If `bins` is smaller than 1.

    Hint:
        - The value of the histogram will be stored in an xml file by the name of the raster file with the extension
            of .aux.xml.

        - The content of the file will be like the following:
          ```xml

              <PAMDataset>
                <PAMRasterBand band="1">
                  <Description>Band_1</Description>
                  <Histograms>
                    <HistItem>
                      <HistMin>0</HistMin>
                      <HistMax>88</HistMax>
                      <BucketCount>6</BucketCount>
                      <IncludeOutOfRange>0</IncludeOutOfRange>
                      <Approximate>0</Approximate>
                      <HistCounts>75|6|0|4|2|1</HistCounts>
                    </HistItem>
                  </Histograms>
                </PAMRasterBand>
              </PAMDataset>

          ```

    Examples:
        - Create a `Dataset` consisting of 1 band, 10 rows, and 10 columns, with its top-left corner at
            lon/lat (0, 0).

          ```python
          >>> import numpy as np
          >>> arr = np.random.randint(1, 12, size=(10, 10))
          >>> print(arr)    # doctest: +SKIP
          [[ 4  1  1  2  6  9  2  5  1  8]
           [ 1 11  5  6  2  5  4  6  6  7]
           [ 5  2 10  4  8 11  4 11 11  1]
           [ 2  3  6  3  1  5 11 10 10  7]
           [ 8  2 11  3  1  3  5  4 10 10]
           [ 1  2  1  6 10  3  6  4  2  8]
           [ 9  5  7  9  7  8  1 11  4  4]
           [ 7  7  2  2  5  3  7  2  9  9]
           [ 2 10  3  2  1 11  5  9  8 11]
           [ 1  5  6 11  3  3  8  1  2  1]]
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

          ```

        - Now, let's get the histogram of the first band using the `get_histogram` method with the default
            parameters:
            ```python
            >>> hist, ranges = dataset.get_histogram(band=0)
            >>> print(hist)  # doctest: +SKIP
            [28, 17, 10, 15, 13, 7]
            >>> print(ranges)   # doctest: +SKIP
            [(1.0, 2.67), (2.67, 4.34), (4.34, 6.0), (6.0, 7.67), (7.67, 9.34), (9.34, 11.0)]

            ```
        - We can also restrict the histogram to a range of values by using `min_value` and `max_value`; the
            returned bin edges then span that range:
            ```python
            >>> hist, ranges = dataset.get_histogram(band=0, min_value=5, max_value=10)
            >>> print(hist)  # doctest: +SKIP
            [10, 8, 7, 7, 6, 0]
            >>> print(ranges)   # doctest: +SKIP
            [(5.0, 5.83), (5.83, 6.67), (6.67, 7.5), (7.5, 8.33), (8.33, 9.17), (9.17, 10.0)]

            ```
        - For datasets with large dimensions, computing the histogram can take some time; computing an
            approximate histogram can save a lot of computation time. When the parameter `approx_ok` is set to
            `True`, the histogram will be calculated from resampling the band or from the overviews if they exist.
            ```python
            >>> hist, ranges = dataset.get_histogram(band=0, approx_ok=True)
            >>> print(hist)  # doctest: +SKIP
            [28, 17, 10, 15, 13, 7]
            >>> print(ranges)   # doctest: +SKIP
            [(1.0, 2.67), (2.67, 4.34), (4.34, 6.0), (6.0, 7.67), (7.67, 9.34), (9.34, 11.0)]

            ```
        - As you can see, for small datasets the approximate histogram is the same as the one computed without
            approximation.

    """
    if bins < 1:
        # Guard against a bare ZeroDivisionError / empty result further down.
        raise ValueError(f"bins must be a positive integer, given: {bins}")

    # `_iloc` is expected to return the band object for the given index;
    # it must expose `ComputeRasterMinMax` and `GetHistogram`.
    gdal_band = self._iloc(band)
    band_min, band_max = gdal_band.ComputeRasterMinMax()
    # Fall back to the band's own extremes for any bound the caller left out.
    if min_value is None:
        min_value = band_min
    if max_value is None:
        max_value = band_max

    # The bin edges must start at the effective lower bound (`min_value`), i.e.
    # the same bounds handed to `GetHistogram` below, not at the band minimum.
    bin_width = (max_value - min_value) / bins
    ranges = [
        (min_value + i * bin_width, min_value + (i + 1) * bin_width)
        for i in range(bins)
    ]

    hist = gdal_band.GetHistogram(
        min=min_value,
        max=max_value,
        buckets=bins,
        include_out_of_range=include_out_of_range,
        approx_ok=approx_ok,
    )
    return hist, ranges

`to_xyz(bands=None, path=None)` #

Convert to XYZ.

Parameters:

Name	Type	Description	Default
`path`	`str`	path to the file where the data will be saved. If None, the data will be returned as a DataFrame. default is None.	`None`
`bands`	`List[int]`	indices of the bands. If None, all bands will be used. default is None	`None`

Returns:

Type	Description
`Union[DataFrame, None]`	DataFrame/File: DataFrame with columns: lon, lat, band_1, band_2,... . If a path is provided the data will be saved to disk as a .xyz file

Examples:

First, we will create a two-band dataset from an integer array with values between 1 and 8, and then convert it to a DataFrame of coordinates and band values using to_xyz.

>>> import numpy as np
>>> arr = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
>>> print(dataset)
<BLANKLINE>
            Top Left Corner: (0.0, 0.0)
            Cell size: 0.05
            Dimension: 2 * 2
            EPSG: 4326
            Number of Bands: 2
            Band names: ['Band_1', 'Band_2']
            Band colors: {0: 'undefined', 1: 'undefined'}
            Band units: ['', '']
            Scale: [1.0, 1.0]
            Offset: [0, 0]
            Mask: -9999.0
            Data type: int64
            File: ...
<BLANKLINE>
>>> df = dataset.to_xyz()
>>> print(df)
     lon    lat  Band_1  Band_2
0  0.025 -0.025       1       5
1  0.075 -0.025       2       6
2  0.025 -0.075       3       7
3  0.075 -0.075       4       8

Source code in pyramids/dataset.py

def to_xyz(
    self, bands: Optional[List[int]] = None, path: Optional[str] = None
) -> Union[DataFrame, None]:
    """Convert the dataset to XYZ records (coordinates plus band values).

    Args:
        bands (List[int], optional):
            Zero-based indices of the bands. A single integer is also accepted. If None, all bands will be
            used. Default is None.
        path (str, optional):
            Path to the file where the data will be saved. If None, the data will be returned as a DataFrame.
            Default is None.

    Returns:
        Union[DataFrame, None]:
            If `path` is None, a DataFrame with the columns: lon, lat, followed by one column per selected
            band (named after the band). If a path is provided, the data will be saved to disk as a .xyz file
            and None is returned.

    Raises:
        ImportError:
            If the `osgeo_utils` package cannot be imported.
        ValueError:
            If `bands` is neither None, an integer, nor a list.

    Examples:
        - First, we will create a two-band dataset from an integer array with values between 1 and 8, and then
            convert it to a DataFrame using `to_xyz`.
            ```python
            >>> import numpy as np
            >>> arr = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
            >>> top_left_corner = (0, 0)
            >>> cell_size = 0.05
            >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
            >>> print(dataset)
            <BLANKLINE>
                        Top Left Corner: (0.0, 0.0)
                        Cell size: 0.05
                        Dimension: 2 * 2
                        EPSG: 4326
                        Number of Bands: 2
                        Band names: ['Band_1', 'Band_2']
                        Band colors: {0: 'undefined', 1: 'undefined'}
                        Band units: ['', '']
                        Scale: [1.0, 1.0]
                        Offset: [0, 0]
                        Mask: -9999.0
                        Data type: int64
                        File: ...
            <BLANKLINE>
            >>> df = dataset.to_xyz()
            >>> print(df)
                 lon    lat  Band_1  Band_2
            0  0.025 -0.025       1       5
            1  0.075 -0.025       2       6
            2  0.025 -0.075       3       7
            3  0.075 -0.075       4       8
            ```
    """
    try:
        from osgeo_utils import gdal2xyz
    except ImportError as exc:
        # Chain the original error so the real import failure stays visible.
        raise ImportError(
            "osgeo_utils is not installed. Install it using pip: pip install osgeo-utils"
        ) from exc

    # The band numbers handed to gdal2xyz are the zero-based indices shifted by one.
    if bands is None:
        band_nums = list(range(1, self.band_count + 1))
    elif isinstance(bands, int):
        band_nums = [bands + 1]
    elif isinstance(bands, list):
        band_nums = [band + 1 for band in bands]
    else:
        raise ValueError("bands must be an integer or a list of integers.")

    arr = gdal2xyz.gdal2xyz(
        self.raster,
        path,
        skip_nodata=True,
        return_np_arrays=True,
        band_nums=band_nums,
    )
    if path is not None:
        # The output went to `path`; there is nothing to return.
        return None

    # Map the shifted band numbers back to zero-based positions in `band_names`.
    all_names = self.band_names
    band_names = [all_names[num - 1] for num in band_nums]

    # arr[0] and arr[1] become the lon/lat columns; arr[2] is transposed so
    # that each selected band becomes one column.
    df = pd.DataFrame(columns=["lon", "lat"] + band_names)
    df["lon"] = arr[0]
    df["lat"] = arr[1]
    df[band_names] = arr[2].transpose()
    return df

Name	Type	Description	Default
`path`	`str`	Path of file to open.	required
`read_only`	`bool`	File mode, set to False, to open in "update" mode.	`True`
`file_i`	`int`	Index to the file inside the compressed file you want to read, if the compressed file has only one file. Default is 0.	`0`