Source code for sedona.spark.geopandas.geodataframe

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

from typing import Any, Literal, Callable, Union
import typing

import os
import shapely
import warnings
import numpy as np
import shapely
import geopandas as gpd
import pandas as pd
import pyspark.pandas as pspd
import sedona.spark.geopandas as sgpd
from pyspark.pandas import Series as PandasOnSparkSeries
from pyspark.pandas.frame import DataFrame as PandasOnSparkDataFrame
from pyspark.pandas.utils import log_advice

from sedona.spark.geopandas._typing import Label
from sedona.spark.geopandas.base import GeoFrame

from pandas.api.extensions import register_extension_dtype
from geopandas.geodataframe import crs_mismatch_error
from geopandas.array import GeometryDtype

register_extension_dtype(GeometryDtype)


# ============================================================================
# IMPLEMENTATION STATUS TRACKING
# ============================================================================

IMPLEMENTATION_STATUS = {
    "IMPLEMENTED": [
        "area",
        "buffer",
        "crs",
        "geometry",
        "active_geometry_name",
        "sindex",
        "rename_geometry",
        "copy",
        "sjoin",
        "to_parquet",
    ],
    "NOT_IMPLEMENTED": [
        "to_geopandas",
        "_to_geopandas",
        "geom_type",
        "type",
        "length",
        "is_valid",
        "is_valid_reason",
        "is_empty",
        "is_simple",
        "is_ring",
        "is_ccw",
        "is_closed",
        "has_z",
        "boundary",
        "centroid",
        "convex_hull",
        "envelope",
        "exterior",
        "interiors",
        "unary_union",
        "count_coordinates",
        "count_geometries",
        "count_interior_rings",
        "get_precision",
        "get_geometry",
        "concave_hull",
        "delaunay_triangles",
        "voronoi_polygons",
        "minimum_rotated_rectangle",
        "extract_unique_points",
        "offset_curve",
        "remove_repeated_points",
        "set_precision",
        "representative_point",
        "minimum_bounding_circle",
        "minimum_bounding_radius",
        "minimum_clearance",
        "normalize",
        "make_valid",
        "reverse",
        "segmentize",
        "transform",
        "force_2d",
        "force_3d",
        "line_merge",
        "union_all",
        "intersection_all",
        "contains",
        "contains_properly",
    ],
    "PARTIALLY_IMPLEMENTED": ["set_geometry"],  # Only drop=True case is not implemented
}

IMPLEMENTATION_PRIORITY = {
    "HIGH": [
        "to_geopandas",
        "_to_geopandas",
        "contains",
        "contains_properly",
        "convex_hull",
        "count_coordinates",
        "count_geometries",
        "is_ring",
        "is_closed",
        "make_valid",
    ],
    "MEDIUM": [
        "force_2d",
        "force_3d",
        "transform",
        "segmentize",
        "line_merge",
        "union_all",
        "intersection_all",
        "reverse",
        "normalize",
        "get_geometry",
    ],
    "LOW": [
        "delaunay_triangles",
        "voronoi_polygons",
        "minimum_bounding_circle",
        "representative_point",
        "extract_unique_points",
        "offset_curve",
        "minimum_rotated_rectangle",
        "concave_hull",
    ],
}


def _not_implemented_error(method_name: str, additional_info: str = "") -> str:
    """
    Generate a standardized NotImplementedError message for GeoDataFrame methods.

    Parameters
    ----------
    method_name : str
        The name of the method that is not implemented.
    additional_info : str, optional
        Additional information about the method or workarounds.

    Returns
    -------
    str
        Formatted error message.
    """
    base_message = (
        f"GeoDataFrame.{method_name}() is not implemented yet.\n"
        f"This method will be added in a future release."
    )

    if additional_info:
        base_message += f"\n\n{additional_info}"

    workaround = (
        "\n\nTemporary workaround - use GeoPandas:\n"
        "  gpd_df = sedona_gdf.to_geopandas()\n"
        f"  result = gpd_df.{method_name}(...)\n"
        "  # Note: This will collect all data to the driver."
    )

    return base_message + workaround


[docs] class GeoDataFrame(GeoFrame, pspd.DataFrame): """ A pandas-on-Spark DataFrame for geospatial data with geometry columns. GeoDataFrame extends pyspark.pandas.DataFrame to provide geospatial operations using Apache Sedona's spatial functions. It maintains compatibility with GeoPandas GeoDataFrame while operating on distributed datasets. Parameters ---------- data : dict, array-like, DataFrame, or GeoDataFrame Data to initialize the GeoDataFrame. Can be a dictionary, array-like structure, pandas DataFrame, GeoPandas GeoDataFrame, or another GeoDataFrame. geometry : str, array-like, or GeoSeries, optional Column name, array of geometries, or GeoSeries to use as the active geometry. If None, will look for existing geometry columns. crs : pyproj.CRS, optional Coordinate Reference System for the geometries. columns : Index or array-like, optional Column labels to use for the resulting frame. index : Index or array-like, optional Index to use for the resulting frame. Examples -------- >>> from shapely.geometry import Point, Polygon >>> from sedona.spark.geopandas import GeoDataFrame >>> import pandas as pd >>> >>> # Create from dictionary with geometry >>> data = { ... 'name': ['A', 'B', 'C'], ... 'geometry': [Point(0, 0), Point(1, 1), Point(2, 2)] ... } >>> gdf = GeoDataFrame(data, crs='EPSG:4326') >>> gdf name geometry 0 A POINT (0 0) 1 B POINT (1 1) 2 C POINT (2 2) >>> >>> # Spatial operations >>> buffered = gdf.buffer(0.1) >>> buffered.area 0 0.031416 1 0.031416 2 0.031416 dtype: float64 >>> >>> # Spatial joins >>> polygons = GeoDataFrame({ ... 'region': ['Region1', 'Region2'], ... 'geometry': [ ... Polygon([(-1, -1), (1, -1), (1, 1), (-1, 1)]), ... Polygon([(0.5, 0.5), (2.5, 0.5), (2.5, 2.5), (0.5, 2.5)]) ... ] ... }) >>> result = gdf.sjoin(polygons, how='left', predicate='within') >>> result['region'] 0 Region1 1 Region2 2 Region2 dtype: object Notes ----- This implementation differs from GeoPandas in several ways: - Uses Spark for distributed processing - Geometries are stored in WKB (Well-Known Binary) format internally - Some methods may have different performance characteristics - Not all GeoPandas methods are implemented yet (see IMPLEMENTATION_STATUS) Performance Considerations: - Operations are distributed across Spark cluster - Avoid converting to GeoPandas (.to_geopandas()) on large datasets - Use .sample() for testing with large datasets - Spatial joins are optimized for distributed processing Geometry Column Management: - Supports multiple geometry columns - One geometry column is designated as 'active' at a time - Active geometry is used for spatial operations and plotting - Use set_geometry() to change the active geometry column See Also -------- geopandas.GeoDataFrame : The GeoPandas equivalent sedona.spark.geopandas.GeoSeries : Series with geometry data """ def __getitem__(self, key: Any) -> Any: """ Get item from GeoDataFrame by key. Parameters ---------- key : str, list, slice, ndarray or Series - If key is a string, returns a Series for that column - If key is a list of strings, returns a new GeoDataFrame with selected columns - If key is a slice or array, returns rows in the GeoDataFrame Returns ------- Any Series, GeoDataFrame, or other objects depending on the key type. Examples -------- >>> from shapely.geometry import Point >>> from sedona.spark.geopandas import GeoDataFrame >>> >>> data = {'geometry': [Point(0, 0), Point(1, 1)], 'value': [1, 2]} >>> gdf = GeoDataFrame(data) >>> gdf['value'] 0 1 1 2 Name: value, dtype: int64 """ # Here we are getting a ps.Series with the same underlying anchor (ps.Dataframe). # This is important so we don't unnecessarily try to perform operations on different dataframes item = pspd.DataFrame.__getitem__(self, key) if isinstance(item, pspd.DataFrame): # don't specify crs=self.crs here because it might not include the geometry column # if it does include the geometry column, we don't need to set crs anyways return GeoDataFrame(item) elif isinstance(item, pspd.Series): ps_series: pspd.Series = item try: return sgpd.GeoSeries(ps_series) except TypeError: return ps_series else: raise Exception(f"Logical Error: Unexpected type: {type(item)}") _geometry_column_name = None # ============================================================================ # CONSTRUCTION AND INITIALIZATION # ============================================================================
[docs] def __init__( self, data=None, index=None, columns=None, dtype=None, copy=False, geometry: Any | None = None, crs: Any | None = None, **kwargs, ): assert data is not None self._anchor: GeoDataFrame self._col_label: Label from sedona.spark.geopandas import GeoSeries from pyspark.sql import DataFrame as SparkDataFrame if isinstance(data, (GeoDataFrame, GeoSeries)): if crs: data.crs = crs # For each of these super().__init__() calls, we let pyspark decide which inputs are valid or not # instead of calling e.g assert not dtype ourselves. # This way, if Spark adds support later, than we inherit those changes naturally super().__init__(data, index=index, columns=columns, dtype=dtype, copy=copy) elif isinstance(data, (PandasOnSparkDataFrame, SparkDataFrame)): super().__init__(data, index=index, columns=columns, dtype=dtype, copy=copy) elif isinstance(data, PandasOnSparkSeries): try: data = GeoSeries(data, crs=crs) except TypeError: pass super().__init__(data, index=index, columns=columns, dtype=dtype, copy=copy) else: # below are not distributed dataframe types if isinstance(data, gpd.GeoDataFrame): # We can use GeoDataFrame.active_geometry_name once we drop support for geopandas < 1.0.0 # Below is the equivalent, since active_geometry_name simply calls _geometry_column_name if data._geometry_column_name: # Geopandas stores crs as metadata instead of inside of the shapely objects so we must save it and set it manually later if not crs: crs = data.crs if not geometry: geometry = data.geometry.name pd_df = pd.DataFrame( data, index=index, columns=columns, dtype=dtype, copy=copy, ) # Spark complains if it's left as a geometry type geom_type_cols = pd_df.select_dtypes(include=["geometry"]).columns pd_df[geom_type_cols] = pd_df[geom_type_cols].astype(object) # initialize the parent class pyspark Dataframe with the pandas Dataframe super().__init__( data=pd_df, index=index, dtype=dtype, copy=copy, ) # Set geometry column name if isinstance(data, (GeoDataFrame, gpd.GeoDataFrame)): self._geometry_column_name = data._geometry_column_name if crs is not None and data.crs != crs: raise ValueError(crs_mismatch_error) if geometry: self.set_geometry(geometry, inplace=True, crs=crs) if geometry is None and "geometry" in self.columns: if (self.columns == "geometry").sum() > 1: raise ValueError( "GeoDataFrame does not support multiple columns " "using the geometry column name 'geometry'." ) geometry: pspd.Series = self["geometry"] if isinstance(geometry, sgpd.GeoSeries): if crs is not None: self.set_geometry(geometry, inplace=True, crs=crs) # No need to call set_geometry() here since it's already part of the df, just set the name self._geometry_column_name = "geometry" if geometry is None and crs: raise ValueError( "Assigning CRS to a GeoDataFrame without a geometry column is not " "supported. Supply geometry using the 'geometry=' keyword argument, " "or by providing a DataFrame with column name 'geometry'", )
# ============================================================================ # GEOMETRY COLUMN MANAGEMENT # ============================================================================ def _get_geometry(self) -> sgpd.GeoSeries: if self._geometry_column_name not in self: if self._geometry_column_name is None: msg = ( "You are calling a geospatial method on the GeoDataFrame, " "but the active geometry column to use has not been set. " ) else: msg = ( "You are calling a geospatial method on the GeoDataFrame, " f"but the active geometry column ('{self._geometry_column_name}') " "is not present. " ) geo_cols = list(self.columns[self.dtypes == "geometry"]) if len(geo_cols) > 0: msg += ( f"\nThere are columns with geometry data type ({geo_cols}), and " "you can either set one as the active geometry with " 'df.set_geometry("name") or access the column as a ' 'GeoSeries (df["name"]) and call the method directly on it.' ) else: msg += ( "\nThere are no existing columns with geometry data type. You can " "add a geometry column as the active geometry column with " "df.set_geometry. " ) raise MissingGeometryColumnError(msg) return self[self._geometry_column_name] def _set_geometry(self, col): # This check is included in the original geopandas. Note that this prevents assigning a str to the property # e.g. df.geometry = "geometry" # However the user can still use specify a str in the public .set_geometry() method # ie. df.geometry = "geometry1" errors, but df.set_geometry("geometry1") works if not pd.api.types.is_list_like(col): raise ValueError("Must use a list-like to set the geometry property") self.set_geometry(col, inplace=True) geometry = property( fget=_get_geometry, fset=_set_geometry, doc="Geometry data for GeoDataFrame" ) @typing.overload def set_geometry( self, col, drop: bool | None = ..., inplace: Literal[True] = ..., crs: Any | None = ..., ) -> None: ... @typing.overload def set_geometry( self, col, drop: bool | None = ..., inplace: Literal[False] = ..., crs: Any | None = ..., ) -> GeoDataFrame: ...
[docs] def set_geometry( self, col, drop: bool | None = None, inplace: bool = False, crs: Any | None = None, ) -> GeoDataFrame | None: """ Set the GeoDataFrame geometry using either an existing column or the specified input. By default yields a new object. The original geometry column is replaced with the input. Parameters ---------- col : column label or array-like An existing column name or values to set as the new geometry column. If values (array-like, (Geo)Series) are passed, then if they are named (Series) the new geometry column will have the corresponding name, otherwise the existing geometry column will be replaced. If there is no existing geometry column, the new geometry column will use the default name "geometry". drop : boolean, default False When specifying a named Series or an existing column name for `col`, controls if the previous geometry column should be dropped from the result. The default of False keeps both the old and new geometry column. .. deprecated:: 1.0.0 inplace : boolean, default False Modify the GeoDataFrame in place (do not create a new object) crs : pyproj.CRS, optional Coordinate system to use. The value can be anything accepted by :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`, such as an authority string (eg "EPSG:4326") or a WKT string. If passed, overrides both DataFrame and col's crs. Otherwise, tries to get crs from passed col values or DataFrame. Examples -------- >>> from sedona.spark.geopandas import GeoDataFrame >>> from shapely.geometry import Point >>> d = {'col1': ['name1', 'name2'], 'geometry': [Point(1, 2), Point(2, 1)]} >>> gdf = GeoDataFrame(d, crs="EPSG:4326") >>> gdf col1 geometry 0 name1 POINT (1 2) 1 name2 POINT (2 1) Passing an array: >>> df1 = gdf.set_geometry([Point(0,0), Point(1,1)]) >>> df1 col1 geometry 0 name1 POINT (0 0) 1 name2 POINT (1 1) Using existing column: >>> gdf["buffered"] = gdf.buffer(2) >>> df2 = gdf.set_geometry("buffered") >>> df2.geometry 0 POLYGON ((3 2, 2.99037 1.80397, 2.96157 1.6098... 1 POLYGON ((4 1, 3.99037 0.80397, 3.96157 0.6098... Name: buffered, dtype: geometry Returns ------- GeoDataFrame See also -------- GeoDataFrame.rename_geometry : rename an active geometry column """ # Most of the code here is taken from DataFrame.set_index() if inplace: frame = self else: frame = self.copy() geo_column_name = self._geometry_column_name new_series = False if geo_column_name is None: geo_column_name = "geometry" if isinstance( col, (pspd.Series, pd.Series, list, np.ndarray, gpd.array.GeometryArray) ): if drop: msg = ( "The `drop` keyword argument is deprecated and has no effect when " "`col` is an array-like value. You should stop passing `drop` to " "`set_geometry` when this is the case." ) warnings.warn(msg, category=FutureWarning, stacklevel=2) if isinstance(col, (pspd.Series, pd.Series)): if col.name is not None: geo_column_name = col.name level = col else: level = col.rename(geo_column_name) else: level = pspd.Series(col, name=geo_column_name) if not isinstance(level, sgpd.GeoSeries): # Set the crs later, so we can allow_override=True level = sgpd.GeoSeries(level) new_series = True elif hasattr(col, "ndim") and col.ndim > 1: raise ValueError("Must pass array with one dimension only.") else: # should be a colname try: level = frame[col] except KeyError: raise ValueError(f"Unknown column {col}") if isinstance(level, (sgpd.GeoDataFrame, gpd.GeoDataFrame)): raise ValueError( "GeoDataFrame does not support setting the geometry column where " "the column name is shared by multiple columns." ) given_colname_drop_msg = ( "The `drop` keyword argument is deprecated and in future the only " "supported behaviour will match drop=False. To silence this " "warning and adopt the future behaviour, stop providing " "`drop` as a keyword to `set_geometry`. To replicate the " "`drop=True` behaviour you should update " "your code to\n`geo_col_name = gdf.active_geometry_name;" " gdf.set_geometry(new_geo_col).drop(" "columns=geo_col_name).rename_geometry(geo_col_name)`." ) if drop is False: # specifically False, not falsy i.e. None # User supplied False explicitly, but arg is deprecated warnings.warn( given_colname_drop_msg, category=FutureWarning, stacklevel=2, ) if drop: raise NotImplementedError( _not_implemented_error( "set_geometry", "Setting geometry with drop=True parameter is not supported.", ) ) else: # if not dropping, set the active geometry name to the given col name geo_column_name = col # This operation throws a warning to the user asking them to set pspd.set_option('compute.ops_on_diff_frames', True) # to allow operations on different frames. We pass these warnings on to the user so they must manually set it themselves. if crs: level.set_crs(crs, inplace=True, allow_override=True) new_series = True frame._geometry_column_name = geo_column_name if new_series: # Note: This casts GeoSeries back into pspd.Series, so we lose any metadata that's not serialized frame[geo_column_name] = level if not inplace: return frame
@typing.overload def rename_geometry( self, col: str, inplace: Literal[True] = ..., ) -> None: ... @typing.overload def rename_geometry( self, col: str, inplace: Literal[False] = ..., ) -> GeoDataFrame: ...
[docs] def rename_geometry(self, col: str, inplace: bool = False) -> GeoDataFrame | None: """ Renames the GeoDataFrame geometry column to the specified name. By default yields a new object. The original geometry column is replaced with the input. Parameters ---------- col : new geometry column label inplace : boolean, default False Modify the GeoDataFrame in place (without creating a new object) Examples -------- >>> from sedona.spark.geopandas import GeoDataFrame >>> from shapely.geometry import Point >>> d = {'col1': ['name1', 'name2'], 'geometry': [Point(1, 2), Point(2, 1)]} >>> df = GeoDataFrame(d, crs="EPSG:4326") >>> df1 = df.rename_geometry('geom1') >>> df1.geometry.name 'geom1' >>> df.rename_geometry('geom1', inplace=True) >>> df.geometry.name 'geom1' See also -------- GeoDataFrame.set_geometry : set the active geometry """ geometry_col = self.geometry.name if col in self.columns: raise ValueError(f"Column named {col} already exists") else: mapper = {col: col for col in list(self.columns)} mapper[geometry_col] = col if inplace: self.rename(columns=mapper, inplace=True, errors="raise") self.set_geometry(col, inplace=True) return None df = self.copy() df.rename(columns=mapper, inplace=True, errors="raise") df.set_geometry(col, inplace=True) return df
# ============================================================================ # PROPERTIES AND ATTRIBUTES # ============================================================================ @property def active_geometry_name(self) -> Any: """Return the name of the active geometry column Returns a name if a GeoDataFrame has an active geometry column set, otherwise returns None. The return type is usually a string, but may be an integer, tuple or other hashable, depending on the contents of the dataframe columns. You can also access the active geometry column using the ``.geometry`` property. You can set a GeoSeries to be an active geometry using the :meth:`~GeoDataFrame.set_geometry` method. Returns ------- str or other index label supported by pandas name of an active geometry column or None See also -------- GeoDataFrame.set_geometry : set the active geometry """ return self._geometry_column_name
[docs] def to_geopandas(self) -> gpd.GeoDataFrame: """ Note: Unlike in pandas and geopandas, Sedona will always return a general Index. This differs from pandas and geopandas, which will return a RangeIndex by default. e.g pd.Index([0, 1, 2]) instead of pd.RangeIndex(start=0, stop=3, step=1) """ log_advice( "`to_geopandas` loads all data into the driver's memory. " "It should only be used if the resulting geopandas GeoSeries is expected to be small." ) return self._to_geopandas()
def _to_geopandas(self) -> gpd.GeoDataFrame: pd_df = self._internal.to_pandas_frame for col_name in pd_df.columns: series: pspd.Series = self[col_name] if isinstance(series, sgpd.GeoSeries): # Use _to_geopandas instead of to_geopandas to avoid logging extra warnings pd_df[col_name] = series._to_geopandas() else: pd_df[col_name] = series._to_pandas() return gpd.GeoDataFrame(pd_df, geometry=self._geometry_column_name)
[docs] def to_spark_pandas(self) -> pspd.DataFrame: """ Convert the GeoDataFrame to a Spark Pandas DataFrame. """ return pspd.DataFrame(self._internal)
[docs] def copy(self, deep=False) -> GeoDataFrame: """ Make a copy of this GeoDataFrame object. Parameters ---------- deep : bool, default False This parameter is not supported but just a dummy parameter to match pandas. Returns ------- GeoDataFrame A copy of this GeoDataFrame object. Examples -------- >>> from shapely.geometry import Point >>> from sedona.spark.geopandas import GeoDataFrame >>> gdf = GeoDataFrame([{"geometry": Point(1, 1), "value1": 2, "value2": 3}]) >>> gdf_copy = gdf.copy() >>> print(gdf_copy) geometry value1 value2 0 POINT (1 1) 2 3 """ # Note: The deep parameter is a dummy parameter just as it is in PySpark pandas return GeoDataFrame( pspd.DataFrame(self._internal.copy()), geometry=self.active_geometry_name )
def _safe_get_crs(self): """ Helper method for getting the crs of the GeoDataframe safely. Returns None if no geometry column is set instead of raising an error. """ try: return self.geometry.crs except MissingGeometryColumnError: return None @property def crs(self): return self.geometry.crs @crs.setter def crs(self, value): # Since pyspark dataframes are immutable, we can't modify in place, so we create the new geoseries and replace it self.geometry = self.geometry.set_crs(value)
[docs] def set_crs(self, crs, inplace=False, allow_override=True): """ Set the Coordinate Reference System (CRS) of the ``GeoDataFrame``. If there are multiple geometry columns within the GeoDataFrame, only the CRS of the active geometry column is set. Pass ``None`` to remove CRS from the active geometry column. Notes ----- The underlying geometries are not transformed to this CRS. To transform the geometries to a new CRS, use the ``to_crs`` method. Parameters ---------- crs : pyproj.CRS | None, optional The value can be anything accepted by :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`, such as an authority string (eg "EPSG:4326") or a WKT string. epsg : int, optional EPSG code specifying the projection. inplace : bool, default False If True, the CRS of the GeoDataFrame will be changed in place (while still returning the result) instead of making a copy of the GeoDataFrame. allow_override : bool, default True If the GeoDataFrame already has a CRS, allow to replace the existing CRS, even when both are not equal. In Sedona, setting this to True will lead to eager evaluation instead of lazy evaluation. Unlike Geopandas, True is the default value in Sedona for performance reasons. Examples -------- >>> from sedona.spark.geopandas import GeoDataFrame >>> from shapely.geometry import Point >>> d = {'col1': ['name1', 'name2'], 'geometry': [Point(1, 2), Point(2, 1)]} >>> gdf = GeoDataFrame(d) >>> gdf col1 geometry 0 name1 POINT (1 2) 1 name2 POINT (2 1) Setting CRS to a GeoDataFrame without one: >>> gdf.crs is None True >>> gdf = gdf.set_crs('epsg:3857') >>> gdf.crs # doctest: +SKIP <Projected CRS: EPSG:3857> Name: WGS 84 / Pseudo-Mercator Axis Info [cartesian]: - X[east]: Easting (metre) - Y[north]: Northing (metre) Area of Use: - name: World - 85°S to 85°N - bounds: (-180.0, -85.06, 180.0, 85.06) Coordinate Operation: - name: Popular Visualisation Pseudo-Mercator - method: Popular Visualisation Pseudo Mercator Datum: World Geodetic System 1984 - Ellipsoid: WGS 84 - Prime Meridian: Greenwich Overriding existing CRS: >>> gdf = gdf.set_crs(4326, allow_override=True) Without ``allow_override=True``, ``set_crs`` returns an error if you try to override CRS. See Also -------- GeoDataFrame.to_crs : re-project to another CRS """ # Since pyspark dataframes are immutable, we can't modify in place, so we create the new geoseries and replace it new_geometry = self.geometry.set_crs(crs, allow_override=allow_override) if inplace: self.geometry = new_geometry else: df = self.copy() df.geometry = new_geometry return df
[docs] def to_crs( self, crs: Any | None = None, epsg: int | None = None, inplace: bool = False, ) -> GeoDataFrame | None: """Transform geometries to a new coordinate reference system. Transform all geometries in an active geometry column to a different coordinate reference system. The ``crs`` attribute on the current GeoSeries must be set. Either ``crs`` or ``epsg`` may be specified for output. This method will transform all points in all objects. It has no notion of projecting entire geometries. All segments joining points are assumed to be lines in the current projection, not geodesics. Objects crossing the dateline (or other projection boundary) will have undesirable behavior. Parameters ---------- crs : pyproj.CRS, optional if `epsg` is specified The value can be anything accepted by :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`, such as an authority string (eg "EPSG:4326") or a WKT string. epsg : int, optional if `crs` is specified EPSG code specifying output projection. inplace : bool, optional, default: False Whether to return a new GeoDataFrame or do the transformation in place. Returns ------- GeoDataFrame Examples -------- >>> from shapely.geometry import Point >>> from sedona.spark.geopandas import GeoDataFrame >>> d = {'col1': ['name1', 'name2'], 'geometry': [Point(1, 2), Point(2, 1)]} >>> gdf = GeoDataFrame(d, crs=4326) >>> gdf col1 geometry 0 name1 POINT (1 2) 1 name2 POINT (2 1) >>> gdf.crs # doctest: +SKIP <Geographic 2D CRS: EPSG:4326> Name: WGS 84 Axis Info [ellipsoidal]: - Lat[north]: Geodetic latitude (degree) - Lon[east]: Geodetic longitude (degree) Area of Use: - name: World - bounds: (-180.0, -90.0, 180.0, 90.0) Datum: World Geodetic System 1984 - Ellipsoid: WGS 84 - Prime Meridian: Greenwich >>> gdf = gdf.to_crs(3857) >>> gdf col1 geometry 0 name1 POINT (111319.491 222684.209) 1 name2 POINT (222638.982 111325.143) >>> gdf.crs # doctest: +SKIP <Projected CRS: EPSG:3857> Name: WGS 84 / Pseudo-Mercator Axis Info [cartesian]: - X[east]: Easting (metre) - Y[north]: Northing (metre) Area of Use: - name: World - 85°S to 85°N - bounds: (-180.0, -85.06, 180.0, 85.06) Coordinate Operation: - name: Popular Visualisation Pseudo-Mercator - method: Popular Visualisation Pseudo Mercator Datum: World Geodetic System 1984 - Ellipsoid: WGS 84 - Prime Meridian: Greenwich See Also -------- GeoDataFrame.set_crs : assign CRS without re-projection """ new_geometry = self.geometry.to_crs(crs=crs, epsg=epsg) if inplace: df = self df.geometry = new_geometry return None else: df = self.copy() df.geometry = new_geometry return df
[docs] @classmethod def from_dict( cls, data: dict, geometry=None, crs: Any | None = None, **kwargs, ) -> GeoDataFrame: raise NotImplementedError("from_dict() is not implemented yet.")
[docs] @classmethod def from_features( cls, features, crs: Any | None = None, columns: Iterable[str] | None = None ) -> GeoDataFrame: raise NotImplementedError("from_features() is not implemented yet.")
[docs] @classmethod def from_postgis( cls, sql: str | sqlalchemy.text, con, geom_col: str = "geom", crs: Any | None = None, index_col: str | list[str] | None = None, coerce_float: bool = True, parse_dates: list | dict | None = None, params: list | tuple | dict | None = None, chunksize: int | None = None, ) -> GeoDataFrame: raise NotImplementedError("from_postgis() is not implemented yet.")
[docs] @classmethod def from_arrow( cls, table, geometry: str | None = None, to_pandas_kwargs: dict | None = None ): """ Construct a GeoDataFrame from a Arrow table object based on GeoArrow extension types. See https://geoarrow.org/ for details on the GeoArrow specification. This functions accepts any tabular Arrow object implementing the `Arrow PyCapsule Protocol`_ (i.e. having an ``__arrow_c_array__`` or ``__arrow_c_stream__`` method). .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html .. versionadded:: 1.0 Parameters ---------- table : pyarrow.Table or Arrow-compatible table Any tabular object implementing the Arrow PyCapsule Protocol (i.e. has an ``__arrow_c_array__`` or ``__arrow_c_stream__`` method). This table should have at least one column with a geoarrow geometry type. geometry : str, default None The name of the geometry column to set as the active geometry column. If None, the first geometry column found will be used. to_pandas_kwargs : dict, optional Arguments passed to the `pa.Table.to_pandas` method for non-geometry columns. This can be used to control the behavior of the conversion of the non-geometry columns to a pandas DataFrame. For example, you can use this to control the dtype conversion of the columns. By default, the `to_pandas` method is called with no additional arguments. Returns ------- GeoDataFrame See Also -------- GeoDataFrame.to_arrow GeoSeries.from_arrow Examples -------- >>> from sedona.spark.geopandas import GeoDataFrame >>> import geoarrow.pyarrow as ga # requires: pip install geoarrow-pyarrow >>> import pyarrow as pa # requires: pip install pyarrow >>> table = pa.Table.from_arrays([ ... ga.as_geoarrow([None, "POLYGON ((0 0, 1 1, 0 1, 0 0))", "LINESTRING (0 0, -1 1, 0 -1)"]), ... pa.array([1, 2, 3]), ... pa.array(["a", "b", "c"]), ... ], names=["geometry", "id", "value"]) >>> gdf = GeoDataFrame.from_arrow(table) >>> gdf geometry id value 0 None 1 a 1 POLYGON ((0 0, 1 1, 0 1, 0 0)) 2 b 2 LINESTRING (0 0, -1 1, 0 -1) 3 c """ if to_pandas_kwargs is None: to_pandas_kwargs = {} gpd_df = gpd.GeoDataFrame.from_arrow( table, geometry=geometry, **to_pandas_kwargs ) return GeoDataFrame(gpd_df)
[docs] def to_json( self, na: Literal["null", "drop", "keep"] = "null", show_bbox: bool = False, drop_id: bool = False, to_wgs84: bool = False, **kwargs, ) -> str: """ Returns a GeoJSON representation of the ``GeoDataFrame`` as a string. Parameters ---------- na : {'null', 'drop', 'keep'}, default 'null' Dictates how to represent missing (NaN) values in the output. - ``null``: Outputs missing entries as JSON `null`. - ``drop``: Removes the entire property from a feature if its value is missing. - ``keep``: Outputs missing entries as ``NaN``. show_bbox : bool, default False If True, the `bbox` (bounds) of the geometries is included in the output. drop_id : bool, default False If True, the GeoDataFrame index is not written to the 'id' field of each GeoJSON Feature. to_wgs84 : bool, default False If True, all geometries are transformed to WGS84 (EPSG:4326) to meet the `2016 GeoJSON specification <https://tools.ietf.org/html/rfc7946>`_. When False, the current CRS is exported if it's set. **kwargs Additional keyword arguments passed to `json.dumps()`. Returns ------- str A GeoJSON representation of the GeoDataFrame. See Also -------- GeoDataFrame.to_file : Write a ``GeoDataFrame`` to a file, which can be used for GeoJSON format. Examples -------- >>> from sedona.spark.geopandas import GeoDataFrame >>> from shapely.geometry import Point >>> d = {'col1': ['name1', 'name2'], 'geometry': [Point(1, 2), Point(2, 1)]} >>> gdf = GeoDataFrame(d, crs="EPSG:3857") >>> gdf.to_json() '{"type": "FeatureCollection", "features": [{"id": "0", "type": "Feature", "properties": {"col1": "name1"}, "geometry": {"type": "Point", "coordinates": [1.0, 2.0]}}, {"id": "1", "type": "Feature", "properties": {"col1": "name2"}, "geometry": {"type": "Point", "coordinates": [2.0, 1.0]}}], "crs": {"type": "name", "properties": {"name": "urn:ogc:def:crs:EPSG::3857"}}}' See also -------- GeoDataFrame.to_file : write GeoDataFrame to file """ # Because this function returns the geojson string in memory, # we simply rely on geopandas's implementation. # Additionally, spark doesn't seem to have a straight forward way to get the string # without writing to a file first by using sdf.write.format("geojson").save(path, **kwargs) # return self.to_geopandas().to_json(na, show_bbox, drop_id, to_wgs84, **kwargs) # ST_AsGeoJSON() works only for one column result = self.to_geopandas() return result.to_json(na, show_bbox, drop_id, to_wgs84, **kwargs)
@property def __geo_interface__(self) -> dict: raise NotImplementedError("__geo_interface__() is not implemented yet.")
[docs] def iterfeatures( self, na: str = "null", show_bbox: bool = False, drop_id: bool = False ) -> typing.Generator[dict]: raise NotImplementedError("iterfeatures() is not implemented yet.")
[docs] def to_geo_dict( self, na: str | None = "null", show_bbox: bool = False, drop_id: bool = False ) -> dict: raise NotImplementedError("to_geo_dict() is not implemented yet.")
[docs] def to_wkb(self, hex: bool = False, **kwargs) -> pd.DataFrame: raise NotImplementedError("to_wkb() is not implemented yet.")
[docs] def to_wkt(self, **kwargs) -> pd.DataFrame: raise NotImplementedError("to_wkt() is not implemented yet.")
[docs] def to_arrow( self, *, index: bool | None = None, geometry_encoding="WKB", interleaved: bool = True, include_z: bool | None = None, ): """Encode a GeoDataFrame to GeoArrow format. See https://geoarrow.org/ for details on the GeoArrow specification. This function returns a generic Arrow data object implementing the `Arrow PyCapsule Protocol`_ (i.e. having an ``__arrow_c_stream__`` method). This object can then be consumed by your Arrow implementation of choice that supports this protocol. .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html Note: Requires geopandas versions >= 1.0.0 to use with Sedona. Parameters ---------- index : bool, default None If ``True``, always include the dataframe's index(es) as columns in the file output. If ``False``, the index(es) will not be written to the file. If ``None``, the index(ex) will be included as columns in the file output except `RangeIndex` which is stored as metadata only. Note: Unlike in geopandas, ``None`` will include the index in the column because Sedona always converts `RangeIndex` into a general `Index`. geometry_encoding : {'WKB', 'geoarrow' }, default 'WKB' The GeoArrow encoding to use for the data conversion. interleaved : bool, default True Only relevant for 'geoarrow' encoding. If True, the geometries' coordinates are interleaved in a single fixed size list array. If False, the coordinates are stored as separate arrays in a struct type. include_z : bool, default None Only relevant for 'geoarrow' encoding (for WKB, the dimensionality of the individual geometries is preserved). If False, return 2D geometries. If True, include the third dimension in the output (if a geometry has no third dimension, the z-coordinates will be NaN). By default, will infer the dimensionality from the input geometries. Note that this inference can be unreliable with empty geometries (for a guaranteed result, it is recommended to specify the keyword). Returns ------- ArrowTable A generic Arrow table object with geometry columns encoded to GeoArrow. Examples -------- >>> from sedona.spark.geopandas import GeoDataFrame >>> from shapely.geometry import Point >>> data = {'col1': ['name1', 'name2'], 'geometry': [Point(1, 2), Point(2, 1)]} >>> gdf = GeoDataFrame(data) >>> gdf col1 geometry 0 name1 POINT (1 2) 1 name2 POINT (2 1) >>> arrow_table = gdf.to_arrow(index=False) >>> arrow_table <geopandas.io._geoarrow.ArrowTable object at ...> The returned data object needs to be consumed by a library implementing the Arrow PyCapsule Protocol. For example, wrapping the data as a pyarrow.Table (requires pyarrow >= 14.0): >>> import pyarrow as pa # requires: pip install pyarrow >>> table = pa.table(arrow_table) >>> table pyarrow.Table col1: string geometry: binary ---- col1: [["name1","name2"]] geometry: [[0101000000000000000000F03F0000000000000040,\ 01010000000000000000000040000000000000F03F]] """ # Because this function returns the arrow table in memory, we simply rely on geopandas's implementation. # This also returns a geopandas specific data type, which can be converted to an actual pyarrow table, # so there is no direct Sedona equivalent. This way we also get all of the arguments implemented for free. return self.to_geopandas().to_arrow( index=index, geometry_encoding=geometry_encoding, interleaved=interleaved, include_z=include_z, )
[docs] def to_feather( self, path, index: bool | None = None, compression: str | None = None, schema_version=None, **kwargs, ): raise NotImplementedError("to_feather() is not implemented yet.")
@property def type(self): # Implementation of the abstract method raise NotImplementedError( _not_implemented_error("type", "Returns numeric geometry type codes.") )
[docs] def plot(self, *args, **kwargs): """ Plot a GeoDataFrame. Generate a plot of a GeoDataFrame with matplotlib. If a column is specified, the plot coloring will be based on values in that column. Note: This method is not scalable and requires collecting all data to the driver. Parameters ---------- column : str, np.array, pd.Series, pd.Index (default None) The name of the dataframe column, np.array, pd.Series, or pd.Index to be plotted. If np.array, pd.Series, or pd.Index are used then it must have same length as dataframe. Values are used to color the plot. Ignored if `color` is also set. kind: str The kind of plots to produce. The default is to create a map ("geo"). Other supported kinds of plots from pandas: - 'line' : line plot - 'bar' : vertical bar plot - 'barh' : horizontal bar plot - 'hist' : histogram - 'box' : BoxPlot - 'kde' : Kernel Density Estimation plot - 'density' : same as 'kde' - 'area' : area plot - 'pie' : pie plot - 'scatter' : scatter plot - 'hexbin' : hexbin plot. cmap : str (default None) The name of a colormap recognized by matplotlib. color : str, np.array, pd.Series (default None) If specified, all objects will be colored uniformly. ax : matplotlib.pyplot.Artist (default None) axes on which to draw the plot cax : matplotlib.pyplot Artist (default None) axes on which to draw the legend in case of color map. categorical : bool (default False) If False, cmap will reflect numerical values of the column being plotted. For non-numerical columns, this will be set to True. legend : bool (default False) Plot a legend. Ignored if no `column` is given, or if `color` is given. scheme : str (default None) Name of a choropleth classification scheme (requires mapclassify). A mapclassify.MapClassifier object will be used under the hood. Supported are all schemes provided by mapclassify (e.g. 'BoxPlot', 'EqualInterval', 'FisherJenks', 'FisherJenksSampled', 'HeadTailBreaks', 'JenksCaspall', 'JenksCaspallForced', 'JenksCaspallSampled', 'MaxP', 'MaximumBreaks', 'NaturalBreaks', 'Quantiles', 'Percentiles', 'StdMean', 'UserDefined'). Arguments can be passed in classification_kwds. k : int (default 5) Number of classes (ignored if scheme is None) vmin : None or float (default None) Minimum value of cmap. If None, the minimum data value in the column to be plotted is used. vmax : None or float (default None) Maximum value of cmap. If None, the maximum data value in the column to be plotted is used. markersize : str or float or sequence (default None) Only applies to point geometries within a frame. If a str, will use the values in the column of the frame specified by markersize to set the size of markers. Otherwise can be a value to apply to all points, or a sequence of the same length as the number of points. figsize : tuple of integers (default None) Size of the resulting matplotlib.figure.Figure. If the argument axes is given explicitly, figsize is ignored. legend_kwds : dict (default None) Keyword arguments to pass to :func:`matplotlib.pyplot.legend` or :func:`matplotlib.pyplot.colorbar`. Additional accepted keywords when `scheme` is specified: fmt : string A formatting specification for the bin edges of the classes in the legend. For example, to have no decimals: ``{"fmt": "{:.0f}"}``. labels : list-like A list of legend labels to override the auto-generated labels. Needs to have the same number of elements as the number of classes (`k`). interval : boolean (default False) An option to control brackets from mapclassify legend. If True, open/closed interval brackets are shown in the legend. categories : list-like Ordered list-like object of categories to be used for categorical plot. classification_kwds : dict (default None) Keyword arguments to pass to mapclassify missing_kwds : dict (default None) Keyword arguments specifying color options (as style_kwds) to be passed on to geometries with missing values in addition to or overwriting other style kwds. If None, geometries with missing values are not plotted. aspect : 'auto', 'equal', None or float (default 'auto') Set aspect of axis. If 'auto', the default aspect for map plots is 'equal'; if however data are not projected (coordinates are long/lat), the aspect is by default set to 1/cos(df_y * pi/180) with df_y the y coordinate of the middle of the GeoDataFrame (the mean of the y range of bounding box) so that a long/lat square appears square in the middle of the plot. This implies an Equirectangular projection. If None, the aspect of `ax` won't be changed. It can also be set manually (float) as the ratio of y-unit to x-unit. autolim : bool (default True) Update axes data limits to contain the new geometries. **style_kwds : dict Style options to be passed on to the actual plot function, such as ``edgecolor``, ``facecolor``, ``linewidth``, ``markersize``, ``alpha``. Returns ------- ax : matplotlib axes instance Examples -------- >>> import geodatasets # requires: pip install geodatasets >>> import geopandas as gpd >>> df = gpd.read_file(geodatasets.get_path("nybb")) >>> df.head() # doctest: +SKIP BoroCode ... geometry 0 5 ... MULTIPOLYGON (((970217.022 145643.332, 970227.... 1 4 ... MULTIPOLYGON (((1029606.077 156073.814, 102957... 2 3 ... MULTIPOLYGON (((1021176.479 151374.797, 102100... 3 1 ... MULTIPOLYGON (((981219.056 188655.316, 980940.... 4 2 ... MULTIPOLYGON (((1012821.806 229228.265, 101278... >>> df.plot("BoroName", cmap="Set1") # doctest: +SKIP """ return self.to_geopandas().plot(*args, **kwargs)
# ============================================================================ # SPATIAL OPERATIONS # ============================================================================
[docs] def sjoin( self, other, how="inner", predicate="intersects", lsuffix="left", rsuffix="right", distance=None, on_attribute=None, **kwargs, ): """ Spatial join of two GeoDataFrames. Parameters ---------- other : GeoDataFrame The right GeoDataFrame to join with. how : str, default 'inner' The type of join: * 'left': use keys from left_df; retain only left_df geometry column * 'right': use keys from right_df; retain only right_df geometry column * 'inner': use intersection of keys from both dfs; retain only left_df geometry column predicate : str, default 'intersects' Binary predicate. Valid values: 'intersects', 'contains', 'within', 'dwithin', 'touches', 'crosses', 'overlaps', 'covers', 'covered_by' lsuffix : str, default 'left' Suffix to apply to overlapping column names (left GeoDataFrame). rsuffix : str, default 'right' Suffix to apply to overlapping column names (right GeoDataFrame). distance : float, optional Distance for 'dwithin' predicate. Required if predicate='dwithin'. on_attribute : str, list or tuple, optional Column name(s) to join on as an additional join restriction. These must be found in both DataFrames. **kwargs Additional keyword arguments passed to the spatial join function. Returns ------- GeoDataFrame A GeoDataFrame with the results of the spatial join. Examples -------- >>> from shapely.geometry import Point, Polygon >>> from sedona.spark.geopandas import GeoDataFrame >>> polygons = GeoDataFrame({ ... 'geometry': [Polygon([(0, 0), (0, 1), (1, 1), (1, 0)])], ... 'value': [1] ... }) >>> points = GeoDataFrame({ ... 'geometry': [Point(0.5, 0.5), Point(2, 2)], ... 'value': [1, 2] ... }) >>> joined = points.sjoin(polygons) >>> joined geometry_left value_left geometry_right value_right 0 POINT (0.5 0.5) 1 POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0)) 1 """ from sedona.spark.geopandas.tools.sjoin import sjoin as sjoin_tool return sjoin_tool( self, other, how=how, predicate=predicate, lsuffix=lsuffix, rsuffix=rsuffix, distance=distance, on_attribute=on_attribute, **kwargs, )
# ============================================================================ # I/O OPERATIONS # ============================================================================
[docs] @classmethod def from_file( cls, filename: str, format: str | None = None, **kwargs ) -> GeoDataFrame: """Alternate constructor to create a ``GeoDataFrame`` from a file. Parameters ---------- filename : str File path or file handle to read from. If the path is a directory, Sedona will read all files in that directory. format : str, optional The format of the file to read, by default None. If None, Sedona infers the format from the file extension. Note that format inference is not supported for directories. Available formats are "shapefile", "geojson", "geopackage", and "geoparquet". table_name : str, optional The name of the table to read from a GeoPackage file, by default None. This is required if ``format`` is "geopackage". **kwargs Additional keyword arguments passed to the file reader. Returns ------- GeoDataFrame A new GeoDataFrame created from the file. See Also -------- GeoDataFrame.to_file : Write a ``GeoDataFrame`` to a file. """ return sgpd.io.read_file(filename, format, **kwargs)
[docs] def to_file( self, path: str, driver: str | None = None, schema: dict | None = None, index: bool | None = None, **kwargs, ): """ Write the ``GeoDataFrame`` to a file. Parameters ---------- path : str File path or file handle to write to. driver : str, default None The format driver used to write the file. If not specified, it attempts to infer it from the file extension. If no extension is specified, Sedona will error. Options: "geojson", "geopackage", "geoparquet" schema : dict, default None Not applicable to Sedona's implementation. index : bool, default None If True, write index into one or more columns (for MultiIndex). Default None writes the index into one or more columns only if the index is named, is a MultiIndex, or has a non-integer data type. If False, no index is written. **kwargs Additional keyword arguments: mode : str, default 'w' The write mode, 'w' to overwrite the existing file and 'a' to append. 'overwrite' and 'append' are equivalent to 'w' and 'a' respectively. crs : pyproj.CRS, default None If specified, the CRS is passed to Fiona to better control how the file is written. If None, GeoPandas will determine the CRS based on the ``crs`` attribute. The value can be anything accepted by :meth:`pyproj.CRS.from_user_input <pyproj.crs.CRS.from_user_input>`, such as an authority string (e.g., "EPSG:4326") or a WKT string. engine : str Not applicable to Sedona's implementation. metadata : dict[str, str], default None Optional metadata to be stored in the file. Keys and values must be strings. Supported only for "GPKG" driver. Not supported by Sedona. Examples -------- >>> from shapely.geometry import Point, LineString >>> from sedona.spark.geopandas import GeoDataFrame >>> gdf = GeoDataFrame({ ... "geometry": [Point(0, 0), LineString([(0, 0), (1, 1)])], ... "int": [1, 2] ... }) >>> gdf.to_file("output.parquet", driver="geoparquet") With selected drivers you can also append to a file with ``mode="a"``: >>> gdf.to_file("output.geojson", driver="geojson", mode="a") When the index is of non-integer dtype, ``index=None`` (default) is treated as True, writing the index to the file. >>> gdf = GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1)]}, index=["a", "b"]) >>> gdf.to_file("output_with_index.parquet", driver="geoparquet") """ sgpd.io._to_file(self, path, driver, index, **kwargs)
[docs] def to_parquet(self, path, **kwargs): """ Write the GeoDataFrame to a GeoParquet file. Parameters ---------- path : str The file path where the GeoParquet file will be written. **kwargs Additional arguments to pass to the Sedona DataFrame output function. Examples -------- >>> from shapely.geometry import Point >>> from sedona.spark.geopandas import GeoDataFrame >>> gdf = GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1)], "value": [1, 2]}) >>> gdf.to_parquet("output.parquet") """ self.to_file(path, driver="geoparquet", **kwargs)
# ----------------------------------------------------------------------------- # # Utils # ----------------------------------------------------------------------------- def _ensure_geometry(data, crs: Any | None = None) -> sgpd.GeoSeries: """ Ensure the data is of geometry dtype or converted to it. If the input is a GeometryDtype with a set CRS, `crs` is ignored. """ if isinstance(data, sgpd.GeoSeries): if data.crs is None and crs is not None: # Avoids caching issues/crs sharing issues data = data.copy() data.crs = crs return data else: return sgpd.GeoSeries(data, crs=crs) # We don't raise AttributeError because that would be caught by pyspark's __getattr__ creating a misleading error message
[docs] class MissingGeometryColumnError(Exception): pass