Source code for sedona.spark.stats.hotspot_detection.getis_ord

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""Getis Ord functions. From the 1992 paper by Getis & Ord.

Getis, A., & Ord, J. K. (1992). The analysis of spatial association by use of distance statistics.
Geographical Analysis, 24(3), 189-206. https://doi.org/10.1111/j.1538-4632.1992.tb00261.x
"""

from pyspark.sql import DataFrame, SparkSession

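# TODO: change the `weights` and `x` parameter types to string.
# NOTE(review): g_local already annotates both as `str`; confirm whether this TODO is stale.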


def g_local(
    dataframe: DataFrame,
    x: str,
    weights: str = "weights",
    permutations: int = 0,
    star: bool = False,
    island_weight: float = 0.0,
) -> DataFrame:
    """Compute the Gi or Gi* statistic on the x column of the dataframe.

    Weights should be the neighbors of this row. The members of the weights should be comprised of structs containing a
    value column and a neighbor column. The neighbor column should be the contents of the neighbors with the same types
    as the parent row (minus neighbors). You can use `wherobots.weighing.add_distance_band_column` to achieve this. To
    calculate the Gi* statistic, ensure the focal observation is in the neighbors array (i.e. the row is in the weights
    column) and `star=True`. Significance is calculated with a z score. Permutation tests are not yet implemented and
    thus island weight does nothing. The following columns will be added: G, E[G], V[G], Z, P.

    The computation itself is delegated to the JVM implementation
    `org.apache.sedona.stats.hotspotDetection.GetisOrd.gLocal`; this function only forwards the arguments and wraps
    the returned JVM dataframe.

    Args:
        dataframe: The dataframe to perform the G statistic on.
        x: The column name we want to perform hotspot analysis on.
        weights: The column name containing the neighbors array. The neighbor column should be the contents of
            the neighbors with the same types as the parent row (minus neighbors). You can use
            `wherobots.weighing.add_distance_band_column` to achieve this.
        permutations: Not used. Permutation tests are not supported yet. The number of permutations to use for the
            significance test.
        star: Whether the focal observation is in the neighbors array. If true this calculates Gi*, otherwise Gi.
        island_weight: Not used. The weight for the simulated neighbor used for records without a neighbor in perm
            tests.

    Returns:
        A dataframe with the original columns plus the columns G, E[G], V[G], Z, P.

    Raises:
        RuntimeError: If there is no active SparkSession and none can be obtained from `dataframe`.
    """
    sedona = SparkSession.getActiveSession()
    if sedona is None:
        # getActiveSession() returns None when no session is active on the current thread.
        # Fall back to the session attached to the dataframe, when this PySpark version exposes it.
        sedona = getattr(dataframe, "sparkSession", None)
    if sedona is None:
        raise RuntimeError(
            "g_local requires an active SparkSession, but none was found. "
            "Create or activate a Sedona/Spark session before calling g_local."
        )

    # Hand the underlying JVM dataframe and the plain arguments to the Scala implementation.
    result_df = sedona._jvm.org.apache.sedona.stats.hotspotDetection.GetisOrd.gLocal(
        dataframe._jdf, x, weights, permutations, star, island_weight
    )

    # Wrap the returned JVM dataframe back into a Python DataFrame bound to the same session.
    return DataFrame(result_df, sedona)