Source code for sedona.spark.stats.hotspot_detection.getis_ord

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""Getis Ord functions. From the 1992 paper by Getis & Ord.

Getis, A., & Ord, J. K. (1992). The analysis of spatial association by use of distance statistics.
Geographical Analysis, 24(3), 189-206. https://doi.org/10.1111/j.1538-4632.1992.tb00261.x
"""

from pyspark.sql import DataFrame, SparkSession

# todo change weights and x type to string


def g_local(
    dataframe: DataFrame,
    x: str,
    weights: str = "weights",
    permutations: int = 0,
    star: bool = False,
    island_weight: float = 0.0,
) -> DataFrame:
    """Compute the Getis-Ord Gi or Gi* statistic for the ``x`` column.

    The computation is delegated to the JVM implementation
    ``org.apache.sedona.stats.hotspotDetection.GetisOrd.gLocal``; this function
    only forwards the arguments and wraps the resulting JVM dataframe.

    The weights column should hold the neighbors of each row. Its members
    should be structs containing a value column and a neighbor column. The
    neighbor column should be the contents of the neighbors with the same types
    as the parent row (minus neighbors). You can use
    `wherobots.weighing.add_distance_band_column` to achieve this.

    To calculate the Gi* statistic, ensure the focal observation is in the
    neighbors array (i.e. the row is in the weights column) and ``star=True``.

    Significance is calculated with a z score. Permutation tests are not yet
    implemented, so ``permutations`` and ``island_weight`` currently have no
    effect on the result.

    The following columns will be added: G, E[G], V[G], Z, P.

    Args:
        dataframe: The dataframe to perform the G statistic on.
        x: Name of the column to perform hotspot analysis on.
        weights: Name of the column containing the neighbors array (see the
            description of its expected structure above).
        permutations: Not used; permutation tests are not supported yet.
            Intended to be the number of permutations for the significance
            test.
        star: Whether the focal observation is in the neighbors array. If True
            this calculates Gi*, otherwise Gi.
        island_weight: Not used. Intended to be the weight of the simulated
            neighbor used for records without a neighbor in permutation tests.

    Returns:
        A dataframe with the original columns plus the columns G, E[G], V[G],
        Z, P.
    """
    sedona = SparkSession.getActiveSession()
    if sedona is None:
        # getActiveSession() returns None when the current thread has no
        # active session. The input dataframe is still bound to a session, so
        # use that one instead of failing on `None._jvm` below.
        sedona = dataframe.sparkSession

    get_is_ord = sedona._jvm.org.apache.sedona.stats.hotspotDetection.GetisOrd
    result_df = get_is_ord.gLocal(
        dataframe._jdf, x, weights, permutations, star, island_weight
    )

    return DataFrame(result_df, sedona)