Spatially aggregate airports per country

Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements.  See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.  The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License.  You may obtain a copy of the License at
  http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied.  See the License for the
specific language governing permissions and limitations
under the License.

In [ ]:

Copied!

import os

import geopandas as gpd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, when, explode, hex

from sedona.spark import *
from utilities import getConfig
import os

import geopandas as gpd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, when, explode, hex

from sedona.spark import *
from utilities import getConfig

Setup Sedona environment¶

In [ ]:

Copied!





config = SedonaContext.builder() .\
    config('spark.jars.packages',
           'org.apache.sedona:sedona-spark-shaded-3.4_2.12:1.5.1,'
           'org.datasyslab:geotools-wrapper:1.5.1-28.2,'
           'uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.4'). \
    config('spark.jars.repositories', 'https://artifacts.unidata.ucar.edu/repository/unidata-all'). \
    getOrCreate()

sedona = SedonaContext.create(config)
sc = sedona.sparkContext
sc.setSystemProperty("sedona.global.charset", "utf8")
config = SedonaContext.builder() .\
    config('spark.jars.packages',
           'org.apache.sedona:sedona-spark-shaded-3.4_2.12:1.5.1,'
           'org.datasyslab:geotools-wrapper:1.5.1-28.2,'
           'uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.4'). \
    config('spark.jars.repositories', 'https://artifacts.unidata.ucar.edu/repository/unidata-all'). \
    getOrCreate()

sedona = SedonaContext.create(config)
sc = sedona.sparkContext
sc.setSystemProperty("sedona.global.charset", "utf8")

Read countries shapefile into a Sedona DataFrame¶

Data link: https://www.naturalearthdata.com/downloads/50m-cultural-vectors/

In [ ]:

Copied!





countries = ShapefileReader.readToGeometryRDD(sc, "data/ne_50m_admin_0_countries_lakes/")
countries_df = Adapter.toDf(countries, sedona)
countries_df.createOrReplaceTempView("country")
countries_df.printSchema()
countries = ShapefileReader.readToGeometryRDD(sc, "data/ne_50m_admin_0_countries_lakes/")
countries_df = Adapter.toDf(countries, sedona)
countries_df.createOrReplaceTempView("country")
countries_df.printSchema()

Read airports shapefile into a Sedona DataFrame¶

Data link: https://www.naturalearthdata.com/downloads/50m-cultural-vectors/

In [ ]:

Copied!





airports = ShapefileReader.readToGeometryRDD(sc, "data/ne_50m_airports/")
airports_df = Adapter.toDf(airports, sedona)
airports_df.createOrReplaceTempView("airport")
airports_df.printSchema()
airports = ShapefileReader.readToGeometryRDD(sc, "data/ne_50m_airports/")
airports_df = Adapter.toDf(airports, sedona)
airports_df.createOrReplaceTempView("airport")
airports_df.printSchema()

Run Spatial Join using SQL API¶

In [ ]:

Copied!

result = sedona.sql("SELECT c.geometry as country_geom, c.NAME_EN, a.geometry as airport_geom, a.name FROM country c, airport a WHERE ST_Contains(c.geometry, a.geometry)")
result = sedona.sql("SELECT c.geometry as country_geom, c.NAME_EN, a.geometry as airport_geom, a.name FROM country c, airport a WHERE ST_Contains(c.geometry, a.geometry)")

Run Spatial Join using RDD API¶

In [ ]:

Copied!





airports_rdd = Adapter.toSpatialRdd(airports_df, "geometry")
# Drop the duplicate name column in countries_df
countries_df = countries_df.drop("NAME")
countries_rdd = Adapter.toSpatialRdd(countries_df, "geometry")

airports_rdd.analyze()
countries_rdd.analyze()

# 4 is the num partitions used in spatial partitioning. This is an optional parameter
airports_rdd.spatialPartitioning(GridType.KDBTREE, 4)
countries_rdd.spatialPartitioning(airports_rdd.getPartitioner())

buildOnSpatialPartitionedRDD = True
usingIndex = True
considerBoundaryIntersection = True
airports_rdd.buildIndex(IndexType.QUADTREE, buildOnSpatialPartitionedRDD)

result_pair_rdd = JoinQueryRaw.SpatialJoinQueryFlat(airports_rdd, countries_rdd, usingIndex, considerBoundaryIntersection)

result2 = Adapter.toDf(result_pair_rdd, countries_rdd.fieldNames, airports.fieldNames, sedona)

result2.createOrReplaceTempView("join_result_with_all_cols")
# Select the columns needed in the join
result2 = sedona.sql("SELECT leftgeometry as country_geom, NAME_EN, rightgeometry as airport_geom, name FROM join_result_with_all_cols")
airports_rdd = Adapter.toSpatialRdd(airports_df, "geometry")
# Drop the duplicate name column in countries_df
countries_df = countries_df.drop("NAME")
countries_rdd = Adapter.toSpatialRdd(countries_df, "geometry")

airports_rdd.analyze()
countries_rdd.analyze()

# 4 is the num partitions used in spatial partitioning. This is an optional parameter
airports_rdd.spatialPartitioning(GridType.KDBTREE, 4)
countries_rdd.spatialPartitioning(airports_rdd.getPartitioner())

buildOnSpatialPartitionedRDD = True
usingIndex = True
considerBoundaryIntersection = True
airports_rdd.buildIndex(IndexType.QUADTREE, buildOnSpatialPartitionedRDD)

result_pair_rdd = JoinQueryRaw.SpatialJoinQueryFlat(airports_rdd, countries_rdd, usingIndex, considerBoundaryIntersection)

result2 = Adapter.toDf(result_pair_rdd, countries_rdd.fieldNames, airports.fieldNames, sedona)

result2.createOrReplaceTempView("join_result_with_all_cols")
# Select the columns needed in the join
result2 = sedona.sql("SELECT leftgeometry as country_geom, NAME_EN, rightgeometry as airport_geom, name FROM join_result_with_all_cols")

Print spatial join results¶

In [ ]:

Copied!





# The result of SQL API
result.show()
# The result of RDD API
result2.show()
# The result of SQL API
result.show()
# The result of RDD API
result2.show()

Group airports by country¶

In [ ]:

Copied!





# result.createOrReplaceTempView("result")
result2.createOrReplaceTempView("result")
groupedresult = sedona.sql("SELECT c.NAME_EN, c.country_geom, count(*) as AirportCount FROM result c GROUP BY c.NAME_EN, c.country_geom")
groupedresult.show()
groupedresult.createOrReplaceTempView("grouped_result")
# result.createOrReplaceTempView("result")
result2.createOrReplaceTempView("result")
groupedresult = sedona.sql("SELECT c.NAME_EN, c.country_geom, count(*) as AirportCount FROM result c GROUP BY c.NAME_EN, c.country_geom")
groupedresult.show()
groupedresult.createOrReplaceTempView("grouped_result")

Visualize the number of airports in each country¶

Visualize using SedonaKepler¶

In [ ]:

Copied!

sedona_kepler_map = SedonaKepler.create_map(df=groupedresult, name="AirportCount", config=getConfig())
sedona_kepler_map
sedona_kepler_map = SedonaKepler.create_map(df=groupedresult, name="AirportCount", config=getConfig())
sedona_kepler_map

Visualize using SedonaPyDeck¶

The above visualization is generated by a pre-set config informing SedonaKepler that the map to be rendered has to be a choropleth map with choropleth of the AirportCount column value.

This can be also be achieved using SedonaPyDeck and its create_choropleth_map API.

In [ ]:

Copied!

sedona_pydeck_map = SedonaPyDeck.create_choropleth_map(df=groupedresult, plot_col='AirportCount')
sedona_pydeck_map
sedona_pydeck_map = SedonaPyDeck.create_choropleth_map(df=groupedresult, plot_col='AirportCount')
sedona_pydeck_map

Visualize Uber H3 cells using SedonaKepler¶

The following tutorial depicts how Uber H3 cells can be generated using Sedona and visualized using SedonaKepler.

Generate H3 cell IDs¶

ST_H3CellIDs can be used to generated cell IDs for given geometries

In [ ]:

Copied!

h3_df = sedona.sql("SELECT g.NAME_EN, g.country_geom, ST_H3CellIDs(g.country_geom, 3, false) as h3_cellID from grouped_result g")
h3_df.show(2)
h3_df = sedona.sql("SELECT g.NAME_EN, g.country_geom, ST_H3CellIDs(g.country_geom, 3, false) as h3_cellID from grouped_result g")
h3_df.show(2)

Since each geometry can have multiple H3 cell IDs, let's explode the generated H3 cell ID array to get individual cells¶

In [ ]:

Copied!

exploded_h3 = h3_df.select(h3_df.NAME_EN, h3_df.country_geom, explode(h3_df.h3_cellID).alias("h3"))
exploded_h3.show(2)
exploded_h3 = h3_df.select(h3_df.NAME_EN, h3_df.country_geom, explode(h3_df.h3_cellID).alias("h3"))
exploded_h3.show(2)

Convert generated long H3 cell ID to a hex cell ID¶

SedonaKepler accepts each H3 cell ID as a hexadecimal to automatically visualize them. Also, let us sample the data to be able to visualize sparse cells on the map.

In [ ]:

Copied!





exploded_h3 = exploded_h3.sample(0.3)
exploded_h3.createOrReplaceTempView("exploded_h3")
hex_exploded_h3 = exploded_h3.select(exploded_h3.NAME_EN, hex(exploded_h3.h3).alias("ex_h3"))
hex_exploded_h3.show(2)
hex_exploded_h3.printSchema()
exploded_h3 = exploded_h3.sample(0.3)
exploded_h3.createOrReplaceTempView("exploded_h3")
hex_exploded_h3 = exploded_h3.select(exploded_h3.NAME_EN, hex(exploded_h3.h3).alias("ex_h3"))
hex_exploded_h3.show(2)
hex_exploded_h3.printSchema()

Visualize using SedonaKepler¶

Now, simply provide the final df to SedonaKepler.create_map and you can automagically visualize the H3 cells on the map!

In [ ]:

Copied!

sedona_kepler_h3 = SedonaKepler.create_map(df=hex_exploded_h3, name="h3")
sedona_kepler_h3
sedona_kepler_h3 = SedonaKepler.create_map(df=hex_exploded_h3, name="h3")
sedona_kepler_h3

Last update: April 14, 2024 03:28:23