In [1]:
import hvplot.pandas
import geopandas as gpd
import numpy as np
import pandas as pd
import pyproj
import seareport_data as D
import shapely
In [2]:
# First let's get the OSM dataset
# - wgs84_area is the area of the polygons calculated on the WGS84 **ellipsoid**
# - wgs84_perimeter is the area of the perimeter calculated on the WGS84 **ellipsoid**
# This means that they are accurate value regardless of the CRS
OSM = D.osm_df()
OSM
/home/panos/Prog/poseidon/polyshrink/.venv/lib/python3.11/site-packages/pyogrio/raw.py:198: RuntimeWarning: driver SQLite does not support open option SPATIALITE
  return ogr_read(
Out[2]:
fid wgs84_area wgs84_perimeter no_coords geometry
0 0 1145.781517 142.520084 6 POLYGON ((-4.94237 55.72545, -4.94192 55.72558...
1 1 5748.232118 362.277633 10 POLYGON ((-4.94051 55.72458, -4.94049 55.72407...
2 2 99341.705251 1320.830613 29 POLYGON ((-4.7329 55.52496, -4.7338 55.52495, ...
3 3 16150.508271 615.633188 15 POLYGON ((-4.95385 55.1629, -4.95388 55.16341,...
4 4 837.525248 294.416503 14 POLYGON ((-1.75363 50.72293, -1.75358 50.72303...
... ... ... ... ... ...
790044 790044 6617.409232 320.464159 14 POLYGON ((-98.4276 72.97151, -98.42721 72.9714...
790045 790045 13896.928896 457.350250 13 POLYGON ((-98.42497 72.85995, -98.42458 72.860...
790046 790046 5314.926419 378.743891 22 POLYGON ((-98.44028 72.85203, -98.4401 72.8519...
790047 790047 11917.621708 456.143971 12 POLYGON ((-98.39741 72.88149, -98.39694 72.881...
790048 790048 93200.135605 1441.291680 24 POLYGON ((-78.74229 52.58088, -78.74251 52.580...

790049 rows × 5 columns

In [3]:
# Let's also get the UTM tiles
UTM = D.utm_df()
UTM
Out[3]:
zone row code geometry
0 0 A esri:102021 POLYGON ((0 -90, 0 -80, -180 -80, -180 -90, 0 ...
1 0 B esri:102021 POLYGON ((180 -90, 180 -80, 0 -80, 0 -90, 180 ...
2 1 C epsg:32701 POLYGON ((-174 -80, -174 -72, -180 -72, -180 -...
3 1 D epsg:32701 POLYGON ((-174 -72, -174 -64, -180 -64, -180 -...
4 1 E epsg:32701 POLYGON ((-174 -64, -174 -56, -180 -56, -180 -...
... ... ... ... ...
1196 60 V epsg:32660 POLYGON ((180 56, 180 64, 174 64, 174 56, 180 ...
1197 60 W epsg:32660 POLYGON ((180 64, 180 72, 174 72, 174 64, 180 ...
1198 60 X epsg:32660 POLYGON ((180 72, 180 84, 174 84, 174 72, 180 ...
1199 0 X esri:102018 POLYGON ((0 84, 0 90, -180 90, -180 84, 0 84))
1200 0 Y esri:102018 POLYGON ((180 84, 180 90, 0 90, 0 84, 180 84))

1201 rows × 4 columns

In [4]:
# now let's pick one island to demonstrate how concave hull works
#
# Tip to find the FID of a specific island, load the dataset on QGIS and use the "Idenfity Features Button": Ctrl + Shift + I
# To find the path to the dataset, you can use:
# D.osm()[0]
#
# So, we will work with Limnos: 
island = OSM.iloc[[36593]]
island
island.hvplot.polygons(tiles=True, alpha=0.5)
Out[4]:
fid wgs84_area wgs84_perimeter no_coords geometry
36593 36593 4.757744e+08 279963.630413 14727 POLYGON ((25.36017 39.91077, 25.36003 39.91067...
Out[4]:
In [5]:
# Now, the latest GEOS versions support concave hulls. 
# This bindings for it has been implemented on shapely/geopandas, too
# https://shapely.readthedocs.io/en/2.0.6/reference/shapely.concave_hull.html
# https://geopandas.org/en/latest/docs/reference/api/geopandas.GeoSeries.concave_hull.html
# Let's see how it works
island.concave_hull().to_frame("geometry").hvplot.polygons(tiles=True, alpha=0.5)
Out[5]:
In [6]:
# That's pretty horrible... What are these lines?
# Let's see if we can tweak it:
island.concave_hull?
Signature: island.concave_hull(ratio=0.0, allow_holes=False)
Docstring:
Returns a ``GeoSeries`` of geometries representing the concave hull
of each geometry.

The concave hull of a geometry is the smallest concave `Polygon`
containing all the points in each geometry, unless the number of points
in the geometric object is less than three. For two points, the concave
hull collapses to a `LineString`; for 1, a `Point`.

The hull is constructed by removing border triangles of the Delaunay
Triangulation of the points as long as their "size" is larger than the
maximum edge length ratio and optionally allowing holes. The edge length factor
is a fraction of the length difference between the longest and shortest edges
in the Delaunay Triangulation of the input points. For further information
on the algorithm used, see
https://libgeos.org/doxygen/classgeos_1_1algorithm_1_1hull_1_1ConcaveHull.html

Parameters
----------
ratio : float, (optional, default 0.0)
    Number in the range [0, 1]. Higher numbers will include fewer vertices
    in the hull.
allow_holes : bool, (optional, default False)
    If set to True, the concave hull may have holes.

Examples
--------

>>> from shapely.geometry import Polygon, LineString, Point, MultiPoint
>>> s = geopandas.GeoSeries(
...     [
...         Polygon([(0, 0), (1, 1), (0, 1)]),
...         LineString([(0, 0), (1, 1), (1, 0)]),
...         MultiPoint([(0, 0), (1, 1), (0, 1), (1, 0), (0.5, 0.5)]),
...         MultiPoint([(0, 0), (1, 1)]),
...         Point(0, 0),
...     ],
...     crs=3857
... )
>>> s
0                       POLYGON ((0 0, 1 1, 0 1, 0 0))
1                           LINESTRING (0 0, 1 1, 1 0)
2    MULTIPOINT ((0 0), (1 1), (0 1), (1 0), (0.5 0...
3                            MULTIPOINT ((0 0), (1 1))
4                                          POINT (0 0)
dtype: geometry

>>> s.concave_hull()
0                      POLYGON ((0 1, 1 1, 0 0, 0 1))
1                      POLYGON ((0 0, 1 1, 1 0, 0 0))
2    POLYGON ((0.5 0.5, 0 1, 1 1, 1 0, 0 0, 0.5 0.5))
3                               LINESTRING (0 0, 1 1)
4                                         POINT (0 0)
dtype: geometry

See also
--------
GeoSeries.convex_hull : convex hull geometry
File:      ~/Prog/poseidon/polyshrink/.venv/lib/python3.11/site-packages/geopandas/base.py
Type:      method
In [7]:
# Hmm. there is this `ratio` which seems to be useful. 
# Nevertheless it will not help in our case (we will still use it later on, but it will not solve this specific problem).
# 
# The problem with this concave hull algorithm is that it is supposed to work with 
# a "soup" of points thar are more or less uniformly distributed.
# More specifically, the algorithm is doing a Delauney triangulation on all the points
# and it starts to remove the outer triangles
#
# The problem is that ALL our points are on the boundary of the polygon and there are no points inside it.
# So the triangles are few and they are huge, too. 
# 
# That's the problem
# But it can be solved. We just need to add triangles inside. 
# Let's do that. geopandas has the right method for this:
island.sample_points?
Signature: island.sample_points(size, method='uniform', seed=None, rng=None, **kwargs)
Docstring:
Sample points from each geometry.

Generate a MultiPoint per each geometry containing points sampled from the
geometry. You can either sample randomly from a uniform distribution or use an
advanced sampling algorithm from the ``pointpats`` package.

For polygons, this samples within the area of the polygon. For lines,
this samples along the length of the linestring. For multi-part
geometries, the weights of each part are selected according to their relevant
attribute (area for Polygons, length for LineStrings), and then points are
sampled from each part.

Any other geometry type (e.g. Point, GeometryCollection) is ignored, and an
empty MultiPoint geometry is returned.

Parameters
----------
size : int | array-like
    The size of the sample requested. Indicates the number of samples to draw
    from each geometry.  If an array of the same length as a GeoSeries is
    passed, it denotes the size of a sample per geometry.
method : str, default "uniform"
    The sampling method. ``uniform`` samples uniformly at random from a
    geometry using ``numpy.random.uniform``. Other allowed strings
    (e.g. ``"cluster_poisson"``) denote sampling function name from the
    ``pointpats.random`` module (see
    http://pysal.org/pointpats/api.html#random-distributions). Pointpats methods
    are implemented for (Multi)Polygons only and will return an empty MultiPoint
    for other geometry types.
rng : {None, int, array_like[ints], SeedSequence, BitGenerator, Generator}, optional
    A random generator or seed to initialize the numpy BitGenerator. If None, then fresh,
    unpredictable entropy will be pulled from the OS.
**kwargs : dict
    Options for the pointpats sampling algorithms.

Returns
-------
GeoSeries
    Points sampled within (or along) each geometry.

Examples
--------
>>> from shapely.geometry import Polygon
>>> s = geopandas.GeoSeries(
...     [
...         Polygon([(1, -1), (1, 0), (0, 0)]),
...         Polygon([(3, -1), (4, 0), (3, 1)]),
...     ]
... )

>>> s.sample_points(size=10)  # doctest: +SKIP
0    MULTIPOINT ((0.1045 -0.10294), (0.35249 -0.264...
1    MULTIPOINT ((3.03261 -0.43069), (3.10068 0.114...
Name: sampled_points, dtype: geometry
File:      ~/Prog/poseidon/polyshrink/.venv/lib/python3.11/site-packages/geopandas/base.py
Type:      method
In [8]:
# So for example:
island.sample_points(size=500, rng=42).to_frame("geometry").hvplot.points(tiles=True)
Out[8]:
In [9]:
# Hmm... that seems to work well enough. 
# The appropriate `size` needs to be chosen for each polygon separately, though,
# but it shouldn't be too hard to find an appropriate formula
def concave_hull(gdf, ratio):
    no_points = np.maximum(gdf.no_coords, np.minimum(10_000, (gdf.wgs84_area // 10_0000).astype(int)))
    inner = gdf.sample_points(no_points)
    outer = gdf.boundary.apply(lambda g: shapely.MultiPoint(g.coords))
    #outer = gdf.points
    total = inner.union(outer)
    a = total.concave_hull(ratio=ratio).union(gdf).union_all()
    out = gpd.GeoDataFrame(
        geometry=[total.concave_hull(ratio=ratio).union(gdf).union_all()], 
        crs=gdf.crs,
    )
    out = out.explode().exterior.polygonize().to_frame("geometry")
    return out
In [10]:
# For the record, we probably need to do the concave hull calculations in projected space, but for demonstration purposes it should be fine to stay on 4326
In [11]:
concave_hull(island, 0.1).hvplot(tiles=True, alpha=0.5)
Out[11]:
In [12]:
# Hey! that seemed to work!!!
# Let's see now what `ratio` does
concave_hull(island, 0.00).hvplot(tiles=True, alpha=0.5)
concave_hull(island, 0.05).hvplot(tiles=True, alpha=0.5)
concave_hull(island, 0.10).hvplot(tiles=True, alpha=0.5)
concave_hull(island, 0.30).hvplot(tiles=True, alpha=0.5)
Out[12]:
Out[12]:
Out[12]:
Out[12]:
In [13]:
# So, the higher ratio, the more convex the curve.
# And if ratio=1.0 then we get the convex hull:
concave_hull(island, 1).hvplot(tiles=True, alpha=0.5)
island.convex_hull.to_frame("geometry").hvplot(tiles=True, alpha=0.5)
Out[13]:
Out[13]:
In [14]:
# So in a nutshell our problem is:
# 
# For each polygon:
#   1. Figure out an appropriate `size` for `sample_points()`. 
#   2. Figure out an appropriate `ratio` for `concave_hull()`. 
# 
# The first problem is not that difficult. Too few points means bad triangles. 
# Too many points means slow calculations. 
# So we can always just use more points than strictly necessary
#
# The second problem is more challenging though. If we test with more islands we will see that
# the ratio value depends on the area of the polygon, the number of points and the 
# distribution of the points (shape of the island).
# 
# Generally speaking, bigger islands need smaller ratios compared to smaller islands (in order to produce similar concave hulls).
In [15]:
# Quantifying concave hulls
# These are some ways we could try to comparing one concave hull Polygon with a different one:
# 
# concave_hull_area / original_area
# concave_hull_perimeter / original_perimeter
# concave_hull_no_points / original_no_points
#
# To calculate these ratios we will need a help function

def calc_area_and_perimeter(
    gdf: gpd.GeoDataFrame,
    geod: pyproj.Geod = pyproj.Geod(ellps="WGS84"),
    *,
    sort: bool = False,
) -> gpd.GeoDataFrame:
    assert gdf.crs is not None, "CRS must be defined!"
    area_and_perimeter = gdf.to_crs(4326).geometry.apply(geod.geometry_area_perimeter).apply(pd.Series)
    area_and_perimeter = area_and_perimeter.rename(columns={0: "area", 1: "perimeter"})
    area_and_perimeter.area *= -1
    gdf = gdf.assign(
        wgs84_area=area_and_perimeter.area,
        wgs84_perimeter=area_and_perimeter.perimeter,
        no_coords=gdf.count_coordinates() - 1,
    )
    if sort:
        gdf = gdf.sort_values("wgs84_area", ascending=False).reset_index(drop=True)
    return gdf

concave_0000 = calc_area_and_perimeter(concave_hull(island, 0.00))
concave_0050 = calc_area_and_perimeter(concave_hull(island, 0.05))
concave_0100 = calc_area_and_perimeter(concave_hull(island, 0.10))
concave_1000 = calc_area_and_perimeter(island.convex_hull.to_frame("geometry")).reset_index(drop=True)
pd.concat([concave_0000, concave_0050, concave_0100, concave_1000]).drop(columns=["geometry"]).divide(island.drop(columns=["geometry", "fid"]).reset_index(drop=True), axis=1)
Out[15]:
wgs84_area wgs84_perimeter no_coords
0 1.002667 0.927512 0.620221
0 1.092406 0.609269 0.038840
0 1.225798 0.479493 0.016229
0 1.611783 0.391222 0.002241
In [16]:
# here are the raw numbers
pd.concat([island, concave_0000, concave_0050, concave_0100, concave_1000])
Out[16]:
fid wgs84_area wgs84_perimeter no_coords geometry
36593 36593.0 4.757744e+08 279963.630413 14727 POLYGON ((25.36017 39.91077, 25.36003 39.91067...
0 NaN 4.770432e+08 259669.726128 9134 POLYGON ((25.03344 39.9857, 25.03357 39.98576,...
0 NaN 5.197388e+08 170573.261021 572 POLYGON ((25.03344 39.9857, 25.03717 39.98837,...
0 NaN 5.832033e+08 134240.606491 239 POLYGON ((25.03344 39.9857, 25.03717 39.98837,...
0 NaN 7.668450e+08 109527.817304 33 POLYGON ((25.34977 39.7817, 25.16542 39.79627,...
In [17]:
# Apart from that we can also use the hausdorf distance
# (i.e. the maximum distance of the closest nodes): https://en.wikipedia.org/wiki/Hausdorff_distance
#
# In a nutshell, the hausdorff distance is the maxium distance between an island node and its closest mesh node.
# 
# Since the calculation of the hausdorff distnace needs to happen on a Projected CRS we will use the appropriate UTM zone.
island_32635 = island.to_crs(32635)
float(concave_0000.to_crs(32635).hausdorff_distance(island_32635, align=False).iloc[0])
float(concave_0050.to_crs(32635).hausdorff_distance(island_32635, align=False).iloc[0])
float(concave_0100.to_crs(32635).hausdorff_distance(island_32635, align=False).iloc[0])
float(concave_1000.to_crs(32635).hausdorff_distance(island_32635, align=False).iloc[0])
Out[17]:
453.90479114261746
Out[17]:
3103.7407248587174
Out[17]:
6729.447185171601
Out[17]:
12334.955268317786