In [1]:
import hvplot.pandas
import geopandas as gpd
import numpy as np
import pandas as pd
import pyproj
import seareport_data as D
import shapely
In [2]:
# First let's get the OSM dataset
# - wgs84_area is the area of the polygons calculated on the WGS84 **ellipsoid**
# - wgs84_perimeter is the area of the perimeter calculated on the WGS84 **ellipsoid**
# This means that they are accurate value regardless of the CRS
OSM = D.osm_df()
OSM
/home/panos/Prog/poseidon/polyshrink/.venv/lib/python3.11/site-packages/pyogrio/raw.py:198: RuntimeWarning: driver SQLite does not support open option SPATIALITE return ogr_read(
Out[2]:
fid | wgs84_area | wgs84_perimeter | no_coords | geometry | |
---|---|---|---|---|---|
0 | 0 | 1145.781517 | 142.520084 | 6 | POLYGON ((-4.94237 55.72545, -4.94192 55.72558... |
1 | 1 | 5748.232118 | 362.277633 | 10 | POLYGON ((-4.94051 55.72458, -4.94049 55.72407... |
2 | 2 | 99341.705251 | 1320.830613 | 29 | POLYGON ((-4.7329 55.52496, -4.7338 55.52495, ... |
3 | 3 | 16150.508271 | 615.633188 | 15 | POLYGON ((-4.95385 55.1629, -4.95388 55.16341,... |
4 | 4 | 837.525248 | 294.416503 | 14 | POLYGON ((-1.75363 50.72293, -1.75358 50.72303... |
... | ... | ... | ... | ... | ... |
790044 | 790044 | 6617.409232 | 320.464159 | 14 | POLYGON ((-98.4276 72.97151, -98.42721 72.9714... |
790045 | 790045 | 13896.928896 | 457.350250 | 13 | POLYGON ((-98.42497 72.85995, -98.42458 72.860... |
790046 | 790046 | 5314.926419 | 378.743891 | 22 | POLYGON ((-98.44028 72.85203, -98.4401 72.8519... |
790047 | 790047 | 11917.621708 | 456.143971 | 12 | POLYGON ((-98.39741 72.88149, -98.39694 72.881... |
790048 | 790048 | 93200.135605 | 1441.291680 | 24 | POLYGON ((-78.74229 52.58088, -78.74251 52.580... |
790049 rows × 5 columns
In [3]:
# Let's also get the UTM tiles
UTM = D.utm_df()
UTM
Out[3]:
zone | row | code | geometry | |
---|---|---|---|---|
0 | 0 | A | esri:102021 | POLYGON ((0 -90, 0 -80, -180 -80, -180 -90, 0 ... |
1 | 0 | B | esri:102021 | POLYGON ((180 -90, 180 -80, 0 -80, 0 -90, 180 ... |
2 | 1 | C | epsg:32701 | POLYGON ((-174 -80, -174 -72, -180 -72, -180 -... |
3 | 1 | D | epsg:32701 | POLYGON ((-174 -72, -174 -64, -180 -64, -180 -... |
4 | 1 | E | epsg:32701 | POLYGON ((-174 -64, -174 -56, -180 -56, -180 -... |
... | ... | ... | ... | ... |
1196 | 60 | V | epsg:32660 | POLYGON ((180 56, 180 64, 174 64, 174 56, 180 ... |
1197 | 60 | W | epsg:32660 | POLYGON ((180 64, 180 72, 174 72, 174 64, 180 ... |
1198 | 60 | X | epsg:32660 | POLYGON ((180 72, 180 84, 174 84, 174 72, 180 ... |
1199 | 0 | X | esri:102018 | POLYGON ((0 84, 0 90, -180 90, -180 84, 0 84)) |
1200 | 0 | Y | esri:102018 | POLYGON ((180 84, 180 90, 0 90, 0 84, 180 84)) |
1201 rows × 4 columns
In [4]:
# now let's pick one island to demonstrate how concave hull works
#
# Tip to find the FID of a specific island, load the dataset on QGIS and use the "Idenfity Features Button": Ctrl + Shift + I
# To find the path to the dataset, you can use:
# D.osm()[0]
#
# So, we will work with Limnos:
island = OSM.iloc[[36593]]
island
island.hvplot.polygons(tiles=True, alpha=0.5)
Out[4]:
fid | wgs84_area | wgs84_perimeter | no_coords | geometry | |
---|---|---|---|---|---|
36593 | 36593 | 4.757744e+08 | 279963.630413 | 14727 | POLYGON ((25.36017 39.91077, 25.36003 39.91067... |
Out[4]:
In [5]:
# Now, the latest GEOS versions support concave hulls.
# This bindings for it has been implemented on shapely/geopandas, too
# https://shapely.readthedocs.io/en/2.0.6/reference/shapely.concave_hull.html
# https://geopandas.org/en/latest/docs/reference/api/geopandas.GeoSeries.concave_hull.html
# Let's see how it works
island.concave_hull().to_frame("geometry").hvplot.polygons(tiles=True, alpha=0.5)
Out[5]:
In [6]:
# That's pretty horrible... What are these lines?
# Let's see if we can tweak it:
island.concave_hull?
Signature: island.concave_hull(ratio=0.0, allow_holes=False) Docstring: Returns a ``GeoSeries`` of geometries representing the concave hull of each geometry. The concave hull of a geometry is the smallest concave `Polygon` containing all the points in each geometry, unless the number of points in the geometric object is less than three. For two points, the concave hull collapses to a `LineString`; for 1, a `Point`. The hull is constructed by removing border triangles of the Delaunay Triangulation of the points as long as their "size" is larger than the maximum edge length ratio and optionally allowing holes. The edge length factor is a fraction of the length difference between the longest and shortest edges in the Delaunay Triangulation of the input points. For further information on the algorithm used, see https://libgeos.org/doxygen/classgeos_1_1algorithm_1_1hull_1_1ConcaveHull.html Parameters ---------- ratio : float, (optional, default 0.0) Number in the range [0, 1]. Higher numbers will include fewer vertices in the hull. allow_holes : bool, (optional, default False) If set to True, the concave hull may have holes. Examples -------- >>> from shapely.geometry import Polygon, LineString, Point, MultiPoint >>> s = geopandas.GeoSeries( ... [ ... Polygon([(0, 0), (1, 1), (0, 1)]), ... LineString([(0, 0), (1, 1), (1, 0)]), ... MultiPoint([(0, 0), (1, 1), (0, 1), (1, 0), (0.5, 0.5)]), ... MultiPoint([(0, 0), (1, 1)]), ... Point(0, 0), ... ], ... crs=3857 ... ) >>> s 0 POLYGON ((0 0, 1 1, 0 1, 0 0)) 1 LINESTRING (0 0, 1 1, 1 0) 2 MULTIPOINT ((0 0), (1 1), (0 1), (1 0), (0.5 0... 3 MULTIPOINT ((0 0), (1 1)) 4 POINT (0 0) dtype: geometry >>> s.concave_hull() 0 POLYGON ((0 1, 1 1, 0 0, 0 1)) 1 POLYGON ((0 0, 1 1, 1 0, 0 0)) 2 POLYGON ((0.5 0.5, 0 1, 1 1, 1 0, 0 0, 0.5 0.5)) 3 LINESTRING (0 0, 1 1) 4 POINT (0 0) dtype: geometry See also -------- GeoSeries.convex_hull : convex hull geometry File: ~/Prog/poseidon/polyshrink/.venv/lib/python3.11/site-packages/geopandas/base.py Type: method
In [7]:
# Hmm. there is this `ratio` which seems to be useful.
# Nevertheless it will not help in our case (we will still use it later on, but it will not solve this specific problem).
#
# The problem with this concave hull algorithm is that it is supposed to work with
# a "soup" of points thar are more or less uniformly distributed.
# More specifically, the algorithm is doing a Delauney triangulation on all the points
# and it starts to remove the outer triangles
#
# The problem is that ALL our points are on the boundary of the polygon and there are no points inside it.
# So the triangles are few and they are huge, too.
#
# That's the problem
# But it can be solved. We just need to add triangles inside.
# Let's do that. geopandas has the right method for this:
island.sample_points?
Signature: island.sample_points(size, method='uniform', seed=None, rng=None, **kwargs) Docstring: Sample points from each geometry. Generate a MultiPoint per each geometry containing points sampled from the geometry. You can either sample randomly from a uniform distribution or use an advanced sampling algorithm from the ``pointpats`` package. For polygons, this samples within the area of the polygon. For lines, this samples along the length of the linestring. For multi-part geometries, the weights of each part are selected according to their relevant attribute (area for Polygons, length for LineStrings), and then points are sampled from each part. Any other geometry type (e.g. Point, GeometryCollection) is ignored, and an empty MultiPoint geometry is returned. Parameters ---------- size : int | array-like The size of the sample requested. Indicates the number of samples to draw from each geometry. If an array of the same length as a GeoSeries is passed, it denotes the size of a sample per geometry. method : str, default "uniform" The sampling method. ``uniform`` samples uniformly at random from a geometry using ``numpy.random.uniform``. Other allowed strings (e.g. ``"cluster_poisson"``) denote sampling function name from the ``pointpats.random`` module (see http://pysal.org/pointpats/api.html#random-distributions). Pointpats methods are implemented for (Multi)Polygons only and will return an empty MultiPoint for other geometry types. rng : {None, int, array_like[ints], SeedSequence, BitGenerator, Generator}, optional A random generator or seed to initialize the numpy BitGenerator. If None, then fresh, unpredictable entropy will be pulled from the OS. **kwargs : dict Options for the pointpats sampling algorithms. Returns ------- GeoSeries Points sampled within (or along) each geometry. Examples -------- >>> from shapely.geometry import Polygon >>> s = geopandas.GeoSeries( ... [ ... Polygon([(1, -1), (1, 0), (0, 0)]), ... Polygon([(3, -1), (4, 0), (3, 1)]), ... ] ... ) >>> s.sample_points(size=10) # doctest: +SKIP 0 MULTIPOINT ((0.1045 -0.10294), (0.35249 -0.264... 1 MULTIPOINT ((3.03261 -0.43069), (3.10068 0.114... Name: sampled_points, dtype: geometry File: ~/Prog/poseidon/polyshrink/.venv/lib/python3.11/site-packages/geopandas/base.py Type: method
In [8]:
# So for example:
island.sample_points(size=500, rng=42).to_frame("geometry").hvplot.points(tiles=True)
Out[8]:
In [9]:
# Hmm... that seems to work well enough.
# The appropriate `size` needs to be chosen for each polygon separately, though,
# but it shouldn't be too hard to find an appropriate formula
def concave_hull(gdf, ratio):
no_points = np.maximum(gdf.no_coords, np.minimum(10_000, (gdf.wgs84_area // 10_0000).astype(int)))
inner = gdf.sample_points(no_points)
outer = gdf.boundary.apply(lambda g: shapely.MultiPoint(g.coords))
#outer = gdf.points
total = inner.union(outer)
a = total.concave_hull(ratio=ratio).union(gdf).union_all()
out = gpd.GeoDataFrame(
geometry=[total.concave_hull(ratio=ratio).union(gdf).union_all()],
crs=gdf.crs,
)
out = out.explode().exterior.polygonize().to_frame("geometry")
return out
In [10]:
# For the record, we probably need to do the concave hull calculations in projected space, but for demonstration purposes it should be fine to stay on 4326
In [11]:
concave_hull(island, 0.1).hvplot(tiles=True, alpha=0.5)
Out[11]:
In [12]:
# Hey! that seemed to work!!!
# Let's see now what `ratio` does
concave_hull(island, 0.00).hvplot(tiles=True, alpha=0.5)
concave_hull(island, 0.05).hvplot(tiles=True, alpha=0.5)
concave_hull(island, 0.10).hvplot(tiles=True, alpha=0.5)
concave_hull(island, 0.30).hvplot(tiles=True, alpha=0.5)
Out[12]:
Out[12]:
Out[12]:
Out[12]:
In [13]:
# So, the higher ratio, the more convex the curve.
# And if ratio=1.0 then we get the convex hull:
concave_hull(island, 1).hvplot(tiles=True, alpha=0.5)
island.convex_hull.to_frame("geometry").hvplot(tiles=True, alpha=0.5)
Out[13]:
Out[13]:
In [14]:
# So in a nutshell our problem is:
#
# For each polygon:
# 1. Figure out an appropriate `size` for `sample_points()`.
# 2. Figure out an appropriate `ratio` for `concave_hull()`.
#
# The first problem is not that difficult. Too few points means bad triangles.
# Too many points means slow calculations.
# So we can always just use more points than strictly necessary
#
# The second problem is more challenging though. If we test with more islands we will see that
# the ratio value depends on the area of the polygon, the number of points and the
# distribution of the points (shape of the island).
#
# Generally speaking, bigger islands need smaller ratios compared to smaller islands (in order to produce similar concave hulls).
In [15]:
# Quantifying concave hulls
# These are some ways we could try to comparing one concave hull Polygon with a different one:
#
# concave_hull_area / original_area
# concave_hull_perimeter / original_perimeter
# concave_hull_no_points / original_no_points
#
# To calculate these ratios we will need a help function
def calc_area_and_perimeter(
gdf: gpd.GeoDataFrame,
geod: pyproj.Geod = pyproj.Geod(ellps="WGS84"),
*,
sort: bool = False,
) -> gpd.GeoDataFrame:
assert gdf.crs is not None, "CRS must be defined!"
area_and_perimeter = gdf.to_crs(4326).geometry.apply(geod.geometry_area_perimeter).apply(pd.Series)
area_and_perimeter = area_and_perimeter.rename(columns={0: "area", 1: "perimeter"})
area_and_perimeter.area *= -1
gdf = gdf.assign(
wgs84_area=area_and_perimeter.area,
wgs84_perimeter=area_and_perimeter.perimeter,
no_coords=gdf.count_coordinates() - 1,
)
if sort:
gdf = gdf.sort_values("wgs84_area", ascending=False).reset_index(drop=True)
return gdf
concave_0000 = calc_area_and_perimeter(concave_hull(island, 0.00))
concave_0050 = calc_area_and_perimeter(concave_hull(island, 0.05))
concave_0100 = calc_area_and_perimeter(concave_hull(island, 0.10))
concave_1000 = calc_area_and_perimeter(island.convex_hull.to_frame("geometry")).reset_index(drop=True)
pd.concat([concave_0000, concave_0050, concave_0100, concave_1000]).drop(columns=["geometry"]).divide(island.drop(columns=["geometry", "fid"]).reset_index(drop=True), axis=1)
Out[15]:
wgs84_area | wgs84_perimeter | no_coords | |
---|---|---|---|
0 | 1.002667 | 0.927512 | 0.620221 |
0 | 1.092406 | 0.609269 | 0.038840 |
0 | 1.225798 | 0.479493 | 0.016229 |
0 | 1.611783 | 0.391222 | 0.002241 |
In [16]:
# here are the raw numbers
pd.concat([island, concave_0000, concave_0050, concave_0100, concave_1000])
Out[16]:
fid | wgs84_area | wgs84_perimeter | no_coords | geometry | |
---|---|---|---|---|---|
36593 | 36593.0 | 4.757744e+08 | 279963.630413 | 14727 | POLYGON ((25.36017 39.91077, 25.36003 39.91067... |
0 | NaN | 4.770432e+08 | 259669.726128 | 9134 | POLYGON ((25.03344 39.9857, 25.03357 39.98576,... |
0 | NaN | 5.197388e+08 | 170573.261021 | 572 | POLYGON ((25.03344 39.9857, 25.03717 39.98837,... |
0 | NaN | 5.832033e+08 | 134240.606491 | 239 | POLYGON ((25.03344 39.9857, 25.03717 39.98837,... |
0 | NaN | 7.668450e+08 | 109527.817304 | 33 | POLYGON ((25.34977 39.7817, 25.16542 39.79627,... |
In [17]:
# Apart from that we can also use the hausdorf distance
# (i.e. the maximum distance of the closest nodes): https://en.wikipedia.org/wiki/Hausdorff_distance
#
# In a nutshell, the hausdorff distance is the maxium distance between an island node and its closest mesh node.
#
# Since the calculation of the hausdorff distnace needs to happen on a Projected CRS we will use the appropriate UTM zone.
island_32635 = island.to_crs(32635)
float(concave_0000.to_crs(32635).hausdorff_distance(island_32635, align=False).iloc[0])
float(concave_0050.to_crs(32635).hausdorff_distance(island_32635, align=False).iloc[0])
float(concave_0100.to_crs(32635).hausdorff_distance(island_32635, align=False).iloc[0])
float(concave_1000.to_crs(32635).hausdorff_distance(island_32635, align=False).iloc[0])
Out[17]:
453.90479114261746
Out[17]:
3103.7407248587174
Out[17]:
6729.447185171601
Out[17]:
12334.955268317786