Mirror of https://github.com/osm-search/Nominatim.git
Synced 2026-02-14 18:37:58 +00:00

Compare commits: settings-m… → v5.0.0 (21 commits)

- ee8915f2b6
- 5475bf7b9c
- 95e2d8c846
- 7552818866
- db3991af74
- 4523b9aaed
- 8b1cabebd6
- 0cf636a80c
- c2cb6722fe
- f8337bedb2
- efc09a5cfc
- 86ad9efa8a
- d984100e23
- 499110f549
- 267e5dac0d
- 32d3eb46d5
- c8a0dc8af1
- 14ecfc7834
- cad44eb00c
- f76dbb0a16
- 8dd218a1d0
@@ -87,7 +87,6 @@ Checklist for releases:
 * [ ] increase versions in
   * `src/nominatim_api/version.py`
   * `src/nominatim_db/version.py`
-  * CMakeLists.txt
 * [ ] update `ChangeLog` (copy information from patch releases from release branch)
 * [ ] complete `docs/admin/Migration.md`
 * [ ] update EOL dates in `SECURITY.md`
ChangeLog
@@ -1,3 +1,28 @@
+5.0.0
+ * increase required versions for PostgreSQL (12+), PostGIS (3.0+)
+ * remove installation via cmake and debundle osm2pgsql
+ * remove deprecated PHP frontend
+ * remove deprecated legacy tokenizer
+ * add configurable pre-processing of queries
+ * add query pre-processor to split up Japanese addresses
+ * rewrite of osm2pgsql style implementation
+   (also adds support for osm2pgsql-themepark)
+ * reduce the number of SQL queries needed to complete a 'lookup' call
+ * improve computation of centroid for lines with only two points
+ * improve bbox output for postcode areas
+ * improve result order by returning the largest object when other things are
+   equal
+ * add fallback for reverse geocoding to default country tables
+ * exclude postcode areas from reverse geocoding
+ * disable search endpoint when database is reverse-only (regression)
+ * minor performance improvements to area split algorithm
+ * switch table and index creation to use autocommit mode to avoid deadlocks
+ * drop overly long ways during import
+ * restrict automatic migrations to versions 4.3+
+ * switch linting from pylint to flake8
+ * switch tests to use a wikimedia test file in the new CSV style
+ * various fixes and improvements to documentation
+
 4.5.0
  * allow building Nominatim as a pip package
  * make osm2pgsql building optional
SECURITY.md

@@ -9,10 +9,10 @@ versions.
 | Version | End of support for security updates |
 | ------- | ----------------------------------- |
+| 5.0.x   | 2027-02-06                          |
 | 4.5.x   | 2026-09-12                          |
 | 4.4.x   | 2026-03-07                          |
 | 4.3.x   | 2025-09-07                          |
 | 4.2.x   | 2024-11-24                          |

 ## Reporting a Vulnerability
docs/admin/Migration.md

@@ -9,19 +9,15 @@ the following steps:
 * Update the frontend: `pip install -U nominatim-api`
 * (optionally) Restart updates

-If you are still using CMake for the installation of Nominatim, then you
-need to update the software in one step before migrating the database.
-It is not recommended to do this while the machine is serving requests.
-
 Below you find additional migrations and hints about other structural and
 breaking changes. **Please read them before running the migration.**

 !!! note
     If you are migrating from a version <4.3, you need to install 4.3
-    first and migrate to 4.3 first. Then you can migrate to the current
+    and migrate to 4.3 first. Then you can migrate to the current
     version. It is strongly recommended to do a reimport instead.

-## 4.5.0 -> master
+## 4.5.0 -> 5.0.0

 ### PHP frontend removed
@@ -33,6 +29,42 @@ needed. It currently emits a warning and does otherwise nothing. It will be
 removed in later versions of Nominatim. So make sure you remove it from your
 scripts.

+### CMake building removed
+
+Nominatim can now only be installed via pip. Please follow the installation
+instructions for the current version to change to pip.
+
+### osm2pgsql no longer vendored in
+
+Nominatim no longer ships its own version of osm2pgsql. Please install a
+stock version of osm2pgsql from your distribution. See the
+[installation instructions for osm2pgsql](https://osm2pgsql.org/doc/install.html)
+for details. A minimum version of 1.8 is required. The current stable versions
+of Ubuntu and Debian already ship with an appropriate version. For older
+installations, you may have to compile a newer osm2pgsql yourself.
+
+### Legacy tokenizer removed
+
+The `legacy` tokenizer is no longer enabled. This tokenizer was superseded
+by the `ICU` tokenizer a long time ago. In the unlikely case that your database
+still uses the `legacy` tokenizer, you must reimport your database.
+
+### osm2pgsql style overhauled
+
+There are some fundamental changes to how customized osm2pgsql styles should
+be written. The changes are mostly backwards compatible, i.e. custom styles
+should still work with the new implementation. The only exception is a
+customization of the `process_tags()` function. This function is no longer
+considered public and neither are the helper functions used in it.
+They currently still work but will be removed at some point. If you have
+been making changes to `process_tags()`, please review your style and try
+to switch to the new convenience functions.
+
+For more information on the changes, see the
+[pull request](https://github.com/osm-search/Nominatim/pull/3615)
+and read the new
+[customization documentation](https://nominatim.org/release-docs/latest/customize/Import-Styles/).
+
 ## 4.4.0 -> 4.5.0

 ### New structure for Python packages
docs/admin/Update.md

@@ -68,10 +68,10 @@ the update interval no new data has been published yet, it will go to sleep
 until the next expected update and only then attempt to download the next batch.

 The one-time mode is particularly useful if you want to run updates continuously
-but need to schedule other work in between updates. For example, the main
-service at osm.org uses it, to regularly recompute postcodes -- a process that
-must not be run while updates are in progress. Its update script
-looks like this:
+but need to schedule other work in between updates. For example, you might
+want to regularly recompute postcodes -- a process that
+must not be run while updates are in progress. An update script refreshing
+postcodes regularly might look like this:

 ```sh
 #!/bin/bash
@@ -109,17 +109,19 @@ Unit=nominatim-updates.service
 WantedBy=multi-user.target
 ```

-And then a similar service definition: `/etc/systemd/system/nominatim-updates.service`:
+`OnUnitActiveSec` defines how often the individual update command is run.
+
+Then add a service definition for the timer in `/etc/systemd/system/nominatim-updates.service`:

 ```
 [Unit]
 Description=Single updates of Nominatim

 [Service]
-WorkingDirectory=/srv/nominatim
-ExecStart=nominatim replication --once
-StandardOutput=append:/var/log/nominatim-updates.log
-StandardError=append:/var/log/nominatim-updates.error.log
+WorkingDirectory=/srv/nominatim-project
+ExecStart=/srv/nominatim-venv/bin/nominatim replication --once
+StandardOutput=journald
+StandardError=inherit
 User=nominatim
 Group=nominatim
 Type=simple
@@ -128,9 +130,9 @@ Type=simple
 WantedBy=multi-user.target
 ```

-Replace the `WorkingDirectory` with your project directory. Also adapt user and
-group names as required. `OnUnitActiveSec` defines how often the individual
-update command is run.
+Replace the `WorkingDirectory` with your project directory. `ExecStart` points
+to the nominatim binary that was installed in your virtualenv earlier.
+Finally, you might need to adapt user and group names as required.

 Now activate the service and start the updates:
@@ -140,12 +142,13 @@ sudo systemctl enable nominatim-updates.timer
 sudo systemctl start nominatim-updates.timer
 ```

-You can stop future data updates, while allowing any current, in-progress
+You can stop future data updates while allowing any current, in-progress
 update steps to finish, by running `sudo systemctl stop
 nominatim-updates.timer` and waiting until `nominatim-updates.service` isn't
-running (`sudo systemctl is-active nominatim-updates.service`). Current output
-from the update can be seen like above (`systemctl status
-nominatim-updates.service`).
+running (`sudo systemctl is-active nominatim-updates.service`).
+
+To check the output from the update process, use journalctl: `journalctl -u
+nominatim-updates.service`


 #### Catch-up mode
@@ -155,13 +158,13 @@ all changes from the server until the database is up-to-date. The catch-up mode
 still respects the parameter `NOMINATIM_REPLICATION_MAX_DIFF`. It downloads and
 applies the changes in appropriate batches until all is done.

-The catch-up mode is foremost useful to bring the database up to speed after the
+The catch-up mode is foremost useful to bring the database up to date after the
 initial import. Given that the service usually is not in production at this
 point, you can temporarily be a bit more generous with the batch size and
 number of threads you use for the updates by running catch-up like this:

 ```
-cd /srv/nominatim
+cd /srv/nominatim-project
 NOMINATIM_REPLICATION_MAX_DIFF=5000 nominatim replication --catch-up --threads 15
 ```
@@ -173,13 +176,13 @@ replication catch-up at whatever interval you desire.
 When running scheduled updates with catch-up, it is a good idea to choose
 a replication source with an update frequency that is an order of magnitude
 lower. For example, if you want to update once a day, use an hourly updated
-source. This makes sure that you don't miss an entire day of updates when
+source. This ensures that you don't miss an entire day of updates when
 the source is unexpectedly late to publish its update.

 If you want to use the source with the same update frequency (e.g. a daily
 updated source with daily updates), use the
-continuous update mode. It ensures to re-request the newest update until it
-is published.
+once mode together with a frequently run systemd script as described above.
+It keeps re-requesting the newest update until it has been published.


 #### Continuous updates
@@ -197,36 +200,3 @@ parameters:
 The update application keeps running forever and retrieves and applies
 new updates from the server as they are published.

-You can run this command as a simple systemd service. Create a service
-description like that in `/etc/systemd/system/nominatim-updates.service`:
-
-```
-[Unit]
-Description=Continuous updates of Nominatim
-
-[Service]
-WorkingDirectory=/srv/nominatim
-ExecStart=nominatim replication
-StandardOutput=append:/var/log/nominatim-updates.log
-StandardError=append:/var/log/nominatim-updates.error.log
-User=nominatim
-Group=nominatim
-Type=simple
-
-[Install]
-WantedBy=multi-user.target
-```
-
-Replace the `WorkingDirectory` with your project directory. Also adapt user
-and group names as required.
-
-Now activate the service and start the updates:
-
-```
-sudo systemctl daemon-reload
-sudo systemctl enable nominatim-updates
-sudo systemctl start nominatim-updates
-```
@@ -326,7 +326,7 @@ defined primary names are forgotten.)
 | Name | Description |
 | :----- | :---------- |
-| core | Basic set of recogniced names for all places. |
+| core | Basic set of recognized names for all places. |
 | address | Additional names useful when indexing full addresses. |
 | poi | Extended set of recognized names for pois. Use on top of the core set. |
@@ -50,7 +50,7 @@ queries. This happens in two stages:
    as during the import process but may involve other processing like,
    for example, word break detection.
 2. The **token analysis** step breaks down the query parts into tokens,
-   looks them up in the database and assignes them possible functions and
+   looks them up in the database and assigns them possible functions and
    probabilities.

 Query processing can be further customized while the rest of the analysis
@@ -425,7 +425,7 @@ function Place:write_row(k, v)
     if self.geometry == nil then
         self.geometry = self.geom_func(self.object)
     end
-    if self.geometry:is_null() then
+    if self.geometry == nil or self.geometry:is_null() then
         return 0
     end
@@ -608,6 +608,9 @@ function module.process_way(object)
     if geom:is_null() then
         geom = o:as_linestring()
+        if geom:is_null() or geom:length() > 30 then
+            return nil
+        end
     end

     return geom
@@ -17,28 +17,6 @@ CREATE TYPE nearfeaturecentr AS (
   centroid GEOMETRY
 );

--- feature intersects geometry
--- for areas and linestrings they must touch at least along a line
-CREATE OR REPLACE FUNCTION is_relevant_geometry(de9im TEXT, geom_type TEXT)
-RETURNS BOOLEAN
-AS $$
-BEGIN
-  IF substring(de9im from 1 for 2) != 'FF' THEN
-    RETURN TRUE;
-  END IF;
-
-  IF geom_type = 'ST_Point' THEN
-    RETURN substring(de9im from 4 for 1) = '0';
-  END IF;
-
-  IF geom_type in ('ST_LineString', 'ST_MultiLineString') THEN
-    RETURN substring(de9im from 4 for 1) = '1';
-  END IF;
-
-  RETURN substring(de9im from 4 for 1) = '2';
-END
-$$ LANGUAGE plpgsql IMMUTABLE;
-
 CREATE OR REPLACE function getNearFeatures(in_partition INTEGER, feature GEOMETRY,
                                            feature_centroid GEOMETRY,
                                            maxrank INTEGER)
@@ -59,7 +37,12 @@ BEGIN
                   isguess, postcode, centroid
             FROM location_area_large_{{ partition }}
             WHERE geometry && feature
-              AND is_relevant_geometry(ST_Relate(geometry, feature), ST_GeometryType(feature))
+              AND CASE WHEN ST_Dimension(feature) = 0
+                       THEN _ST_Covers(geometry, feature)
+                       WHEN ST_Dimension(feature) = 2
+                       THEN ST_Relate(geometry, feature, 'T********')
+                       ELSE ST_NPoints(ST_Intersection(geometry, feature)) > 1
+                  END
               AND rank_address < maxrank
       -- Postcodes currently still use rank_search to define for which
       -- features they are relevant.
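Aside: a rough Python sketch (using shapely, which wraps the same GEOS predicates; not part of the patch) of the relevance test that the new CASE expression implements — the spatial test is chosen by feature dimension instead of parsing a DE-9IM string:

```python
import shapely
from shapely.geometry import LineString, Point, Polygon

def is_relevant(area, feature) -> bool:
    """Mimic the CASE expression: pick the spatial test by feature dimension."""
    dim = shapely.get_dimensions(feature)
    if dim == 0:
        return area.covers(feature)                        # points must lie within the area
    if dim == 2:
        return area.relate_pattern(feature, 'T********')   # interiors must intersect
    # lines must share more than a single point with the area
    return shapely.get_num_coordinates(area.intersection(feature)) > 1

square = Polygon([(0, 0), (2, 0), (2, 2), (0, 2)])
print(is_relevant(square, Point(1, 1)))                    # True
print(is_relevant(square, LineString([(0, 0), (2, 0)])))   # True: shares an edge
print(is_relevant(square, LineString([(2, 2), (3, 3)])))   # False: touches in one point only
```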
@@ -142,14 +125,16 @@ BEGIN

   IF in_rank_search <= 4 and not in_estimate THEN
     INSERT INTO location_area_country (place_id, country_code, geometry)
-      values (in_place_id, in_country_code, in_geometry);
+      (SELECT in_place_id, in_country_code, geom
+         FROM split_geometry(in_geometry) as geom);
     RETURN TRUE;
   END IF;

 {% for partition in db.partitions %}
   IF in_partition = {{ partition }} THEN
     INSERT INTO location_area_large_{{ partition }} (partition, place_id, country_code, keywords, rank_search, rank_address, isguess, postcode, centroid, geometry)
-      values (in_partition, in_place_id, in_country_code, in_keywords, in_rank_search, in_rank_address, in_estimate, postcode, in_centroid, in_geometry);
+      (SELECT in_partition, in_place_id, in_country_code, in_keywords, in_rank_search, in_rank_address, in_estimate, postcode, in_centroid, geom
+         FROM split_geometry(in_geometry) as geom);
     RETURN TRUE;
   END IF;
 {% endfor %}
@@ -348,8 +348,6 @@ CREATE OR REPLACE FUNCTION add_location(place_id BIGINT, country_code varchar(2)
 RETURNS BOOLEAN
 AS $$
 DECLARE
-  locationid INTEGER;
-  secgeo GEOMETRY;
   postcode TEXT;
 BEGIN
   PERFORM deleteLocationArea(partition, place_id, rank_search);
@@ -360,18 +358,19 @@ BEGIN
     postcode := upper(trim (in_postcode));
   END IF;

-  IF ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon') THEN
-    FOR secgeo IN select split_geometry(geometry) AS geom LOOP
-      PERFORM insertLocationAreaLarge(partition, place_id, country_code, keywords, rank_search, rank_address, false, postcode, centroid, secgeo);
-    END LOOP;
-
-  ELSEIF ST_GeometryType(geometry) = 'ST_Point' THEN
-    secgeo := place_node_fuzzy_area(geometry, rank_search);
-    PERFORM insertLocationAreaLarge(partition, place_id, country_code, keywords, rank_search, rank_address, true, postcode, centroid, secgeo);
-
+  IF ST_Dimension(geometry) = 2 THEN
+    RETURN insertLocationAreaLarge(partition, place_id, country_code, keywords,
+                                   rank_search, rank_address, false, postcode,
+                                   centroid, geometry);
   END IF;

-  RETURN true;
+  IF ST_Dimension(geometry) = 0 THEN
+    RETURN insertLocationAreaLarge(partition, place_id, country_code, keywords,
+                                   rank_search, rank_address, true, postcode,
+                                   centroid, place_node_fuzzy_area(geometry, rank_search));
+  END IF;
+
+  RETURN false;
 END;
 $$
 LANGUAGE plpgsql;
@@ -394,19 +393,21 @@ DECLARE
   geo RECORD;
   area FLOAT;
   remainingdepth INTEGER;
-  added INTEGER;
 BEGIN

   -- RAISE WARNING 'quad_split_geometry: maxarea=%, depth=%',maxarea,maxdepth;

-  IF (ST_GeometryType(geometry) not in ('ST_Polygon','ST_MultiPolygon') OR NOT ST_IsValid(geometry)) THEN
+  IF not ST_IsValid(geometry) THEN
+    RETURN;
+  END IF;
+
+  IF ST_Dimension(geometry) != 2 OR maxdepth <= 1 THEN
     RETURN NEXT geometry;
     RETURN;
   END IF;

   remainingdepth := maxdepth - 1;
   area := ST_AREA(geometry);
-  IF remainingdepth < 1 OR area < maxarea THEN
+  IF area < maxarea THEN
     RETURN NEXT geometry;
     RETURN;
   END IF;
@@ -426,7 +427,6 @@ BEGIN
     xmid := (xmin+xmax)/2;
     ymid := (ymin+ymax)/2;

-    added := 0;
     FOR seg IN 1..4 LOOP

       IF seg = 1 THEN
@@ -442,16 +442,13 @@ BEGIN
         secbox := ST_SetSRID(ST_MakeBox2D(ST_Point(xmid,ymid),ST_Point(xmax,ymax)),4326);
       END IF;

-      IF st_intersects(geometry, secbox) THEN
-        secgeo := st_intersection(geometry, secbox);
-        IF NOT ST_IsEmpty(secgeo) AND ST_GeometryType(secgeo) in ('ST_Polygon','ST_MultiPolygon') THEN
-          FOR geo IN select quad_split_geometry(secgeo, maxarea, remainingdepth) as geom LOOP
-            IF NOT ST_IsEmpty(geo.geom) AND ST_GeometryType(geo.geom) in ('ST_Polygon','ST_MultiPolygon') THEN
-              added := added + 1;
-              RETURN NEXT geo.geom;
-            END IF;
-          END LOOP;
-        END IF;
+      secgeo := st_intersection(geometry, secbox);
+      IF NOT ST_IsEmpty(secgeo) AND ST_Dimension(secgeo) = 2 THEN
+        FOR geo IN SELECT quad_split_geometry(secgeo, maxarea, remainingdepth) as geom LOOP
+          IF NOT ST_IsEmpty(geo.geom) AND ST_Dimension(geo.geom) = 2 THEN
+            RETURN NEXT geo.geom;
+          END IF;
+        END LOOP;
       END IF;
     END LOOP;
@@ -467,10 +464,22 @@ CREATE OR REPLACE FUNCTION split_geometry(geometry GEOMETRY)
 DECLARE
   geo RECORD;
 BEGIN
-  -- 10000000000 is ~~ 1x1 degree
-  FOR geo IN select quad_split_geometry(geometry, 0.25, 20) as geom LOOP
-    RETURN NEXT geo.geom;
-  END LOOP;
+  IF ST_GeometryType(geometry) = 'ST_MultiPolygon'
+     and ST_Area(geometry) * 10 > ST_Area(Box2D(geometry))
+  THEN
+    FOR geo IN
+      SELECT quad_split_geometry(g, 0.25, 20) as geom
+        FROM (SELECT (ST_Dump(geometry)).geom::geometry(Polygon, 4326) AS g) xx
+    LOOP
+      RETURN NEXT geo.geom;
+    END LOOP;
+  ELSE
+    FOR geo IN
+      SELECT quad_split_geometry(geometry, 0.25, 20) as geom
+    LOOP
+      RETURN NEXT geo.geom;
+    END LOOP;
+  END IF;
  RETURN;
 END;
 $$
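The quad-split strategy behind `quad_split_geometry()`/`split_geometry()` is easier to see in compact form. A minimal Python sketch (shapely assumed; the function name and parameters are illustrative, not repo code):

```python
import shapely
from shapely.geometry import box

def quad_split(geom, maxarea: float, maxdepth: int):
    """Recursively clip a polygon against the four quadrants of its bbox."""
    if not geom.is_valid:
        return
    if shapely.get_dimensions(geom) != 2 or maxdepth <= 1 or geom.area < maxarea:
        yield geom          # small enough (or not an area / depth exhausted): emit as-is
        return
    xmin, ymin, xmax, ymax = geom.bounds
    xmid, ymid = (xmin + xmax) / 2, (ymin + ymax) / 2
    for quadrant in (box(xmin, ymin, xmid, ymid), box(xmid, ymin, xmax, ymid),
                     box(xmin, ymid, xmid, ymax), box(xmid, ymid, xmax, ymax)):
        piece = geom.intersection(quadrant)
        if not piece.is_empty and shapely.get_dimensions(piece) == 2:
            yield from quad_split(piece, maxarea, maxdepth - 1)

pieces = list(quad_split(box(0, 0, 4, 4), maxarea=4.0, maxdepth=5))
print(len(pieces))  # 16 pieces of area 1 each
```

The dimension check mirrors the patch's move from `ST_GeometryType(...) in ('ST_Polygon','ST_MultiPolygon')` to `ST_Dimension(...) = 2`.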
@@ -23,8 +23,8 @@
         "allotments" : 22,
         "neighbourhood" : [20, 22],
         "quarter" : [20, 22],
-        "isolated_dwelling" : [22, 20],
-        "farm" : [22, 20],
+        "isolated_dwelling" : [22, 25],
+        "farm" : [22, 25],
         "city_block" : 25,
         "mountain_pass" : 25,
         "square" : 25,
settings/icu_tokenizer.yaml

@@ -1,4 +1,5 @@
 query-preprocessing:
+    - step: split_japanese_phrases
     - step: normalize
 normalization:
     - ":: lower ()"
@@ -9,16 +10,17 @@ normalization:
     - "'nº' > 'no'"
     - "ª > a"
     - "º > o"
-    - "[[:Punctuation:][:Symbol:]\u02bc] > ' '"
+    - "[[:Punctuation:][:Symbol:][\u02bc] - [-:]]+ > '-'"
     - "ß > 'ss'" # German eszett is unambiguously equal to double ss
-    - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >"
+    - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:] [-:]] >"
     - "[:Lm:] >"
     - ":: [[:Number:]] Latin ()"
     - ":: [[:Number:]] Ascii ();"
     - ":: [[:Number:]] NFD ();"
     - "[[:Nonspacing Mark:] [:Cf:]] >;"
-    - "[:Space:]+ > ' '"
+    - "[-:]?[:Space:]+[-:]? > ' '"
 transliteration:
+    - "[-:] > ' '"
     - ":: Latin ()"
     - !include icu-rules/extended-unicode-to-asccii.yaml
     - ":: Ascii ()"
src/nominatim_api/query_preprocessing/split_japanese_phrases.py (new file)

@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+This file divides Japanese addresses into three categories:
+prefecture, municipality, and other.
+The division is not strict but simple, using these keywords.
+"""
+from typing import List
+import re
+
+from .config import QueryConfig
+from .base import QueryProcessingFunc
+from ..search.query import Phrase
+
+MATCH_PATTERNS = [
+    r'''
+        (...??[都道府県縣])            # [group1] prefecture
+        (.+?[市区區町村])              # [group2] municipalities (city/wards/towns/villages)
+        (.+)                           # [group3] other words
+    ''',
+    r'''
+        (...??[都道府県縣])            # [group1] prefecture
+        (.+)                           # [group3] other words
+    ''',
+    r'''
+        (.+?[市区區町村])              # [group2] municipalities (city/wards/towns/villages)
+        (.+)                           # [group3] other words
+    '''
+]
+
+
+class _JapanesePreprocessing:
+
+    def __init__(self, config: QueryConfig) -> None:
+        self.config = config
+
+    def split_phrase(self, phrase: Phrase) -> Phrase:
+        """ Perform a division of the given text using a regular expression.
+        """
+        for pattern in MATCH_PATTERNS:
+            result = re.match(pattern, phrase.text, re.VERBOSE)
+            if result is not None:
+                return Phrase(phrase.ptype, ':'.join(result.groups()))
+
+        return phrase
+
+    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
+        """ Split a Japanese address using japanese_tokenizer.
+        """
+        return [self.split_phrase(p) for p in phrases]
+
+
+def create(config: QueryConfig) -> QueryProcessingFunc:
+    """ Create a function for Japanese preprocessing.
+    """
+    return _JapanesePreprocessing(config)
@@ -433,6 +433,7 @@ PENALTY_WORDCHANGE = {
     BreakType.START: 0.0,
     BreakType.END: 0.0,
     BreakType.PHRASE: 0.0,
+    BreakType.SOFT_PHRASE: 0.0,
     BreakType.WORD: 0.1,
     BreakType.PART: 0.2,
     BreakType.TOKEN: 0.4
@@ -133,7 +133,7 @@ class ForwardGeocoder:
         """
         assert self.query_analyzer is not None
         qwords = [word for phrase in query.source
-                  for word in re.split('[, ]+', phrase.text) if word]
+                  for word in re.split('[-,: ]+', phrase.text) if word]
         if not qwords:
             return
@@ -146,7 +146,7 @@ class ForwardGeocoder:
             distance = 0.0
             norm = self.query_analyzer.normalize_text(' '.join((result.display_name,
                                                                 result.country_code or '')))
-            words = set((w for w in norm.split(' ') if w))
+            words = set((w for w in re.split('[-,: ]+', norm) if w))
             if not words:
                 continue
             for qword in qwords:
src/nominatim_api/search/icu_tokenizer.py

@@ -7,10 +7,12 @@
 """
 Implementation of query analysis for the ICU tokenizer.
 """
-from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
+from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
 from collections import defaultdict
 import dataclasses
 import difflib
+import re
+from itertools import zip_longest

 from icu import Transliterator
@@ -34,17 +36,30 @@ DB_TO_TOKEN_TYPE = {
     'C': qmod.TokenType.COUNTRY
 }

+PENALTY_IN_TOKEN_BREAK = {
+    qmod.BreakType.START: 0.5,
+    qmod.BreakType.END: 0.5,
+    qmod.BreakType.PHRASE: 0.5,
+    qmod.BreakType.SOFT_PHRASE: 0.5,
+    qmod.BreakType.WORD: 0.1,
+    qmod.BreakType.PART: 0.0,
+    qmod.BreakType.TOKEN: 0.0
+}
+

-class QueryPart(NamedTuple):
+@dataclasses.dataclass
+class QueryPart:
     """ Normalized and transliterated form of a single term in the query.
         When the term came out of a split during the transliteration,
         the normalized string is the full word before transliteration.
         The word number keeps track of the word before transliteration
         and can be used to identify partial transliterated terms.
+        Penalty is the break penalty for the break following the token.
     """
     token: str
     normalized: str
     word_number: int
+    penalty: float


 QueryParts = List[QueryPart]
@@ -58,10 +73,12 @@ def yield_words(terms: List[QueryPart], start: int) -> Iterator[Tuple[str, qmod.
     total = len(terms)
     for first in range(start, total):
         word = terms[first].token
-        yield word, qmod.TokenRange(first, first + 1)
+        penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType.WORD]
+        yield word, qmod.TokenRange(first, first + 1, penalty=penalty)
         for last in range(first + 1, min(first + 20, total)):
             word = ' '.join((word, terms[last].token))
-            yield word, qmod.TokenRange(first, last + 1)
+            penalty += terms[last - 1].penalty
+            yield word, qmod.TokenRange(first, last + 1, penalty=penalty)


 @dataclasses.dataclass
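To see what the accumulated penalty buys, here is a standalone toy rendition of the loop above (simplified types and illustrative numbers, not repo code): a multi-word span inherits the break penalties of the breaks it crosses, so spans that cross "hard" breaks become more expensive.

```python
from typing import Iterator, List, NamedTuple, Tuple

class Part(NamedTuple):
    token: str
    penalty: float   # penalty of the break *following* this token

WORD_BREAK_PENALTY = 0.1

def yield_words(terms: List[Part], start: int) -> Iterator[Tuple[str, Tuple[int, int, float]]]:
    total = len(terms)
    for first in range(start, total):
        word = terms[first].token
        penalty = WORD_BREAK_PENALTY
        yield word, (first, first + 1, penalty)
        for last in range(first + 1, min(first + 20, total)):
            word = ' '.join((word, terms[last].token))
            penalty += terms[last - 1].penalty   # cross the break after the previous term
            yield word, (first, last + 1, penalty)

parts = [Part('new', 0.1), Part('york', 0.5), Part('usa', 0.0)]
for w, rng in yield_words(parts, 0):
    print(w, rng)
# 'new york' costs 0.2, while 'york usa' (crossing the phrase break) costs 0.6
```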
@@ -94,25 +111,25 @@ class ICUToken(qmod.Token):
         self.penalty += (distance/len(self.lookup_word))

     @staticmethod
-    def from_db_row(row: SaRow) -> 'ICUToken':
+    def from_db_row(row: SaRow, base_penalty: float = 0.0) -> 'ICUToken':
         """ Create an ICUToken from the row of the word table.
         """
         count = 1 if row.info is None else row.info.get('count', 1)
         addr_count = 1 if row.info is None else row.info.get('addr_count', 1)

-        penalty = 0.0
+        penalty = base_penalty
         if row.type == 'w':
-            penalty = 0.3
+            penalty += 0.3
         elif row.type == 'W':
             if len(row.word_token) == 1 and row.word_token == row.word:
-                penalty = 0.2 if row.word.isdigit() else 0.3
+                penalty += 0.2 if row.word.isdigit() else 0.3
         elif row.type == 'H':
-            penalty = sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit())
+            penalty += sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit())
             if all(not c.isdigit() for c in row.word_token):
                 penalty += 0.2 * (len(row.word_token) - 1)
         elif row.type == 'C':
             if len(row.word_token) == 1:
-                penalty = 0.3
+                penalty += 0.3

         if row.info is None:
             lookup_word = row.word
@@ -202,7 +219,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         for row in await self.lookup_in_db(list(words.keys())):
             for trange in words[row.word_token]:
-                token = ICUToken.from_db_row(row)
+                token = ICUToken.from_db_row(row, trange.penalty or 0.0)
                 if row.type == 'S':
                     if row.info['op'] in ('in', 'near'):
                         if trange.start == 0:
@@ -242,16 +259,24 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         wordnr = 0
         for phrase in query.source:
             query.nodes[-1].ptype = phrase.ptype
-            for word in phrase.text.split(' '):
+            phrase_split = re.split('([ :-])', phrase.text)
+            # The zip construct will give us the pairs of word/break from
+            # the regular expression split. As the split array ends on the
+            # final word, we simply use the fillvalue to even out the list and
+            # add the phrase break at the end.
+            for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue=','):
+                if not word:
+                    continue
                 trans = self.transliterator.transliterate(word)
                 if trans:
                     for term in trans.split(' '):
                         if term:
-                            parts.append(QueryPart(term, word, wordnr))
+                            parts.append(QueryPart(term, word, wordnr,
+                                                   PENALTY_IN_TOKEN_BREAK[qmod.BreakType.TOKEN]))
                             query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
-                    query.nodes[-1].btype = qmod.BreakType.WORD
+                    query.nodes[-1].btype = qmod.BreakType(breakchar)
+                    parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType(breakchar)]
                 wordnr += 1
             query.nodes[-1].btype = qmod.BreakType.PHRASE

         for word, wrange in yield_words(parts, phrase_start):
             words[word].append(wrange)
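The pairing trick in the comment above is worth a standalone look (the sample string is made up):

```python
import re
from itertools import zip_longest

phrase_split = re.split('([ :-])', 'berlin:hauptstrasse 4')
print(phrase_split)
# ['berlin', ':', 'hauptstrasse', ' ', '4']

# Both iterators in the argument list are the *same* object, so zip_longest
# consumes the flat list in steps of two; the final word is paired with the
# fillvalue ',' -- i.e. a phrase break.
pairs = list(zip_longest(*[iter(phrase_split)]*2, fillvalue=','))
print(pairs)
# [('berlin', ':'), ('hauptstrasse', ' '), ('4', ',')]
```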
@@ -272,7 +297,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         """ Add tokens to query that are not saved in the database.
         """
         for part, node, i in zip(parts, query.nodes, range(1000)):
-            if len(part.token) <= 4 and part[0].isdigit()\
+            if len(part.token) <= 4 and part.token.isdigit()\
               and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                 query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
                                 ICUToken(penalty=0.5, token=0,
@@ -21,7 +21,13 @@ class BreakType(enum.Enum):
     END = '>'
     """ End of the query. """
     PHRASE = ','
-    """ Break between two phrases. """
+    """ Hard break between two phrases. Address parts cannot cross hard
+        phrase boundaries."""
+    SOFT_PHRASE = ':'
+    """ Likely break between two phrases. Address parts should not cross soft
+        phrase boundaries. Soft breaks can be inserted by a preprocessor
+        that is analysing the input string.
+    """
     WORD = ' '
     """ Break between words. """
     PART = '-'
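As a reminder of how these values are consumed elsewhere in the patch (`qmod.BreakType(breakchar)`): Python enums can be looked up by value, so the break character from the regex split maps directly to its break type. A cut-down illustration (standalone, not repo code):

```python
import enum

class BreakType(enum.Enum):
    PHRASE = ','       # hard break
    SOFT_PHRASE = ':'  # soft break, e.g. inserted by the Japanese preprocessor
    WORD = ' '
    PART = '-'

print(BreakType(':'))  # BreakType.SOFT_PHRASE
print(BreakType(' '))  # BreakType.WORD
```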
@@ -116,6 +122,7 @@ class TokenRange:
     """
     start: int
     end: int
+    penalty: Optional[float] = None

     def __lt__(self, other: 'TokenRange') -> bool:
         return self.end <= other.start
@@ -27,6 +27,7 @@ PENALTY_TOKENCHANGE = {
     qmod.BreakType.START: 0.0,
     qmod.BreakType.END: 0.0,
     qmod.BreakType.PHRASE: 0.0,
+    qmod.BreakType.SOFT_PHRASE: 0.0,
     qmod.BreakType.WORD: 0.1,
     qmod.BreakType.PART: 0.2,
     qmod.BreakType.TOKEN: 0.4
src/nominatim_api/version.py

@@ -8,4 +8,4 @@
 Version information for the Nominatim API.
 """

-NOMINATIM_API_VERSION = '4.5.0'
+NOMINATIM_API_VERSION = '5.0.0'
@@ -122,13 +122,16 @@ class SetupAll:

         LOG.warning('Post-process tables')
         with connect(args.config.get_libpq_dsn()) as conn:
+            conn.autocommit = True
             await database_import.create_search_indices(conn, args.config,
                                                         drop=args.no_updates,
                                                         threads=num_threads)
             LOG.warning('Create search index for default country names.')
+            conn.autocommit = False
             country_info.create_country_names(conn, tokenizer,
                                               args.config.get_str_list('LANGUAGES'))
             if args.no_updates:
+                conn.autocommit = True
                 freeze.drop_update_tables(conn)
         tokenizer.finalize_import(args.config)
@@ -183,6 +186,7 @@ class SetupAll:
         from ..tools import database_import, refresh

         with connect(config.get_libpq_dsn()) as conn:
+            conn.autocommit = True
             LOG.warning('Create functions (1st pass)')
             refresh.create_functions(conn, config, False, False)
             LOG.warning('Create tables')
@@ -25,6 +25,8 @@ class ICUTokenAnalysis:

     def __init__(self, norm_rules: str, trans_rules: str,
                  analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule']):
+        # additional break signs are not relevant during name analysis
+        norm_rules += ";[[:Space:][-:]]+ > ' ';"
         self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                          norm_rules)
         trans_rules += ";[:Space:]+ > ' '"
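A quick way to convince yourself of what the appended rule does (PyICU assumed installed; the `:: lower ()` transform stands in for the real normalization rules):

```python
from icu import Transliterator

# The appended clause collapses runs of spaces, hyphens and colons into a
# single space, so the new break signs cannot leak into analysed names.
rules = ":: lower ();[[:Space:][-:]]+ > ' ';"
norm = Transliterator.createFromRules("demo_normalization", rules)
print(norm.transliterate("Main--Street: 5"))  # main street 5
```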
src/nominatim_db/version.py

@@ -55,7 +55,7 @@ def parse_version(version: str) -> NominatimVersion:
     return NominatimVersion(*[int(x) for x in parts[:2] + parts[2].split('-')])


-NOMINATIM_VERSION = parse_version('4.5.0-0')
+NOMINATIM_VERSION = parse_version('5.0.0-0')

 POSTGRESQL_REQUIRED_VERSION = (12, 0)
 POSTGIS_REQUIRED_VERSION = (3, 0)
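For the record, the comprehension in `parse_version()` unpacks the new version string like this (assuming `parts = version.split('.')`, as the slicing suggests):

```python
parts = '5.0.0-0'.split('.')                              # ['5', '0', '0-0']
print([int(x) for x in parts[:2] + parts[2].split('-')])  # [5, 0, 0, 0]
```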
@@ -267,3 +267,34 @@ Feature: Rank assignment
       | object | rank_search | rank_address |
       | N23:amenity | 30 | 30 |
       | N23:place | 16 | 16 |
+
+    Scenario: Address rank 25 is only used for addr:place
+        Given the grid
+            | 10 | 33 | 34 | 11 |
+        Given the places
+            | osm | class | type | name |
+            | N10 | place | village | vil |
+            | N11 | place | farm | farm |
+        And the places
+            | osm | class | type | name | geometry |
+            | W1 | highway | residential | RD | 33,11 |
+        And the places
+            | osm | class | type | name | addr+farm | geometry |
+            | W2 | highway | residential | RD2 | farm | 34,11 |
+        And the places
+            | osm | class | type | housenr |
+            | N33 | place | house | 23 |
+        And the places
+            | osm | class | type | housenr | addr+place |
+            | N34 | place | house | 23 | farm |
+        When importing
+        Then placex contains
+            | object | parent_place_id |
+            | N11 | N10 |
+            | N33 | W1 |
+            | N34 | N11 |
+        And place_addressline contains
+            | object | address |
+            | W1 | N10 |
+            | W2 | N10 |
+            | W2 | N11 |
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2025 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for Japanese phrase splitting.
+"""
+from pathlib import Path
+
+import pytest
+
+from icu import Transliterator
+
+import nominatim_api.search.query as qmod
+from nominatim_api.query_preprocessing.config import QueryConfig
+from nominatim_api.query_preprocessing import split_japanese_phrases
+
+
+def run_preprocessor_on(query):
+    proc = split_japanese_phrases.create(QueryConfig().set_normalizer(None))
+
+    return proc(query)
+
+
+@pytest.mark.parametrize('inp,outp', [('大阪府大阪市大阪', '大阪府:大阪市:大阪'),
+                                      ('大阪府大阪', '大阪府:大阪'),
+                                      ('大阪市大阪', '大阪市:大阪')])
+def test_split_phrases(inp, outp):
+    query = [qmod.Phrase(qmod.PhraseType.NONE, inp)]
+
+    out = run_preprocessor_on(query)
+
+    assert out == [qmod.Phrase(qmod.PhraseType.NONE, outp)]
@@ -23,14 +23,10 @@ def test_refresh_import_secondary_importance_non_existing(dsn):

 def test_refresh_import_secondary_importance_testdb(dsn, src_dir, temp_db_conn, temp_db_cursor):
     temp_db_cursor.execute('CREATE EXTENSION postgis')
+    temp_db_cursor.execute('CREATE EXTENSION postgis_raster')

-    if postgis_version_tuple(temp_db_conn)[0] < 3:
-        assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') > 0
-    else:
-        temp_db_cursor.execute('CREATE EXTENSION postgis_raster')
-        assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') == 0
+    assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') == 0

-        assert temp_db_cursor.table_exists('secondary_importance')
+    assert temp_db_cursor.table_exists('secondary_importance')


 @pytest.mark.parametrize("replace", (True, False))
@@ -41,8 +37,7 @@ def test_refresh_import_wikipedia(dsn, src_dir, table_factory, temp_db_cursor, r
     # use the small wikipedia file for the API testdb
     assert refresh.import_wikipedia_articles(dsn, src_dir / 'test' / 'testdb') == 0

-    assert temp_db_cursor.table_rows('wikipedia_article') > 0
-    assert temp_db_cursor.table_rows('wikipedia_redirect') > 0
+    assert temp_db_cursor.table_rows('wikimedia_importance') > 0


 def test_recompute_importance(placex_table, table_factory, temp_db_conn, temp_db_cursor):
BIN  test/testdb/wikimedia-importance.csv.gz (new file, binary content not shown)