mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-16 15:47:58 +00:00
Compare commits
21 Commits
settings-m
...
v5.0.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ee8915f2b6 | ||
|
|
5475bf7b9c | ||
|
|
95e2d8c846 | ||
|
|
7552818866 | ||
|
|
db3991af74 | ||
|
|
4523b9aaed | ||
|
|
8b1cabebd6 | ||
|
|
0cf636a80c | ||
|
|
c2cb6722fe | ||
|
|
f8337bedb2 | ||
|
|
efc09a5cfc | ||
|
|
86ad9efa8a | ||
|
|
d984100e23 | ||
|
|
499110f549 | ||
|
|
267e5dac0d | ||
|
|
32d3eb46d5 | ||
|
|
c8a0dc8af1 | ||
|
|
14ecfc7834 | ||
|
|
cad44eb00c | ||
|
|
f76dbb0a16 | ||
|
|
8dd218a1d0 |
@@ -87,7 +87,6 @@ Checklist for releases:
|
|||||||
* [ ] increase versions in
|
* [ ] increase versions in
|
||||||
* `src/nominatim_api/version.py`
|
* `src/nominatim_api/version.py`
|
||||||
* `src/nominatim_db/version.py`
|
* `src/nominatim_db/version.py`
|
||||||
* CMakeLists.txt
|
|
||||||
* [ ] update `ChangeLog` (copy information from patch releases from release branch)
|
* [ ] update `ChangeLog` (copy information from patch releases from release branch)
|
||||||
* [ ] complete `docs/admin/Migration.md`
|
* [ ] complete `docs/admin/Migration.md`
|
||||||
* [ ] update EOL dates in `SECURITY.md`
|
* [ ] update EOL dates in `SECURITY.md`
|
||||||
|
|||||||
25
ChangeLog
25
ChangeLog
@@ -1,3 +1,28 @@
|
|||||||
|
5.0.0
|
||||||
|
* increase required versions for PostgreSQL (12+), PostGIS (3.0+)
|
||||||
|
* remove installation via cmake and debundle osm2pgsql
|
||||||
|
* remove deprecated PHP frontend
|
||||||
|
* remove deprecated legacy tokenizer
|
||||||
|
* add configurable pre-processing of queries
|
||||||
|
* add query pre-processor to split up Japanese addresses
|
||||||
|
* rewrite of osm2pgsql style implementation
|
||||||
|
(also adds support for osm2pgsql-themepark)
|
||||||
|
* reduce the number of SQL queries needed to complete a 'lookup' call
|
||||||
|
* improve computation of centroid for lines with only two points
|
||||||
|
* improve bbox output for postcode areas
|
||||||
|
* improve result order by returning the largest object when other things are
|
||||||
|
equal
|
||||||
|
* add fallback for reverse geocoding to default country tables
|
||||||
|
* exclude postcode areas from reverse geocoding
|
||||||
|
* disable search endpoint when database is reverse-only (regression)
|
||||||
|
* minor performance improvements to area split algorithm
|
||||||
|
* switch table and index creation to use autocommit mode to avoid deadlocks
|
||||||
|
* drop overly long ways during import
|
||||||
|
* restrict automatic migrations to versions 4.3+
|
||||||
|
* switch linting from pylint to flake8
|
||||||
|
* switch tests to use a wikimedia test file in the new CSV style
|
||||||
|
* various fixes and improvements to documentation
|
||||||
|
|
||||||
4.5.0
|
4.5.0
|
||||||
* allow building Nominatim as a pip package
|
* allow building Nominatim as a pip package
|
||||||
* make osm2pgsql building optional
|
* make osm2pgsql building optional
|
||||||
|
|||||||
@@ -9,10 +9,10 @@ versions.
|
|||||||
|
|
||||||
| Version | End of support for security updates |
|
| Version | End of support for security updates |
|
||||||
| ------- | ----------------------------------- |
|
| ------- | ----------------------------------- |
|
||||||
|
| 5.0.x | 2027-02-06
|
||||||
| 4.5.x | 2026-09-12 |
|
| 4.5.x | 2026-09-12 |
|
||||||
| 4.4.x | 2026-03-07 |
|
| 4.4.x | 2026-03-07 |
|
||||||
| 4.3.x | 2025-09-07 |
|
| 4.3.x | 2025-09-07 |
|
||||||
| 4.2.x | 2024-11-24 |
|
|
||||||
|
|
||||||
## Reporting a Vulnerability
|
## Reporting a Vulnerability
|
||||||
|
|
||||||
|
|||||||
@@ -9,19 +9,15 @@ the following steps:
|
|||||||
* Update the frontend: `pip install -U nominatim-api`
|
* Update the frontend: `pip install -U nominatim-api`
|
||||||
* (optionally) Restart updates
|
* (optionally) Restart updates
|
||||||
|
|
||||||
If you are still using CMake for the installation of Nominatim, then you
|
|
||||||
need to update the software in one step before migrating the database.
|
|
||||||
It is not recommended to do this while the machine is serving requests.
|
|
||||||
|
|
||||||
Below you find additional migrations and hints about other structural and
|
Below you find additional migrations and hints about other structural and
|
||||||
breaking changes. **Please read them before running the migration.**
|
breaking changes. **Please read them before running the migration.**
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
If you are migrating from a version <4.3, you need to install 4.3
|
If you are migrating from a version <4.3, you need to install 4.3
|
||||||
first and migrate to 4.3 first. Then you can migrate to the current
|
and migrate to 4.3 first. Then you can migrate to the current
|
||||||
version. It is strongly recommended to do a reimport instead.
|
version. It is strongly recommended to do a reimport instead.
|
||||||
|
|
||||||
## 4.5.0 -> master
|
## 4.5.0 -> 5.0.0
|
||||||
|
|
||||||
### PHP frontend removed
|
### PHP frontend removed
|
||||||
|
|
||||||
@@ -33,6 +29,42 @@ needed. It currently omits a warning and does otherwise nothing. It will be
|
|||||||
removed in later versions of Nominatim. So make sure you remove it from your
|
removed in later versions of Nominatim. So make sure you remove it from your
|
||||||
scripts.
|
scripts.
|
||||||
|
|
||||||
|
### CMake building removed
|
||||||
|
|
||||||
|
Nominatim can now only be installed via pip. Please follow the installation
|
||||||
|
instructions for the current version to change to pip.
|
||||||
|
|
||||||
|
### osm2pgsql no longer vendored in
|
||||||
|
|
||||||
|
Nominatim no longer ships its own version of osm2pgsql. Please install a
|
||||||
|
stock version of osm2pgsql from your distribution. See the
|
||||||
|
[installation instruction for osm2pgsql](https://osm2pgsql.org/doc/install.html)
|
||||||
|
for details. A minimum version of 1.8 is required. The current stable versions
|
||||||
|
of Ubuntu and Debian already ship with appropriate versions. For older
|
||||||
|
installations, you may have to compile a newer osm2pgsql yourself.
|
||||||
|
|
||||||
|
### Legacy tokenizer removed
|
||||||
|
|
||||||
|
The `legacy` tokenizer is no longer enabled. This tokenizer has been superseded
|
||||||
|
by the `ICU` tokenizer a long time ago. In the unlikely case that your database
|
||||||
|
still uses the `legacy` tokenizer, you must reimport your database.
|
||||||
|
|
||||||
|
### osm2pgsql style overhauled
|
||||||
|
|
||||||
|
There are some fundamental changes to how customized osm2pgsql styles should
|
||||||
|
be written. The changes are mostly backwards compatible, i.e. custom styles
|
||||||
|
should still work with the new implementation. The only exception is a
|
||||||
|
customization of the `process_tags()` function. This function is no longer
|
||||||
|
considered public and neither are the helper functions used in it.
|
||||||
|
They currently still work but will be removed at some point. If you have
|
||||||
|
been making changes to `process_tags`, please review your style and try
|
||||||
|
to switch to the new convenience functions.
|
||||||
|
|
||||||
|
For more information on the changes, see the
|
||||||
|
[pull request](https://github.com/osm-search/Nominatim/pull/3615)
|
||||||
|
and read the new
|
||||||
|
[customization documentation](https://nominatim.org/release-docs/latest/customize/Import-Styles/).
|
||||||
|
|
||||||
## 4.4.0 -> 4.5.0
|
## 4.4.0 -> 4.5.0
|
||||||
|
|
||||||
### New structure for Python packages
|
### New structure for Python packages
|
||||||
|
|||||||
@@ -68,10 +68,10 @@ the update interval no new data has been published yet, it will go to sleep
|
|||||||
until the next expected update and only then attempt to download the next batch.
|
until the next expected update and only then attempt to download the next batch.
|
||||||
|
|
||||||
The one-time mode is particularly useful if you want to run updates continuously
|
The one-time mode is particularly useful if you want to run updates continuously
|
||||||
but need to schedule other work in between updates. For example, the main
|
but need to schedule other work in between updates. For example, you might
|
||||||
service at osm.org uses it, to regularly recompute postcodes -- a process that
|
want to regularly recompute postcodes -- a process that
|
||||||
must not be run while updates are in progress. Its update script
|
must not be run while updates are in progress. An update script refreshing
|
||||||
looks like this:
|
postcodes regularly might look like this:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
@@ -109,17 +109,19 @@ Unit=nominatim-updates.service
|
|||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
```
|
```
|
||||||
|
|
||||||
And then a similar service definition: `/etc/systemd/system/nominatim-updates.service`:
|
`OnUnitActiveSec` defines how often the individual update command is run.
|
||||||
|
|
||||||
|
Then add a service definition for the timer in `/etc/systemd/system/nominatim-updates.service`:
|
||||||
|
|
||||||
```
|
```
|
||||||
[Unit]
|
[Unit]
|
||||||
Description=Single updates of Nominatim
|
Description=Single updates of Nominatim
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
WorkingDirectory=/srv/nominatim
|
WorkingDirectory=/srv/nominatim-project
|
||||||
ExecStart=nominatim replication --once
|
ExecStart=/srv/nominatim-venv/bin/nominatim replication --once
|
||||||
StandardOutput=append:/var/log/nominatim-updates.log
|
StandardOutput=journald
|
||||||
StandardError=append:/var/log/nominatim-updates.error.log
|
StandardError=inherit
|
||||||
User=nominatim
|
User=nominatim
|
||||||
Group=nominatim
|
Group=nominatim
|
||||||
Type=simple
|
Type=simple
|
||||||
@@ -128,9 +130,9 @@ Type=simple
|
|||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
```
|
```
|
||||||
|
|
||||||
Replace the `WorkingDirectory` with your project directory. Also adapt user and
|
Replace the `WorkingDirectory` with your project directory. `ExecStart` points
|
||||||
group names as required. `OnUnitActiveSec` defines how often the individual
|
to the nominatim binary that was installed in your virtualenv earlier.
|
||||||
update command is run.
|
Finally, you might need to adapt user and group names as required.
|
||||||
|
|
||||||
Now activate the service and start the updates:
|
Now activate the service and start the updates:
|
||||||
|
|
||||||
@@ -140,12 +142,13 @@ sudo systemctl enable nominatim-updates.timer
|
|||||||
sudo systemctl start nominatim-updates.timer
|
sudo systemctl start nominatim-updates.timer
|
||||||
```
|
```
|
||||||
|
|
||||||
You can stop future data updates, while allowing any current, in-progress
|
You can stop future data updates while allowing any current, in-progress
|
||||||
update steps to finish, by running `sudo systemctl stop
|
update steps to finish, by running `sudo systemctl stop
|
||||||
nominatim-updates.timer` and waiting until `nominatim-updates.service` isn't
|
nominatim-updates.timer` and waiting until `nominatim-updates.service` isn't
|
||||||
running (`sudo systemctl is-active nominatim-updates.service`). Current output
|
running (`sudo systemctl is-active nominatim-updates.service`).
|
||||||
from the update can be seen like above (`systemctl status
|
|
||||||
nominatim-updates.service`).
|
To check the output from the update process, use journalctl: `journalctl -u
|
||||||
|
nominatim-updates.service`
|
||||||
|
|
||||||
|
|
||||||
#### Catch-up mode
|
#### Catch-up mode
|
||||||
@@ -155,13 +158,13 @@ all changes from the server until the database is up-to-date. The catch-up mode
|
|||||||
still respects the parameter `NOMINATIM_REPLICATION_MAX_DIFF`. It downloads and
|
still respects the parameter `NOMINATIM_REPLICATION_MAX_DIFF`. It downloads and
|
||||||
applies the changes in appropriate batches until all is done.
|
applies the changes in appropriate batches until all is done.
|
||||||
|
|
||||||
The catch-up mode is foremost useful to bring the database up to speed after the
|
The catch-up mode is foremost useful to bring the database up to date after the
|
||||||
initial import. Given that the service usually is not in production at this
|
initial import. Given that the service usually is not in production at this
|
||||||
point, you can temporarily be a bit more generous with the batch size and
|
point, you can temporarily be a bit more generous with the batch size and
|
||||||
number of threads you use for the updates by running catch-up like this:
|
number of threads you use for the updates by running catch-up like this:
|
||||||
|
|
||||||
```
|
```
|
||||||
cd /srv/nominatim
|
cd /srv/nominatim-project
|
||||||
NOMINATIM_REPLICATION_MAX_DIFF=5000 nominatim replication --catch-up --threads 15
|
NOMINATIM_REPLICATION_MAX_DIFF=5000 nominatim replication --catch-up --threads 15
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -173,13 +176,13 @@ replication catch-up at whatever interval you desire.
|
|||||||
When running scheduled updates with catch-up, it is a good idea to choose
|
When running scheduled updates with catch-up, it is a good idea to choose
|
||||||
a replication source with an update frequency that is an order of magnitude
|
a replication source with an update frequency that is an order of magnitude
|
||||||
lower. For example, if you want to update once a day, use an hourly updated
|
lower. For example, if you want to update once a day, use an hourly updated
|
||||||
source. This makes sure that you don't miss an entire day of updates when
|
source. This ensures that you don't miss an entire day of updates when
|
||||||
the source is unexpectedly late to publish its update.
|
the source is unexpectedly late to publish its update.
|
||||||
|
|
||||||
If you want to use the source with the same update frequency (e.g. a daily
|
If you want to use the source with the same update frequency (e.g. a daily
|
||||||
updated source with daily updates), use the
|
updated source with daily updates), use the
|
||||||
continuous update mode. It ensures to re-request the newest update until it
|
once mode together with a frequently run systemd script as described above.
|
||||||
is published.
|
It ensures that the newest update is re-requested until it has been published.
|
||||||
|
|
||||||
|
|
||||||
#### Continuous updates
|
#### Continuous updates
|
||||||
@@ -197,36 +200,3 @@ parameters:
|
|||||||
|
|
||||||
The update application keeps running forever and retrieves and applies
|
The update application keeps running forever and retrieves and applies
|
||||||
new updates from the server as they are published.
|
new updates from the server as they are published.
|
||||||
|
|
||||||
You can run this command as a simple systemd service. Create a service
|
|
||||||
description like that in `/etc/systemd/system/nominatim-updates.service`:
|
|
||||||
|
|
||||||
```
|
|
||||||
[Unit]
|
|
||||||
Description=Continuous updates of Nominatim
|
|
||||||
|
|
||||||
[Service]
|
|
||||||
WorkingDirectory=/srv/nominatim
|
|
||||||
ExecStart=nominatim replication
|
|
||||||
StandardOutput=append:/var/log/nominatim-updates.log
|
|
||||||
StandardError=append:/var/log/nominatim-updates.error.log
|
|
||||||
User=nominatim
|
|
||||||
Group=nominatim
|
|
||||||
Type=simple
|
|
||||||
|
|
||||||
[Install]
|
|
||||||
WantedBy=multi-user.target
|
|
||||||
```
|
|
||||||
|
|
||||||
Replace the `WorkingDirectory` with your project directory. Also adapt user
|
|
||||||
and group names as required.
|
|
||||||
|
|
||||||
Now activate the service and start the updates:
|
|
||||||
|
|
||||||
```
|
|
||||||
sudo systemctl daemon-reload
|
|
||||||
sudo systemctl enable nominatim-updates
|
|
||||||
sudo systemctl start nominatim-updates
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -326,7 +326,7 @@ defined primary names are forgotten.)
|
|||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| :----- | :---------- |
|
| :----- | :---------- |
|
||||||
| core | Basic set of recogniced names for all places. |
|
| core | Basic set of recognized names for all places. |
|
||||||
| address | Additional names useful when indexing full addresses. |
|
| address | Additional names useful when indexing full addresses. |
|
||||||
| poi | Extended set of recognized names for pois. Use on top of the core set. |
|
| poi | Extended set of recognized names for pois. Use on top of the core set. |
|
||||||
|
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ queries. This happens in two stages:
|
|||||||
as during the import process but may involve other processing like,
|
as during the import process but may involve other processing like,
|
||||||
for example, word break detection.
|
for example, word break detection.
|
||||||
2. The **token analysis** step breaks down the query parts into tokens,
|
2. The **token analysis** step breaks down the query parts into tokens,
|
||||||
looks them up in the database and assignes them possible functions and
|
looks them up in the database and assigns them possible functions and
|
||||||
probabilities.
|
probabilities.
|
||||||
|
|
||||||
Query processing can be further customized while the rest of the analysis
|
Query processing can be further customized while the rest of the analysis
|
||||||
|
|||||||
@@ -425,7 +425,7 @@ function Place:write_row(k, v)
|
|||||||
if self.geometry == nil then
|
if self.geometry == nil then
|
||||||
self.geometry = self.geom_func(self.object)
|
self.geometry = self.geom_func(self.object)
|
||||||
end
|
end
|
||||||
if self.geometry:is_null() then
|
if self.geometry == nil or self.geometry:is_null() then
|
||||||
return 0
|
return 0
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -608,6 +608,9 @@ function module.process_way(object)
|
|||||||
|
|
||||||
if geom:is_null() then
|
if geom:is_null() then
|
||||||
geom = o:as_linestring()
|
geom = o:as_linestring()
|
||||||
|
if geom:is_null() or geom:length() > 30 then
|
||||||
|
return nil
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
return geom
|
return geom
|
||||||
|
|||||||
@@ -17,28 +17,6 @@ CREATE TYPE nearfeaturecentr AS (
|
|||||||
centroid GEOMETRY
|
centroid GEOMETRY
|
||||||
);
|
);
|
||||||
|
|
||||||
-- feature intersects geometry
|
|
||||||
-- for areas and linestrings they must touch at least along a line
|
|
||||||
CREATE OR REPLACE FUNCTION is_relevant_geometry(de9im TEXT, geom_type TEXT)
|
|
||||||
RETURNS BOOLEAN
|
|
||||||
AS $$
|
|
||||||
BEGIN
|
|
||||||
IF substring(de9im from 1 for 2) != 'FF' THEN
|
|
||||||
RETURN TRUE;
|
|
||||||
END IF;
|
|
||||||
|
|
||||||
IF geom_type = 'ST_Point' THEN
|
|
||||||
RETURN substring(de9im from 4 for 1) = '0';
|
|
||||||
END IF;
|
|
||||||
|
|
||||||
IF geom_type in ('ST_LineString', 'ST_MultiLineString') THEN
|
|
||||||
RETURN substring(de9im from 4 for 1) = '1';
|
|
||||||
END IF;
|
|
||||||
|
|
||||||
RETURN substring(de9im from 4 for 1) = '2';
|
|
||||||
END
|
|
||||||
$$ LANGUAGE plpgsql IMMUTABLE;
|
|
||||||
|
|
||||||
CREATE OR REPLACE function getNearFeatures(in_partition INTEGER, feature GEOMETRY,
|
CREATE OR REPLACE function getNearFeatures(in_partition INTEGER, feature GEOMETRY,
|
||||||
feature_centroid GEOMETRY,
|
feature_centroid GEOMETRY,
|
||||||
maxrank INTEGER)
|
maxrank INTEGER)
|
||||||
@@ -59,7 +37,12 @@ BEGIN
|
|||||||
isguess, postcode, centroid
|
isguess, postcode, centroid
|
||||||
FROM location_area_large_{{ partition }}
|
FROM location_area_large_{{ partition }}
|
||||||
WHERE geometry && feature
|
WHERE geometry && feature
|
||||||
AND is_relevant_geometry(ST_Relate(geometry, feature), ST_GeometryType(feature))
|
AND CASE WHEN ST_Dimension(feature) = 0
|
||||||
|
THEN _ST_Covers(geometry, feature)
|
||||||
|
WHEN ST_Dimension(feature) = 2
|
||||||
|
THEN ST_Relate(geometry, feature, 'T********')
|
||||||
|
ELSE ST_NPoints(ST_Intersection(geometry, feature)) > 1
|
||||||
|
END
|
||||||
AND rank_address < maxrank
|
AND rank_address < maxrank
|
||||||
-- Postcodes currently still use rank_search to define for which
|
-- Postcodes currently still use rank_search to define for which
|
||||||
-- features they are relevant.
|
-- features they are relevant.
|
||||||
@@ -142,14 +125,16 @@ BEGIN
|
|||||||
|
|
||||||
IF in_rank_search <= 4 and not in_estimate THEN
|
IF in_rank_search <= 4 and not in_estimate THEN
|
||||||
INSERT INTO location_area_country (place_id, country_code, geometry)
|
INSERT INTO location_area_country (place_id, country_code, geometry)
|
||||||
values (in_place_id, in_country_code, in_geometry);
|
(SELECT in_place_id, in_country_code, geom
|
||||||
|
FROM split_geometry(in_geometry) as geom);
|
||||||
RETURN TRUE;
|
RETURN TRUE;
|
||||||
END IF;
|
END IF;
|
||||||
|
|
||||||
{% for partition in db.partitions %}
|
{% for partition in db.partitions %}
|
||||||
IF in_partition = {{ partition }} THEN
|
IF in_partition = {{ partition }} THEN
|
||||||
INSERT INTO location_area_large_{{ partition }} (partition, place_id, country_code, keywords, rank_search, rank_address, isguess, postcode, centroid, geometry)
|
INSERT INTO location_area_large_{{ partition }} (partition, place_id, country_code, keywords, rank_search, rank_address, isguess, postcode, centroid, geometry)
|
||||||
values (in_partition, in_place_id, in_country_code, in_keywords, in_rank_search, in_rank_address, in_estimate, postcode, in_centroid, in_geometry);
|
(SELECT in_partition, in_place_id, in_country_code, in_keywords, in_rank_search, in_rank_address, in_estimate, postcode, in_centroid, geom
|
||||||
|
FROM split_geometry(in_geometry) as geom);
|
||||||
RETURN TRUE;
|
RETURN TRUE;
|
||||||
END IF;
|
END IF;
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
|||||||
@@ -348,8 +348,6 @@ CREATE OR REPLACE FUNCTION add_location(place_id BIGINT, country_code varchar(2)
|
|||||||
RETURNS BOOLEAN
|
RETURNS BOOLEAN
|
||||||
AS $$
|
AS $$
|
||||||
DECLARE
|
DECLARE
|
||||||
locationid INTEGER;
|
|
||||||
secgeo GEOMETRY;
|
|
||||||
postcode TEXT;
|
postcode TEXT;
|
||||||
BEGIN
|
BEGIN
|
||||||
PERFORM deleteLocationArea(partition, place_id, rank_search);
|
PERFORM deleteLocationArea(partition, place_id, rank_search);
|
||||||
@@ -360,18 +358,19 @@ BEGIN
|
|||||||
postcode := upper(trim (in_postcode));
|
postcode := upper(trim (in_postcode));
|
||||||
END IF;
|
END IF;
|
||||||
|
|
||||||
IF ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon') THEN
|
IF ST_Dimension(geometry) = 2 THEN
|
||||||
FOR secgeo IN select split_geometry(geometry) AS geom LOOP
|
RETURN insertLocationAreaLarge(partition, place_id, country_code, keywords,
|
||||||
PERFORM insertLocationAreaLarge(partition, place_id, country_code, keywords, rank_search, rank_address, false, postcode, centroid, secgeo);
|
rank_search, rank_address, false, postcode,
|
||||||
END LOOP;
|
centroid, geometry);
|
||||||
|
|
||||||
ELSEIF ST_GeometryType(geometry) = 'ST_Point' THEN
|
|
||||||
secgeo := place_node_fuzzy_area(geometry, rank_search);
|
|
||||||
PERFORM insertLocationAreaLarge(partition, place_id, country_code, keywords, rank_search, rank_address, true, postcode, centroid, secgeo);
|
|
||||||
|
|
||||||
END IF;
|
END IF;
|
||||||
|
|
||||||
RETURN true;
|
IF ST_Dimension(geometry) = 0 THEN
|
||||||
|
RETURN insertLocationAreaLarge(partition, place_id, country_code, keywords,
|
||||||
|
rank_search, rank_address, true, postcode,
|
||||||
|
centroid, place_node_fuzzy_area(geometry, rank_search));
|
||||||
|
END IF;
|
||||||
|
|
||||||
|
RETURN false;
|
||||||
END;
|
END;
|
||||||
$$
|
$$
|
||||||
LANGUAGE plpgsql;
|
LANGUAGE plpgsql;
|
||||||
@@ -394,19 +393,21 @@ DECLARE
|
|||||||
geo RECORD;
|
geo RECORD;
|
||||||
area FLOAT;
|
area FLOAT;
|
||||||
remainingdepth INTEGER;
|
remainingdepth INTEGER;
|
||||||
added INTEGER;
|
|
||||||
BEGIN
|
BEGIN
|
||||||
|
|
||||||
-- RAISE WARNING 'quad_split_geometry: maxarea=%, depth=%',maxarea,maxdepth;
|
-- RAISE WARNING 'quad_split_geometry: maxarea=%, depth=%',maxarea,maxdepth;
|
||||||
|
|
||||||
IF (ST_GeometryType(geometry) not in ('ST_Polygon','ST_MultiPolygon') OR NOT ST_IsValid(geometry)) THEN
|
IF not ST_IsValid(geometry) THEN
|
||||||
|
RETURN;
|
||||||
|
END IF;
|
||||||
|
|
||||||
|
IF ST_Dimension(geometry) != 2 OR maxdepth <= 1 THEN
|
||||||
RETURN NEXT geometry;
|
RETURN NEXT geometry;
|
||||||
RETURN;
|
RETURN;
|
||||||
END IF;
|
END IF;
|
||||||
|
|
||||||
remainingdepth := maxdepth - 1;
|
remainingdepth := maxdepth - 1;
|
||||||
area := ST_AREA(geometry);
|
area := ST_AREA(geometry);
|
||||||
IF remainingdepth < 1 OR area < maxarea THEN
|
IF area < maxarea THEN
|
||||||
RETURN NEXT geometry;
|
RETURN NEXT geometry;
|
||||||
RETURN;
|
RETURN;
|
||||||
END IF;
|
END IF;
|
||||||
@@ -426,7 +427,6 @@ BEGIN
|
|||||||
xmid := (xmin+xmax)/2;
|
xmid := (xmin+xmax)/2;
|
||||||
ymid := (ymin+ymax)/2;
|
ymid := (ymin+ymax)/2;
|
||||||
|
|
||||||
added := 0;
|
|
||||||
FOR seg IN 1..4 LOOP
|
FOR seg IN 1..4 LOOP
|
||||||
|
|
||||||
IF seg = 1 THEN
|
IF seg = 1 THEN
|
||||||
@@ -442,16 +442,13 @@ BEGIN
|
|||||||
secbox := ST_SetSRID(ST_MakeBox2D(ST_Point(xmid,ymid),ST_Point(xmax,ymax)),4326);
|
secbox := ST_SetSRID(ST_MakeBox2D(ST_Point(xmid,ymid),ST_Point(xmax,ymax)),4326);
|
||||||
END IF;
|
END IF;
|
||||||
|
|
||||||
IF st_intersects(geometry, secbox) THEN
|
secgeo := st_intersection(geometry, secbox);
|
||||||
secgeo := st_intersection(geometry, secbox);
|
IF NOT ST_IsEmpty(secgeo) AND ST_Dimension(secgeo) = 2 THEN
|
||||||
IF NOT ST_IsEmpty(secgeo) AND ST_GeometryType(secgeo) in ('ST_Polygon','ST_MultiPolygon') THEN
|
FOR geo IN SELECT quad_split_geometry(secgeo, maxarea, remainingdepth) as geom LOOP
|
||||||
FOR geo IN select quad_split_geometry(secgeo, maxarea, remainingdepth) as geom LOOP
|
IF NOT ST_IsEmpty(geo.geom) AND ST_Dimension(geo.geom) = 2 THEN
|
||||||
IF NOT ST_IsEmpty(geo.geom) AND ST_GeometryType(geo.geom) in ('ST_Polygon','ST_MultiPolygon') THEN
|
RETURN NEXT geo.geom;
|
||||||
added := added + 1;
|
END IF;
|
||||||
RETURN NEXT geo.geom;
|
END LOOP;
|
||||||
END IF;
|
|
||||||
END LOOP;
|
|
||||||
END IF;
|
|
||||||
END IF;
|
END IF;
|
||||||
END LOOP;
|
END LOOP;
|
||||||
|
|
||||||
@@ -467,10 +464,22 @@ CREATE OR REPLACE FUNCTION split_geometry(geometry GEOMETRY)
|
|||||||
DECLARE
|
DECLARE
|
||||||
geo RECORD;
|
geo RECORD;
|
||||||
BEGIN
|
BEGIN
|
||||||
-- 10000000000 is ~~ 1x1 degree
|
IF ST_GeometryType(geometry) = 'ST_MultiPolygon'
|
||||||
FOR geo IN select quad_split_geometry(geometry, 0.25, 20) as geom LOOP
|
and ST_Area(geometry) * 10 > ST_Area(Box2D(geometry))
|
||||||
RETURN NEXT geo.geom;
|
THEN
|
||||||
END LOOP;
|
FOR geo IN
|
||||||
|
SELECT quad_split_geometry(g, 0.25, 20) as geom
|
||||||
|
FROM (SELECT (ST_Dump(geometry)).geom::geometry(Polygon, 4326) AS g) xx
|
||||||
|
LOOP
|
||||||
|
RETURN NEXT geo.geom;
|
||||||
|
END LOOP;
|
||||||
|
ELSE
|
||||||
|
FOR geo IN
|
||||||
|
SELECT quad_split_geometry(geometry, 0.25, 20) as geom
|
||||||
|
LOOP
|
||||||
|
RETURN NEXT geo.geom;
|
||||||
|
END LOOP;
|
||||||
|
END IF;
|
||||||
RETURN;
|
RETURN;
|
||||||
END;
|
END;
|
||||||
$$
|
$$
|
||||||
|
|||||||
@@ -23,8 +23,8 @@
|
|||||||
"allotments" : 22,
|
"allotments" : 22,
|
||||||
"neighbourhood" : [20, 22],
|
"neighbourhood" : [20, 22],
|
||||||
"quarter" : [20, 22],
|
"quarter" : [20, 22],
|
||||||
"isolated_dwelling" : [22, 20],
|
"isolated_dwelling" : [22, 25],
|
||||||
"farm" : [22, 20],
|
"farm" : [22, 25],
|
||||||
"city_block" : 25,
|
"city_block" : 25,
|
||||||
"mountain_pass" : 25,
|
"mountain_pass" : 25,
|
||||||
"square" : 25,
|
"square" : 25,
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
query-preprocessing:
|
query-preprocessing:
|
||||||
|
- step: split_japanese_phrases
|
||||||
- step: normalize
|
- step: normalize
|
||||||
normalization:
|
normalization:
|
||||||
- ":: lower ()"
|
- ":: lower ()"
|
||||||
@@ -9,16 +10,17 @@ normalization:
|
|||||||
- "'nº' > 'no'"
|
- "'nº' > 'no'"
|
||||||
- "ª > a"
|
- "ª > a"
|
||||||
- "º > o"
|
- "º > o"
|
||||||
- "[[:Punctuation:][:Symbol:]\u02bc] > ' '"
|
- "[[:Punctuation:][:Symbol:][\u02bc] - [-:]]+ > '-'"
|
||||||
- "ß > 'ss'" # German szet is unambiguously equal to double ss
|
- "ß > 'ss'" # German szet is unambiguously equal to double ss
|
||||||
- "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >"
|
- "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:] [-:]] >"
|
||||||
- "[:Lm:] >"
|
- "[:Lm:] >"
|
||||||
- ":: [[:Number:]] Latin ()"
|
- ":: [[:Number:]] Latin ()"
|
||||||
- ":: [[:Number:]] Ascii ();"
|
- ":: [[:Number:]] Ascii ();"
|
||||||
- ":: [[:Number:]] NFD ();"
|
- ":: [[:Number:]] NFD ();"
|
||||||
- "[[:Nonspacing Mark:] [:Cf:]] >;"
|
- "[[:Nonspacing Mark:] [:Cf:]] >;"
|
||||||
- "[:Space:]+ > ' '"
|
- "[-:]?[:Space:]+[-:]? > ' '"
|
||||||
transliteration:
|
transliteration:
|
||||||
|
- "[-:] > ' '"
|
||||||
- ":: Latin ()"
|
- ":: Latin ()"
|
||||||
- !include icu-rules/extended-unicode-to-asccii.yaml
|
- !include icu-rules/extended-unicode-to-asccii.yaml
|
||||||
- ":: Ascii ()"
|
- ":: Ascii ()"
|
||||||
|
|||||||
@@ -0,0 +1,61 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2025 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
This file divides Japanese addresses into three categories:
|
||||||
|
prefecture, municipality, and other.
|
||||||
|
The division is not strict but is done simply using these keywords.
|
||||||
|
"""
|
||||||
|
from typing import List
|
||||||
|
import re
|
||||||
|
|
||||||
|
from .config import QueryConfig
|
||||||
|
from .base import QueryProcessingFunc
|
||||||
|
from ..search.query import Phrase
|
||||||
|
|
||||||
|
MATCH_PATTERNS = [
|
||||||
|
r'''
|
||||||
|
(...??[都都道府県縣]) # [group1] prefecture
|
||||||
|
(.+?[市区區町村]) # [group2] municipalities (city/wards/towns/villages)
|
||||||
|
(.+) # [group3] other words
|
||||||
|
''',
|
||||||
|
r'''
|
||||||
|
(...??[都都道府県縣]) # [group1] prefecture
|
||||||
|
(.+) # [group3] other words
|
||||||
|
''',
|
||||||
|
r'''
|
||||||
|
(.+?[市区區町村]) # [group2] municipalities (city/wards/towns/villages)
|
||||||
|
(.+) # [group3] other words
|
||||||
|
'''
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class _JapanesePreprocessing:
|
||||||
|
|
||||||
|
def __init__(self, config: QueryConfig) -> None:
|
||||||
|
self.config = config
|
||||||
|
|
||||||
|
def split_phrase(self, phrase: Phrase) -> Phrase:
|
||||||
|
"""
|
||||||
|
This function performs a division on the given text using a regular expression.
|
||||||
|
"""
|
||||||
|
for pattern in MATCH_PATTERNS:
|
||||||
|
result = re.match(pattern, phrase.text, re.VERBOSE)
|
||||||
|
if result is not None:
|
||||||
|
return Phrase(phrase.ptype, ':'.join(result.groups()))
|
||||||
|
|
||||||
|
return phrase
|
||||||
|
|
||||||
|
def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
|
||||||
|
"""Split a Japanese address using japanese_tokenizer.
|
||||||
|
"""
|
||||||
|
return [self.split_phrase(p) for p in phrases]
|
||||||
|
|
||||||
|
|
||||||
|
def create(config: QueryConfig) -> QueryProcessingFunc:
|
||||||
|
""" Create a function of japanese preprocessing.
|
||||||
|
"""
|
||||||
|
return _JapanesePreprocessing(config)
|
||||||
@@ -433,6 +433,7 @@ PENALTY_WORDCHANGE = {
|
|||||||
BreakType.START: 0.0,
|
BreakType.START: 0.0,
|
||||||
BreakType.END: 0.0,
|
BreakType.END: 0.0,
|
||||||
BreakType.PHRASE: 0.0,
|
BreakType.PHRASE: 0.0,
|
||||||
|
BreakType.SOFT_PHRASE: 0.0,
|
||||||
BreakType.WORD: 0.1,
|
BreakType.WORD: 0.1,
|
||||||
BreakType.PART: 0.2,
|
BreakType.PART: 0.2,
|
||||||
BreakType.TOKEN: 0.4
|
BreakType.TOKEN: 0.4
|
||||||
|
|||||||
@@ -133,7 +133,7 @@ class ForwardGeocoder:
|
|||||||
"""
|
"""
|
||||||
assert self.query_analyzer is not None
|
assert self.query_analyzer is not None
|
||||||
qwords = [word for phrase in query.source
|
qwords = [word for phrase in query.source
|
||||||
for word in re.split('[, ]+', phrase.text) if word]
|
for word in re.split('[-,: ]+', phrase.text) if word]
|
||||||
if not qwords:
|
if not qwords:
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -146,7 +146,7 @@ class ForwardGeocoder:
|
|||||||
distance = 0.0
|
distance = 0.0
|
||||||
norm = self.query_analyzer.normalize_text(' '.join((result.display_name,
|
norm = self.query_analyzer.normalize_text(' '.join((result.display_name,
|
||||||
result.country_code or '')))
|
result.country_code or '')))
|
||||||
words = set((w for w in norm.split(' ') if w))
|
words = set((w for w in re.split('[-,: ]+', norm) if w))
|
||||||
if not words:
|
if not words:
|
||||||
continue
|
continue
|
||||||
for qword in qwords:
|
for qword in qwords:
|
||||||
|
|||||||
@@ -7,10 +7,12 @@
|
|||||||
"""
|
"""
|
||||||
Implementation of query analysis for the ICU tokenizer.
|
Implementation of query analysis for the ICU tokenizer.
|
||||||
"""
|
"""
|
||||||
from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
|
from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import dataclasses
|
import dataclasses
|
||||||
import difflib
|
import difflib
|
||||||
|
import re
|
||||||
|
from itertools import zip_longest
|
||||||
|
|
||||||
from icu import Transliterator
|
from icu import Transliterator
|
||||||
|
|
||||||
@@ -34,17 +36,30 @@ DB_TO_TOKEN_TYPE = {
|
|||||||
'C': qmod.TokenType.COUNTRY
|
'C': qmod.TokenType.COUNTRY
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PENALTY_IN_TOKEN_BREAK = {
|
||||||
|
qmod.BreakType.START: 0.5,
|
||||||
|
qmod.BreakType.END: 0.5,
|
||||||
|
qmod.BreakType.PHRASE: 0.5,
|
||||||
|
qmod.BreakType.SOFT_PHRASE: 0.5,
|
||||||
|
qmod.BreakType.WORD: 0.1,
|
||||||
|
qmod.BreakType.PART: 0.0,
|
||||||
|
qmod.BreakType.TOKEN: 0.0
|
||||||
|
}
|
||||||
|
|
||||||
class QueryPart(NamedTuple):
|
|
||||||
|
@dataclasses.dataclass
|
||||||
|
class QueryPart:
|
||||||
""" Normalized and transliterated form of a single term in the query.
|
""" Normalized and transliterated form of a single term in the query.
|
||||||
When the term came out of a split during the transliteration,
|
When the term came out of a split during the transliteration,
|
||||||
the normalized string is the full word before transliteration.
|
the normalized string is the full word before transliteration.
|
||||||
The word number keeps track of the word before transliteration
|
The word number keeps track of the word before transliteration
|
||||||
and can be used to identify partial transliterated terms.
|
and can be used to identify partial transliterated terms.
|
||||||
|
Penalty is the break penalty for the break following the token.
|
||||||
"""
|
"""
|
||||||
token: str
|
token: str
|
||||||
normalized: str
|
normalized: str
|
||||||
word_number: int
|
word_number: int
|
||||||
|
penalty: float
|
||||||
|
|
||||||
|
|
||||||
QueryParts = List[QueryPart]
|
QueryParts = List[QueryPart]
|
||||||
@@ -58,10 +73,12 @@ def yield_words(terms: List[QueryPart], start: int) -> Iterator[Tuple[str, qmod.
|
|||||||
total = len(terms)
|
total = len(terms)
|
||||||
for first in range(start, total):
|
for first in range(start, total):
|
||||||
word = terms[first].token
|
word = terms[first].token
|
||||||
yield word, qmod.TokenRange(first, first + 1)
|
penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType.WORD]
|
||||||
|
yield word, qmod.TokenRange(first, first + 1, penalty=penalty)
|
||||||
for last in range(first + 1, min(first + 20, total)):
|
for last in range(first + 1, min(first + 20, total)):
|
||||||
word = ' '.join((word, terms[last].token))
|
word = ' '.join((word, terms[last].token))
|
||||||
yield word, qmod.TokenRange(first, last + 1)
|
penalty += terms[last - 1].penalty
|
||||||
|
yield word, qmod.TokenRange(first, last + 1, penalty=penalty)
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass
|
@dataclasses.dataclass
|
||||||
@@ -94,25 +111,25 @@ class ICUToken(qmod.Token):
|
|||||||
self.penalty += (distance/len(self.lookup_word))
|
self.penalty += (distance/len(self.lookup_word))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_db_row(row: SaRow) -> 'ICUToken':
|
def from_db_row(row: SaRow, base_penalty: float = 0.0) -> 'ICUToken':
|
||||||
""" Create a ICUToken from the row of the word table.
|
""" Create a ICUToken from the row of the word table.
|
||||||
"""
|
"""
|
||||||
count = 1 if row.info is None else row.info.get('count', 1)
|
count = 1 if row.info is None else row.info.get('count', 1)
|
||||||
addr_count = 1 if row.info is None else row.info.get('addr_count', 1)
|
addr_count = 1 if row.info is None else row.info.get('addr_count', 1)
|
||||||
|
|
||||||
penalty = 0.0
|
penalty = base_penalty
|
||||||
if row.type == 'w':
|
if row.type == 'w':
|
||||||
penalty = 0.3
|
penalty += 0.3
|
||||||
elif row.type == 'W':
|
elif row.type == 'W':
|
||||||
if len(row.word_token) == 1 and row.word_token == row.word:
|
if len(row.word_token) == 1 and row.word_token == row.word:
|
||||||
penalty = 0.2 if row.word.isdigit() else 0.3
|
penalty += 0.2 if row.word.isdigit() else 0.3
|
||||||
elif row.type == 'H':
|
elif row.type == 'H':
|
||||||
penalty = sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit())
|
penalty += sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit())
|
||||||
if all(not c.isdigit() for c in row.word_token):
|
if all(not c.isdigit() for c in row.word_token):
|
||||||
penalty += 0.2 * (len(row.word_token) - 1)
|
penalty += 0.2 * (len(row.word_token) - 1)
|
||||||
elif row.type == 'C':
|
elif row.type == 'C':
|
||||||
if len(row.word_token) == 1:
|
if len(row.word_token) == 1:
|
||||||
penalty = 0.3
|
penalty += 0.3
|
||||||
|
|
||||||
if row.info is None:
|
if row.info is None:
|
||||||
lookup_word = row.word
|
lookup_word = row.word
|
||||||
@@ -202,7 +219,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
|
|||||||
|
|
||||||
for row in await self.lookup_in_db(list(words.keys())):
|
for row in await self.lookup_in_db(list(words.keys())):
|
||||||
for trange in words[row.word_token]:
|
for trange in words[row.word_token]:
|
||||||
token = ICUToken.from_db_row(row)
|
token = ICUToken.from_db_row(row, trange.penalty or 0.0)
|
||||||
if row.type == 'S':
|
if row.type == 'S':
|
||||||
if row.info['op'] in ('in', 'near'):
|
if row.info['op'] in ('in', 'near'):
|
||||||
if trange.start == 0:
|
if trange.start == 0:
|
||||||
@@ -242,16 +259,24 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
|
|||||||
wordnr = 0
|
wordnr = 0
|
||||||
for phrase in query.source:
|
for phrase in query.source:
|
||||||
query.nodes[-1].ptype = phrase.ptype
|
query.nodes[-1].ptype = phrase.ptype
|
||||||
for word in phrase.text.split(' '):
|
phrase_split = re.split('([ :-])', phrase.text)
|
||||||
|
# The zip construct will give us the pairs of word/break from
|
||||||
|
# the regular expression split. As the split array ends on the
|
||||||
|
# final word, we simply use the fillvalue to even out the list and
|
||||||
|
# add the phrase break at the end.
|
||||||
|
for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue=','):
|
||||||
|
if not word:
|
||||||
|
continue
|
||||||
trans = self.transliterator.transliterate(word)
|
trans = self.transliterator.transliterate(word)
|
||||||
if trans:
|
if trans:
|
||||||
for term in trans.split(' '):
|
for term in trans.split(' '):
|
||||||
if term:
|
if term:
|
||||||
parts.append(QueryPart(term, word, wordnr))
|
parts.append(QueryPart(term, word, wordnr,
|
||||||
|
PENALTY_IN_TOKEN_BREAK[qmod.BreakType.TOKEN]))
|
||||||
query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
|
query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
|
||||||
query.nodes[-1].btype = qmod.BreakType.WORD
|
query.nodes[-1].btype = qmod.BreakType(breakchar)
|
||||||
|
parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType(breakchar)]
|
||||||
wordnr += 1
|
wordnr += 1
|
||||||
query.nodes[-1].btype = qmod.BreakType.PHRASE
|
|
||||||
|
|
||||||
for word, wrange in yield_words(parts, phrase_start):
|
for word, wrange in yield_words(parts, phrase_start):
|
||||||
words[word].append(wrange)
|
words[word].append(wrange)
|
||||||
@@ -272,7 +297,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
|
|||||||
""" Add tokens to query that are not saved in the database.
|
""" Add tokens to query that are not saved in the database.
|
||||||
"""
|
"""
|
||||||
for part, node, i in zip(parts, query.nodes, range(1000)):
|
for part, node, i in zip(parts, query.nodes, range(1000)):
|
||||||
if len(part.token) <= 4 and part[0].isdigit()\
|
if len(part.token) <= 4 and part.token.isdigit()\
|
||||||
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
|
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
|
||||||
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
|
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
|
||||||
ICUToken(penalty=0.5, token=0,
|
ICUToken(penalty=0.5, token=0,
|
||||||
|
|||||||
@@ -21,7 +21,13 @@ class BreakType(enum.Enum):
|
|||||||
END = '>'
|
END = '>'
|
||||||
""" End of the query. """
|
""" End of the query. """
|
||||||
PHRASE = ','
|
PHRASE = ','
|
||||||
""" Break between two phrases. """
|
""" Hard break between two phrases. Address parts cannot cross hard
|
||||||
|
phrase boundaries."""
|
||||||
|
SOFT_PHRASE = ':'
|
||||||
|
""" Likely break between two phrases. Address parts should not cross soft
|
||||||
|
phrase boundaries. Soft breaks can be inserted by a preprocessor
|
||||||
|
that is analysing the input string.
|
||||||
|
"""
|
||||||
WORD = ' '
|
WORD = ' '
|
||||||
""" Break between words. """
|
""" Break between words. """
|
||||||
PART = '-'
|
PART = '-'
|
||||||
@@ -116,6 +122,7 @@ class TokenRange:
|
|||||||
"""
|
"""
|
||||||
start: int
|
start: int
|
||||||
end: int
|
end: int
|
||||||
|
penalty: Optional[float] = None
|
||||||
|
|
||||||
def __lt__(self, other: 'TokenRange') -> bool:
|
def __lt__(self, other: 'TokenRange') -> bool:
|
||||||
return self.end <= other.start
|
return self.end <= other.start
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ PENALTY_TOKENCHANGE = {
|
|||||||
qmod.BreakType.START: 0.0,
|
qmod.BreakType.START: 0.0,
|
||||||
qmod.BreakType.END: 0.0,
|
qmod.BreakType.END: 0.0,
|
||||||
qmod.BreakType.PHRASE: 0.0,
|
qmod.BreakType.PHRASE: 0.0,
|
||||||
|
qmod.BreakType.SOFT_PHRASE: 0.0,
|
||||||
qmod.BreakType.WORD: 0.1,
|
qmod.BreakType.WORD: 0.1,
|
||||||
qmod.BreakType.PART: 0.2,
|
qmod.BreakType.PART: 0.2,
|
||||||
qmod.BreakType.TOKEN: 0.4
|
qmod.BreakType.TOKEN: 0.4
|
||||||
|
|||||||
@@ -8,4 +8,4 @@
|
|||||||
Version information for the Nominatim API.
|
Version information for the Nominatim API.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
NOMINATIM_API_VERSION = '4.5.0'
|
NOMINATIM_API_VERSION = '5.0.0'
|
||||||
|
|||||||
@@ -122,13 +122,16 @@ class SetupAll:
|
|||||||
|
|
||||||
LOG.warning('Post-process tables')
|
LOG.warning('Post-process tables')
|
||||||
with connect(args.config.get_libpq_dsn()) as conn:
|
with connect(args.config.get_libpq_dsn()) as conn:
|
||||||
|
conn.autocommit = True
|
||||||
await database_import.create_search_indices(conn, args.config,
|
await database_import.create_search_indices(conn, args.config,
|
||||||
drop=args.no_updates,
|
drop=args.no_updates,
|
||||||
threads=num_threads)
|
threads=num_threads)
|
||||||
LOG.warning('Create search index for default country names.')
|
LOG.warning('Create search index for default country names.')
|
||||||
|
conn.autocommit = False
|
||||||
country_info.create_country_names(conn, tokenizer,
|
country_info.create_country_names(conn, tokenizer,
|
||||||
args.config.get_str_list('LANGUAGES'))
|
args.config.get_str_list('LANGUAGES'))
|
||||||
if args.no_updates:
|
if args.no_updates:
|
||||||
|
conn.autocommit = True
|
||||||
freeze.drop_update_tables(conn)
|
freeze.drop_update_tables(conn)
|
||||||
tokenizer.finalize_import(args.config)
|
tokenizer.finalize_import(args.config)
|
||||||
|
|
||||||
@@ -183,6 +186,7 @@ class SetupAll:
|
|||||||
from ..tools import database_import, refresh
|
from ..tools import database_import, refresh
|
||||||
|
|
||||||
with connect(config.get_libpq_dsn()) as conn:
|
with connect(config.get_libpq_dsn()) as conn:
|
||||||
|
conn.autocommit = True
|
||||||
LOG.warning('Create functions (1st pass)')
|
LOG.warning('Create functions (1st pass)')
|
||||||
refresh.create_functions(conn, config, False, False)
|
refresh.create_functions(conn, config, False, False)
|
||||||
LOG.warning('Create tables')
|
LOG.warning('Create tables')
|
||||||
|
|||||||
@@ -25,6 +25,8 @@ class ICUTokenAnalysis:
|
|||||||
|
|
||||||
def __init__(self, norm_rules: str, trans_rules: str,
|
def __init__(self, norm_rules: str, trans_rules: str,
|
||||||
analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule']):
|
analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule']):
|
||||||
|
# additional break signs are not relevant during name analysis
|
||||||
|
norm_rules += ";[[:Space:][-:]]+ > ' ';"
|
||||||
self.normalizer = Transliterator.createFromRules("icu_normalization",
|
self.normalizer = Transliterator.createFromRules("icu_normalization",
|
||||||
norm_rules)
|
norm_rules)
|
||||||
trans_rules += ";[:Space:]+ > ' '"
|
trans_rules += ";[:Space:]+ > ' '"
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ def parse_version(version: str) -> NominatimVersion:
|
|||||||
return NominatimVersion(*[int(x) for x in parts[:2] + parts[2].split('-')])
|
return NominatimVersion(*[int(x) for x in parts[:2] + parts[2].split('-')])
|
||||||
|
|
||||||
|
|
||||||
NOMINATIM_VERSION = parse_version('4.5.0-0')
|
NOMINATIM_VERSION = parse_version('5.0.0-0')
|
||||||
|
|
||||||
POSTGRESQL_REQUIRED_VERSION = (12, 0)
|
POSTGRESQL_REQUIRED_VERSION = (12, 0)
|
||||||
POSTGIS_REQUIRED_VERSION = (3, 0)
|
POSTGIS_REQUIRED_VERSION = (3, 0)
|
||||||
|
|||||||
@@ -267,3 +267,34 @@ Feature: Rank assignment
|
|||||||
| object | rank_search | rank_address |
|
| object | rank_search | rank_address |
|
||||||
| N23:amenity | 30 | 30 |
|
| N23:amenity | 30 | 30 |
|
||||||
| N23:place | 16 | 16 |
|
| N23:place | 16 | 16 |
|
||||||
|
|
||||||
|
Scenario: Address rank 25 is only used for addr:place
|
||||||
|
Given the grid
|
||||||
|
| 10 | 33 | 34 | 11 |
|
||||||
|
Given the places
|
||||||
|
| osm | class | type | name |
|
||||||
|
| N10 | place | village | vil |
|
||||||
|
| N11 | place | farm | farm |
|
||||||
|
And the places
|
||||||
|
| osm | class | type | name | geometry |
|
||||||
|
| W1 | highway | residential | RD | 33,11 |
|
||||||
|
And the places
|
||||||
|
| osm | class | type | name | addr+farm | geometry |
|
||||||
|
| W2 | highway | residential | RD2 | farm | 34,11 |
|
||||||
|
And the places
|
||||||
|
| osm | class | type | housenr |
|
||||||
|
| N33 | place | house | 23 |
|
||||||
|
And the places
|
||||||
|
| osm | class | type | housenr | addr+place |
|
||||||
|
| N34 | place | house | 23 | farm |
|
||||||
|
When importing
|
||||||
|
Then placex contains
|
||||||
|
| object | parent_place_id |
|
||||||
|
| N11 | N10 |
|
||||||
|
| N33 | W1 |
|
||||||
|
| N34 | N11 |
|
||||||
|
And place_addressline contains
|
||||||
|
| object | address |
|
||||||
|
| W1 | N10 |
|
||||||
|
| W2 | N10 |
|
||||||
|
| W2 | N11 |
|
||||||
|
|||||||
@@ -0,0 +1,34 @@
|
|||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
#
|
||||||
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
|
#
|
||||||
|
# Copyright (C) 2025 by the Nominatim developer community.
|
||||||
|
# For a full list of authors see the git log.
|
||||||
|
"""
|
||||||
|
Tests for japanese phrase splitting.
|
||||||
|
"""
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from icu import Transliterator
|
||||||
|
|
||||||
|
import nominatim_api.search.query as qmod
|
||||||
|
from nominatim_api.query_preprocessing.config import QueryConfig
|
||||||
|
from nominatim_api.query_preprocessing import split_japanese_phrases
|
||||||
|
|
||||||
|
def run_preprocessor_on(query):
|
||||||
|
proc = split_japanese_phrases.create(QueryConfig().set_normalizer(None))
|
||||||
|
|
||||||
|
return proc(query)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('inp,outp', [('大阪府大阪市大阪', '大阪府:大阪市:大阪'),
|
||||||
|
('大阪府大阪', '大阪府:大阪'),
|
||||||
|
('大阪市大阪', '大阪市:大阪')])
|
||||||
|
def test_split_phrases(inp, outp):
|
||||||
|
query = [qmod.Phrase(qmod.PhraseType.NONE, inp)]
|
||||||
|
|
||||||
|
out = run_preprocessor_on(query)
|
||||||
|
|
||||||
|
assert out == [qmod.Phrase(qmod.PhraseType.NONE, outp)]
|
||||||
@@ -23,14 +23,10 @@ def test_refresh_import_secondary_importance_non_existing(dsn):
|
|||||||
|
|
||||||
def test_refresh_import_secondary_importance_testdb(dsn, src_dir, temp_db_conn, temp_db_cursor):
|
def test_refresh_import_secondary_importance_testdb(dsn, src_dir, temp_db_conn, temp_db_cursor):
|
||||||
temp_db_cursor.execute('CREATE EXTENSION postgis')
|
temp_db_cursor.execute('CREATE EXTENSION postgis')
|
||||||
|
temp_db_cursor.execute('CREATE EXTENSION postgis_raster')
|
||||||
|
assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') == 0
|
||||||
|
|
||||||
if postgis_version_tuple(temp_db_conn)[0] < 3:
|
assert temp_db_cursor.table_exists('secondary_importance')
|
||||||
assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') > 0
|
|
||||||
else:
|
|
||||||
temp_db_cursor.execute('CREATE EXTENSION postgis_raster')
|
|
||||||
assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') == 0
|
|
||||||
|
|
||||||
assert temp_db_cursor.table_exists('secondary_importance')
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("replace", (True, False))
|
@pytest.mark.parametrize("replace", (True, False))
|
||||||
@@ -41,8 +37,7 @@ def test_refresh_import_wikipedia(dsn, src_dir, table_factory, temp_db_cursor, r
|
|||||||
# use the small wikipedia file for the API testdb
|
# use the small wikipedia file for the API testdb
|
||||||
assert refresh.import_wikipedia_articles(dsn, src_dir / 'test' / 'testdb') == 0
|
assert refresh.import_wikipedia_articles(dsn, src_dir / 'test' / 'testdb') == 0
|
||||||
|
|
||||||
assert temp_db_cursor.table_rows('wikipedia_article') > 0
|
assert temp_db_cursor.table_rows('wikimedia_importance') > 0
|
||||||
assert temp_db_cursor.table_rows('wikipedia_redirect') > 0
|
|
||||||
|
|
||||||
|
|
||||||
def test_recompute_importance(placex_table, table_factory, temp_db_conn, temp_db_cursor):
|
def test_recompute_importance(placex_table, table_factory, temp_db_conn, temp_db_cursor):
|
||||||
|
|||||||
BIN
test/testdb/wikimedia-importance.csv.gz
Normal file
BIN
test/testdb/wikimedia-importance.csv.gz
Normal file
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user