Compare commits

...

21 Commits

Author SHA1 Message Date
Sarah Hoffmann
ee8915f2b6 prepare 5.0.0 release 2025-02-05 10:54:38 +01:00
Sarah Hoffmann
5475bf7b9c Merge pull request #3635 from lonvia/replace-wikimedia-importance-test-data
Update wikimedia importance file for test database
2025-01-14 16:49:52 +01:00
Sarah Hoffmann
95e2d8c846 adapt tests to changed wikimedia importance test table 2025-01-14 14:19:17 +01:00
Sarah Hoffmann
7552818866 replace wikimedia importance file for test data with CSV version 2025-01-14 09:16:25 +01:00
Sarah Hoffmann
db3991af74 Merge pull request #3626 from lonvia/import-performance
Import performance
2025-01-10 16:44:33 +01:00
Sarah Hoffmann
4523b9aaed Merge pull request #3631 from lonvia/avoid-transactions
Creating tables and indexes in autocommit mode
2025-01-10 16:44:18 +01:00
Sarah Hoffmann
8b1cabebd6 Merge pull request #3633 from lonvia/restrict-long-ways
Ignore overly long ways during import
2025-01-10 16:06:37 +01:00
Sarah Hoffmann
0cf636a80c ignore overly long ways during import 2025-01-10 13:55:43 +01:00
Sarah Hoffmann
c2cb6722fe use autocommit when creating tables and indexes
Might avoid some deadlock situations with autovacuum.
2025-01-09 17:14:37 +01:00
Sarah Hoffmann
f8337bedb2 Merge pull request #3629 from lonvia/additional-breaks
Introduce new break types and phrase splitting for Japanese addresses
2025-01-09 13:55:29 +01:00
Sarah Hoffmann
efc09a5cfc add japanese phrase preprocessing
Code adapted from GSOC code by @miku.
2025-01-09 09:24:10 +01:00
Sarah Hoffmann
86ad9efa8a keep break indicators [:-] during normalisation
All punctuation will be converted to '-'. Soft breaks : may be
added by preprocessors. The break signs are only used during
query analysis and are ignored during import token analysis.
2025-01-09 09:21:55 +01:00
Sarah Hoffmann
d984100e23 add inner word break penalty 2025-01-07 21:42:25 +01:00
Sarah Hoffmann
499110f549 add SOFT_PHRASE break and enable parsing
Also enables parsing of PART breaks.
2025-01-06 17:10:24 +01:00
Sarah Hoffmann
267e5dac0d split up MultiPolygons before adding them to large_areas table 2024-12-22 09:15:16 +01:00
Sarah Hoffmann
32d3eb46d5 move geometry split into insertLocationAreaLarge()
thus insert only needs to be called once.
2024-12-22 09:15:16 +01:00
Sarah Hoffmann
c8a0dc8af1 more efficient belongs-to-address determination 2024-12-22 09:15:16 +01:00
Sarah Hoffmann
14ecfc7834 Merge pull request #3619 from lonvia/demote-farms
Remove farms and isolated dwellings from computed addresses
2024-12-22 09:13:42 +01:00
Sarah Hoffmann
cad44eb00c remove farms and isolated dwellings from computed addresses
Farms and isolated dwellings are usually confined to a very small
area. It does not make sense if they are automatically used in
addressing surrounding features. Still works to use them for
parenting when used with addr:place.
2024-12-20 22:59:02 +01:00
Sarah Hoffmann
f76dbb0a16 docs: update Update docs for virtualenv use 2024-12-20 11:27:45 +01:00
Sarah Hoffmann
8dd218a1d0 Merge pull request #3618 from osm-search/settings-md-table-space-osm-index
Settings.md - one setting was repeated
2024-12-19 08:40:31 +01:00
27 changed files with 342 additions and 156 deletions

View File

@@ -87,7 +87,6 @@ Checklist for releases:
* [ ] increase versions in
* `src/nominatim_api/version.py`
* `src/nominatim_db/version.py`
* CMakeLists.txt
* [ ] update `ChangeLog` (copy information from patch releases from release branch)
* [ ] complete `docs/admin/Migration.md`
* [ ] update EOL dates in `SECURITY.md`

View File

@@ -1,3 +1,28 @@
5.0.0
* increase required versions for PostgreSQL (12+), PostGIS (3.0+)
* remove installation via cmake and debundle osm2pgsql
* remove deprecated PHP frontend
* remove deprecated legacy tokenizer
* add configurable pre-processing of queries
* add query pre-processor to split up Japanese addresses
* rewrite of osm2pgsql style implementation
(also adds support for osm2pgsql-themepark)
* reduce the number of SQL queries needed to complete a 'lookup' call
* improve computation of centroid for lines with only two points
* improve bbox output for postcode areas
* improve result order by returning the largest object when other things are
equal
* add fallback for reverse geocoding to default country tables
* exclude postcode areas from reverse geocoding
* disable search endpoint when database is reverse-only (regression)
* minor performance improvements to area split algorithm
* switch table and index creation to use autocommit mode to avoid deadlocks
* drop overly long ways during import
* restrict automatic migrations to versions 4.3+
* switch linting from pylint to flake8
* switch tests to use a wikimedia test file in the new CSV style
* various fixes and improvements to documentation
4.5.0
* allow building Nominatim as a pip package
* make osm2pgsql building optional

View File

@@ -9,10 +9,10 @@ versions.
| Version | End of support for security updates |
| ------- | ----------------------------------- |
| 5.0.x | 2027-02-06 |
| 4.5.x | 2026-09-12 |
| 4.4.x | 2026-03-07 |
| 4.3.x | 2025-09-07 |
| 4.2.x | 2024-11-24 |
## Reporting a Vulnerability

View File

@@ -9,19 +9,15 @@ the following steps:
* Update the frontend: `pip install -U nominatim-api`
* (optionally) Restart updates
If you are still using CMake for the installation of Nominatim, then you
need to update the software in one step before migrating the database.
It is not recommended to do this while the machine is serving requests.
Below you find additional migrations and hints about other structural and
breaking changes. **Please read them before running the migration.**
!!! note
If you are migrating from a version <4.3, you need to install 4.3
first and migrate to 4.3 first. Then you can migrate to the current
and migrate to 4.3 first. Then you can migrate to the current
version. It is strongly recommended to do a reimport instead.
## 4.5.0 -> master
## 4.5.0 -> 5.0.0
### PHP frontend removed
@@ -33,6 +29,42 @@ needed. It currently emits a warning and does otherwise nothing. It will be
removed in later versions of Nominatim. So make sure you remove it from your
scripts.
### CMake building removed
Nominatim can now only be installed via pip. Please follow the installation
instructions for the current version to change to pip.
### osm2pgsql no longer vendored in
Nominatim no longer ships its own version of osm2pgsql. Please install a
stock version of osm2pgsql from your distribution. See the
[installation instruction for osm2pgsql](https://osm2pgsql.org/doc/install.html)
for details. A minimum version of 1.8 is required. The current stable versions
of Ubuntu and Debian already ship with an appropriate version. For older
installations, you may have to compile a newer osm2pgsql yourself.
### Legacy tokenizer removed
The `legacy` tokenizer is no longer enabled. This tokenizer has been superseded
by the `ICU` tokenizer a long time ago. In the unlikely case that your database
still uses the `legacy` tokenizer, you must reimport your database.
### osm2pgsql style overhauled
There are some fundamental changes to how customized osm2pgsql styles should
be written. The changes are mostly backwards compatible, i.e. custom styles
should still work with the new implementation. The only exception is a
customization of the `process_tags()` function. This function is no longer
considered public and neither are the helper functions used in it.
They currently still work but will be removed at some point. If you have
been making changes to `process_tags`, please review your style and try
to switch to the new convenience functions.
For more information on the changes, see the
[pull request](https://github.com/osm-search/Nominatim/pull/3615)
and read the new
[customization documentation](https://nominatim.org/release-docs/latest/customize/Import-Styles/).
## 4.4.0 -> 4.5.0
### New structure for Python packages

View File

@@ -68,10 +68,10 @@ the update interval no new data has been published yet, it will go to sleep
until the next expected update and only then attempt to download the next batch.
The one-time mode is particularly useful if you want to run updates continuously
but need to schedule other work in between updates. For example, the main
service at osm.org uses it, to regularly recompute postcodes -- a process that
must not be run while updates are in progress. Its update script
looks like this:
but need to schedule other work in between updates. For example, you might
want to regularly recompute postcodes -- a process that
must not be run while updates are in progress. An update script refreshing
postcodes regularly might look like this:
```sh
#!/bin/bash
@@ -109,17 +109,19 @@ Unit=nominatim-updates.service
WantedBy=multi-user.target
```
And then a similar service definition: `/etc/systemd/system/nominatim-updates.service`:
`OnUnitActiveSec` defines how often the individual update command is run.
Then add a service definition for the timer in `/etc/systemd/system/nominatim-updates.service`:
```
[Unit]
Description=Single updates of Nominatim
[Service]
WorkingDirectory=/srv/nominatim
ExecStart=nominatim replication --once
StandardOutput=append:/var/log/nominatim-updates.log
StandardError=append:/var/log/nominatim-updates.error.log
WorkingDirectory=/srv/nominatim-project
ExecStart=/srv/nominatim-venv/bin/nominatim replication --once
StandardOutput=journald
StandardError=inherit
User=nominatim
Group=nominatim
Type=simple
@@ -128,9 +130,9 @@ Type=simple
WantedBy=multi-user.target
```
Replace the `WorkingDirectory` with your project directory. Also adapt user and
group names as required. `OnUnitActiveSec` defines how often the individual
update command is run.
Replace the `WorkingDirectory` with your project directory. `ExecStart` points
to the nominatim binary that was installed in your virtualenv earlier.
Finally, you might need to adapt user and group names as required.
Now activate the service and start the updates:
@@ -140,12 +142,13 @@ sudo systemctl enable nominatim-updates.timer
sudo systemctl start nominatim-updates.timer
```
You can stop future data updates, while allowing any current, in-progress
You can stop future data updates while allowing any current, in-progress
update steps to finish, by running `sudo systemctl stop
nominatim-updates.timer` and waiting until `nominatim-updates.service` isn't
running (`sudo systemctl is-active nominatim-updates.service`). Current output
from the update can be seen like above (`systemctl status
nominatim-updates.service`).
running (`sudo systemctl is-active nominatim-updates.service`).
To check the output from the update process, use journalctl: `journalctl -u
nominatim-updates.service`
#### Catch-up mode
@@ -155,13 +158,13 @@ all changes from the server until the database is up-to-date. The catch-up mode
still respects the parameter `NOMINATIM_REPLICATION_MAX_DIFF`. It downloads and
applies the changes in appropriate batches until all is done.
The catch-up mode is foremost useful to bring the database up to speed after the
The catch-up mode is foremost useful to bring the database up to date after the
initial import. Given that the service usually is not in production at this
point, you can temporarily be a bit more generous with the batch size and
number of threads you use for the updates by running catch-up like this:
```
cd /srv/nominatim
cd /srv/nominatim-project
NOMINATIM_REPLICATION_MAX_DIFF=5000 nominatim replication --catch-up --threads 15
```
@@ -173,13 +176,13 @@ replication catch-up at whatever interval you desire.
When running scheduled updates with catch-up, it is a good idea to choose
a replication source with an update frequency that is an order of magnitude
higher. For example, if you want to update once a day, use an hourly updated
source. This makes sure that you don't miss an entire day of updates when
source. This ensures that you don't miss an entire day of updates when
the source is unexpectedly late to publish its update.
If you want to use the source with the same update frequency (e.g. a daily
updated source with daily updates), use the
continuous update mode. It ensures to re-request the newest update until it
is published.
once mode together with a frequently run systemd script as described above.
This ensures that the newest update is re-requested until it has been published.
#### Continuous updates
@@ -197,36 +200,3 @@ parameters:
The update application keeps running forever and retrieves and applies
new updates from the server as they are published.
You can run this command as a simple systemd service. Create a service
description like that in `/etc/systemd/system/nominatim-updates.service`:
```
[Unit]
Description=Continuous updates of Nominatim
[Service]
WorkingDirectory=/srv/nominatim
ExecStart=nominatim replication
StandardOutput=append:/var/log/nominatim-updates.log
StandardError=append:/var/log/nominatim-updates.error.log
User=nominatim
Group=nominatim
Type=simple
[Install]
WantedBy=multi-user.target
```
Replace the `WorkingDirectory` with your project directory. Also adapt user
and group names as required.
Now activate the service and start the updates:
```
sudo systemctl daemon-reload
sudo systemctl enable nominatim-updates
sudo systemctl start nominatim-updates
```

View File

@@ -326,7 +326,7 @@ defined primary names are forgotten.)
| Name | Description |
| :----- | :---------- |
| core | Basic set of recogniced names for all places. |
| core | Basic set of recognized names for all places. |
| address | Additional names useful when indexing full addresses. |
| poi | Extended set of recognized names for pois. Use on top of the core set. |

View File

@@ -50,7 +50,7 @@ queries. This happens in two stages:
as during the import process but may involve other processing like,
for example, word break detection.
2. The **token analysis** step breaks down the query parts into tokens,
looks them up in the database and assignes them possible functions and
looks them up in the database and assigns them possible functions and
probabilities.
Query processing can be further customized while the rest of the analysis

View File

@@ -425,7 +425,7 @@ function Place:write_row(k, v)
if self.geometry == nil then
self.geometry = self.geom_func(self.object)
end
if self.geometry:is_null() then
if self.geometry == nil or self.geometry:is_null() then
return 0
end
@@ -608,6 +608,9 @@ function module.process_way(object)
if geom:is_null() then
geom = o:as_linestring()
if geom:is_null() or geom:length() > 30 then
return nil
end
end
return geom

View File

@@ -17,28 +17,6 @@ CREATE TYPE nearfeaturecentr AS (
centroid GEOMETRY
);
-- feature intersects geometry
-- for areas and linestrings they must touch at least along a line
CREATE OR REPLACE FUNCTION is_relevant_geometry(de9im TEXT, geom_type TEXT)
RETURNS BOOLEAN
AS $$
BEGIN
IF substring(de9im from 1 for 2) != 'FF' THEN
RETURN TRUE;
END IF;
IF geom_type = 'ST_Point' THEN
RETURN substring(de9im from 4 for 1) = '0';
END IF;
IF geom_type in ('ST_LineString', 'ST_MultiLineString') THEN
RETURN substring(de9im from 4 for 1) = '1';
END IF;
RETURN substring(de9im from 4 for 1) = '2';
END
$$ LANGUAGE plpgsql IMMUTABLE;
CREATE OR REPLACE function getNearFeatures(in_partition INTEGER, feature GEOMETRY,
feature_centroid GEOMETRY,
maxrank INTEGER)
@@ -59,7 +37,12 @@ BEGIN
isguess, postcode, centroid
FROM location_area_large_{{ partition }}
WHERE geometry && feature
AND is_relevant_geometry(ST_Relate(geometry, feature), ST_GeometryType(feature))
AND CASE WHEN ST_Dimension(feature) = 0
THEN _ST_Covers(geometry, feature)
WHEN ST_Dimension(feature) = 2
THEN ST_Relate(geometry, feature, 'T********')
ELSE ST_NPoints(ST_Intersection(geometry, feature)) > 1
END
AND rank_address < maxrank
-- Postcodes currently still use rank_search to define for which
-- features they are relevant.
@@ -142,14 +125,16 @@ BEGIN
IF in_rank_search <= 4 and not in_estimate THEN
INSERT INTO location_area_country (place_id, country_code, geometry)
values (in_place_id, in_country_code, in_geometry);
(SELECT in_place_id, in_country_code, geom
FROM split_geometry(in_geometry) as geom);
RETURN TRUE;
END IF;
{% for partition in db.partitions %}
IF in_partition = {{ partition }} THEN
INSERT INTO location_area_large_{{ partition }} (partition, place_id, country_code, keywords, rank_search, rank_address, isguess, postcode, centroid, geometry)
values (in_partition, in_place_id, in_country_code, in_keywords, in_rank_search, in_rank_address, in_estimate, postcode, in_centroid, in_geometry);
(SELECT in_partition, in_place_id, in_country_code, in_keywords, in_rank_search, in_rank_address, in_estimate, postcode, in_centroid, geom
FROM split_geometry(in_geometry) as geom);
RETURN TRUE;
END IF;
{% endfor %}

View File

@@ -348,8 +348,6 @@ CREATE OR REPLACE FUNCTION add_location(place_id BIGINT, country_code varchar(2)
RETURNS BOOLEAN
AS $$
DECLARE
locationid INTEGER;
secgeo GEOMETRY;
postcode TEXT;
BEGIN
PERFORM deleteLocationArea(partition, place_id, rank_search);
@@ -360,18 +358,19 @@ BEGIN
postcode := upper(trim (in_postcode));
END IF;
IF ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon') THEN
FOR secgeo IN select split_geometry(geometry) AS geom LOOP
PERFORM insertLocationAreaLarge(partition, place_id, country_code, keywords, rank_search, rank_address, false, postcode, centroid, secgeo);
END LOOP;
ELSEIF ST_GeometryType(geometry) = 'ST_Point' THEN
secgeo := place_node_fuzzy_area(geometry, rank_search);
PERFORM insertLocationAreaLarge(partition, place_id, country_code, keywords, rank_search, rank_address, true, postcode, centroid, secgeo);
IF ST_Dimension(geometry) = 2 THEN
RETURN insertLocationAreaLarge(partition, place_id, country_code, keywords,
rank_search, rank_address, false, postcode,
centroid, geometry);
END IF;
RETURN true;
IF ST_Dimension(geometry) = 0 THEN
RETURN insertLocationAreaLarge(partition, place_id, country_code, keywords,
rank_search, rank_address, true, postcode,
centroid, place_node_fuzzy_area(geometry, rank_search));
END IF;
RETURN false;
END;
$$
LANGUAGE plpgsql;
@@ -394,19 +393,21 @@ DECLARE
geo RECORD;
area FLOAT;
remainingdepth INTEGER;
added INTEGER;
BEGIN
-- RAISE WARNING 'quad_split_geometry: maxarea=%, depth=%',maxarea,maxdepth;
IF (ST_GeometryType(geometry) not in ('ST_Polygon','ST_MultiPolygon') OR NOT ST_IsValid(geometry)) THEN
IF not ST_IsValid(geometry) THEN
RETURN;
END IF;
IF ST_Dimension(geometry) != 2 OR maxdepth <= 1 THEN
RETURN NEXT geometry;
RETURN;
END IF;
remainingdepth := maxdepth - 1;
area := ST_AREA(geometry);
IF remainingdepth < 1 OR area < maxarea THEN
IF area < maxarea THEN
RETURN NEXT geometry;
RETURN;
END IF;
@@ -426,7 +427,6 @@ BEGIN
xmid := (xmin+xmax)/2;
ymid := (ymin+ymax)/2;
added := 0;
FOR seg IN 1..4 LOOP
IF seg = 1 THEN
@@ -442,16 +442,13 @@ BEGIN
secbox := ST_SetSRID(ST_MakeBox2D(ST_Point(xmid,ymid),ST_Point(xmax,ymax)),4326);
END IF;
IF st_intersects(geometry, secbox) THEN
secgeo := st_intersection(geometry, secbox);
IF NOT ST_IsEmpty(secgeo) AND ST_GeometryType(secgeo) in ('ST_Polygon','ST_MultiPolygon') THEN
FOR geo IN select quad_split_geometry(secgeo, maxarea, remainingdepth) as geom LOOP
IF NOT ST_IsEmpty(geo.geom) AND ST_GeometryType(geo.geom) in ('ST_Polygon','ST_MultiPolygon') THEN
added := added + 1;
RETURN NEXT geo.geom;
END IF;
END LOOP;
END IF;
secgeo := st_intersection(geometry, secbox);
IF NOT ST_IsEmpty(secgeo) AND ST_Dimension(secgeo) = 2 THEN
FOR geo IN SELECT quad_split_geometry(secgeo, maxarea, remainingdepth) as geom LOOP
IF NOT ST_IsEmpty(geo.geom) AND ST_Dimension(geo.geom) = 2 THEN
RETURN NEXT geo.geom;
END IF;
END LOOP;
END IF;
END LOOP;
@@ -467,10 +464,22 @@ CREATE OR REPLACE FUNCTION split_geometry(geometry GEOMETRY)
DECLARE
geo RECORD;
BEGIN
-- 10000000000 is ~~ 1x1 degree
FOR geo IN select quad_split_geometry(geometry, 0.25, 20) as geom LOOP
RETURN NEXT geo.geom;
END LOOP;
IF ST_GeometryType(geometry) = 'ST_MultiPolygon'
and ST_Area(geometry) * 10 > ST_Area(Box2D(geometry))
THEN
FOR geo IN
SELECT quad_split_geometry(g, 0.25, 20) as geom
FROM (SELECT (ST_Dump(geometry)).geom::geometry(Polygon, 4326) AS g) xx
LOOP
RETURN NEXT geo.geom;
END LOOP;
ELSE
FOR geo IN
SELECT quad_split_geometry(geometry, 0.25, 20) as geom
LOOP
RETURN NEXT geo.geom;
END LOOP;
END IF;
RETURN;
END;
$$

View File

@@ -23,8 +23,8 @@
"allotments" : 22,
"neighbourhood" : [20, 22],
"quarter" : [20, 22],
"isolated_dwelling" : [22, 20],
"farm" : [22, 20],
"isolated_dwelling" : [22, 25],
"farm" : [22, 25],
"city_block" : 25,
"mountain_pass" : 25,
"square" : 25,

View File

@@ -1,4 +1,5 @@
query-preprocessing:
- step: split_japanese_phrases
- step: normalize
normalization:
- ":: lower ()"
@@ -9,16 +10,17 @@ normalization:
- "'nº' > 'no'"
- "ª > a"
- "º > o"
- "[[:Punctuation:][:Symbol:]\u02bc] > ' '"
- "[[:Punctuation:][:Symbol:][\u02bc] - [-:]]+ > '-'"
- "ß > 'ss'" # German szet is unambiguously equal to double ss
- "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >"
- "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:] [-:]] >"
- "[:Lm:] >"
- ":: [[:Number:]] Latin ()"
- ":: [[:Number:]] Ascii ();"
- ":: [[:Number:]] NFD ();"
- "[[:Nonspacing Mark:] [:Cf:]] >;"
- "[:Space:]+ > ' '"
- "[-:]?[:Space:]+[-:]? > ' '"
transliteration:
- "[-:] > ' '"
- ":: Latin ()"
- !include icu-rules/extended-unicode-to-asccii.yaml
- ":: Ascii ()"

View File

@@ -0,0 +1,61 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
This file divides Japanese addresses into three categories:
prefecture, municipality, and other.
The division is not strict but a simple split based on these keywords.
"""
from typing import List
import re
from .config import QueryConfig
from .base import QueryProcessingFunc
from ..search.query import Phrase
MATCH_PATTERNS = [
r'''
(...??[都都道府県縣]) # [group1] prefecture
(.+?[市区區町村]) # [group2] municipalities (city/wards/towns/villages)
(.+) # [group3] other words
''',
r'''
(...??[都都道府県縣]) # [group1] prefecture
(.+) # [group3] other words
''',
r'''
(.+?[市区區町村]) # [group2] municipalities (city/wards/towns/villages)
(.+) # [group3] other words
'''
]
class _JapanesePreprocessing:
def __init__(self, config: QueryConfig) -> None:
self.config = config
def split_phrase(self, phrase: Phrase) -> Phrase:
"""
This function performs a division on the given text using a regular expression.
"""
for pattern in MATCH_PATTERNS:
result = re.match(pattern, phrase.text, re.VERBOSE)
if result is not None:
return Phrase(phrase.ptype, ':'.join(result.groups()))
return phrase
def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
"""Split a Japanese address using japanese_tokenizer.
"""
return [self.split_phrase(p) for p in phrases]
def create(config: QueryConfig) -> QueryProcessingFunc:
""" Create a function of japanese preprocessing.
"""
return _JapanesePreprocessing(config)
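
Editor's note (not part of the changeset): as a quick illustration of how the first pattern above splits an address, here is a minimal, self-contained sketch; the sample input and expected output are taken from the test file at the end of this compare:

```python
import re

# First entry of MATCH_PATTERNS: prefecture / municipality / remainder.
PATTERN = r'''
    (...??[都都道府県縣])    # [group1] prefecture
    (.+?[市区區町村])        # [group2] municipality (city/ward/town/village)
    (.+)                     # [group3] other words
'''

match = re.match(PATTERN, '大阪府大阪市大阪', re.VERBOSE)
print(':'.join(match.groups()))  # -> 大阪府:大阪市:大阪
```

The joined result is exactly what `split_phrase()` returns as the new phrase text.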

View File

@@ -433,6 +433,7 @@ PENALTY_WORDCHANGE = {
BreakType.START: 0.0,
BreakType.END: 0.0,
BreakType.PHRASE: 0.0,
BreakType.SOFT_PHRASE: 0.0,
BreakType.WORD: 0.1,
BreakType.PART: 0.2,
BreakType.TOKEN: 0.4

View File

@@ -133,7 +133,7 @@ class ForwardGeocoder:
"""
assert self.query_analyzer is not None
qwords = [word for phrase in query.source
for word in re.split('[, ]+', phrase.text) if word]
for word in re.split('[-,: ]+', phrase.text) if word]
if not qwords:
return
@@ -146,7 +146,7 @@ class ForwardGeocoder:
distance = 0.0
norm = self.query_analyzer.normalize_text(' '.join((result.display_name,
result.country_code or '')))
words = set((w for w in norm.split(' ') if w))
words = set((w for w in re.split('[-,: ]+', norm) if w))
if not words:
continue
for qword in qwords:
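
Editor's note (not part of the diff): the widened split pattern means display names are compared piece by piece across hyphens and soft phrase breaks as well. A small sketch of the difference, with a made-up street name:

```python
import re

name = 'Karl-Marx-Straße, Berlin'
print(re.split('[, ]+', name))    # ['Karl-Marx-Straße', 'Berlin']
print(re.split('[-,: ]+', name))  # ['Karl', 'Marx', 'Straße', 'Berlin']
```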

View File

@@ -7,10 +7,12 @@
"""
Implementation of query analysis for the ICU tokenizer.
"""
from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
from collections import defaultdict
import dataclasses
import difflib
import re
from itertools import zip_longest
from icu import Transliterator
@@ -34,17 +36,30 @@ DB_TO_TOKEN_TYPE = {
'C': qmod.TokenType.COUNTRY
}
PENALTY_IN_TOKEN_BREAK = {
qmod.BreakType.START: 0.5,
qmod.BreakType.END: 0.5,
qmod.BreakType.PHRASE: 0.5,
qmod.BreakType.SOFT_PHRASE: 0.5,
qmod.BreakType.WORD: 0.1,
qmod.BreakType.PART: 0.0,
qmod.BreakType.TOKEN: 0.0
}
class QueryPart(NamedTuple):
@dataclasses.dataclass
class QueryPart:
""" Normalized and transliterated form of a single term in the query.
When the term came out of a split during the transliteration,
the normalized string is the full word before transliteration.
The word number keeps track of the word before transliteration
and can be used to identify partial transliterated terms.
Penalty is the break penalty for the break following the token.
"""
token: str
normalized: str
word_number: int
penalty: float
QueryParts = List[QueryPart]
@@ -58,10 +73,12 @@ def yield_words(terms: List[QueryPart], start: int) -> Iterator[Tuple[str, qmod.
total = len(terms)
for first in range(start, total):
word = terms[first].token
yield word, qmod.TokenRange(first, first + 1)
penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType.WORD]
yield word, qmod.TokenRange(first, first + 1, penalty=penalty)
for last in range(first + 1, min(first + 20, total)):
word = ' '.join((word, terms[last].token))
yield word, qmod.TokenRange(first, last + 1)
penalty += terms[last - 1].penalty
yield word, qmod.TokenRange(first, last + 1, penalty=penalty)
@dataclasses.dataclass
@@ -94,25 +111,25 @@ class ICUToken(qmod.Token):
self.penalty += (distance/len(self.lookup_word))
@staticmethod
def from_db_row(row: SaRow) -> 'ICUToken':
def from_db_row(row: SaRow, base_penalty: float = 0.0) -> 'ICUToken':
""" Create a ICUToken from the row of the word table.
"""
count = 1 if row.info is None else row.info.get('count', 1)
addr_count = 1 if row.info is None else row.info.get('addr_count', 1)
penalty = 0.0
penalty = base_penalty
if row.type == 'w':
penalty = 0.3
penalty += 0.3
elif row.type == 'W':
if len(row.word_token) == 1 and row.word_token == row.word:
penalty = 0.2 if row.word.isdigit() else 0.3
penalty += 0.2 if row.word.isdigit() else 0.3
elif row.type == 'H':
penalty = sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit())
penalty += sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit())
if all(not c.isdigit() for c in row.word_token):
penalty += 0.2 * (len(row.word_token) - 1)
elif row.type == 'C':
if len(row.word_token) == 1:
penalty = 0.3
penalty += 0.3
if row.info is None:
lookup_word = row.word
@@ -202,7 +219,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
for row in await self.lookup_in_db(list(words.keys())):
for trange in words[row.word_token]:
token = ICUToken.from_db_row(row)
token = ICUToken.from_db_row(row, trange.penalty or 0.0)
if row.type == 'S':
if row.info['op'] in ('in', 'near'):
if trange.start == 0:
@@ -242,16 +259,24 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
wordnr = 0
for phrase in query.source:
query.nodes[-1].ptype = phrase.ptype
for word in phrase.text.split(' '):
phrase_split = re.split('([ :-])', phrase.text)
# The zip construct will give us the pairs of word/break from
# the regular expression split. As the split array ends on the
# final word, we simply use the fillvalue to even out the list and
# add the phrase break at the end.
for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue=','):
if not word:
continue
trans = self.transliterator.transliterate(word)
if trans:
for term in trans.split(' '):
if term:
parts.append(QueryPart(term, word, wordnr))
parts.append(QueryPart(term, word, wordnr,
PENALTY_IN_TOKEN_BREAK[qmod.BreakType.TOKEN]))
query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
query.nodes[-1].btype = qmod.BreakType.WORD
query.nodes[-1].btype = qmod.BreakType(breakchar)
parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType(breakchar)]
wordnr += 1
query.nodes[-1].btype = qmod.BreakType.PHRASE
for word, wrange in yield_words(parts, phrase_start):
words[word].append(wrange)
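
Editor's note (not part of the diff): the `re.split`/`zip_longest` construct above yields word/break pairs, with the fill value `','` supplying the phrase break after the final word. A minimal sketch with a made-up query:

```python
import re
from itertools import zip_longest

phrase_split = re.split('([ :-])', 'hauptstrasse 34:berlin')
# -> ['hauptstrasse', ' ', '34', ':', 'berlin']

for word, breakchar in zip_longest(*[iter(phrase_split)] * 2, fillvalue=','):
    print(word, repr(breakchar))
# hauptstrasse ' '
# 34 ':'
# berlin ','
```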
@@ -272,7 +297,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
""" Add tokens to query that are not saved in the database.
"""
for part, node, i in zip(parts, query.nodes, range(1000)):
if len(part.token) <= 4 and part[0].isdigit()\
if len(part.token) <= 4 and part.token.isdigit()\
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
ICUToken(penalty=0.5, token=0,

View File

@@ -21,7 +21,13 @@ class BreakType(enum.Enum):
END = '>'
""" End of the query. """
PHRASE = ','
""" Break between two phrases. """
""" Hard break between two phrases. Address parts cannot cross hard
phrase boundaries."""
SOFT_PHRASE = ':'
""" Likely break between two phrases. Address parts should not cross soft
phrase boundaries. Soft breaks can be inserted by a preprocessor
that is analysing the input string.
"""
WORD = ' '
""" Break between words. """
PART = '-'
@@ -116,6 +122,7 @@ class TokenRange:
"""
start: int
end: int
penalty: Optional[float] = None
def __lt__(self, other: 'TokenRange') -> bool:
return self.end <= other.start
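
Editor's note (not part of the diff): `BreakType` is a value-based enum, so the separator captured by the splitter in `icu_tokenizer.py` above maps directly back to a break type:

```python
from nominatim_api.search.query import BreakType

# Assumes the module path used by the tests in this changeset.
assert BreakType(',') is BreakType.PHRASE
assert BreakType(':') is BreakType.SOFT_PHRASE
assert BreakType('-') is BreakType.PART
```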

View File

@@ -27,6 +27,7 @@ PENALTY_TOKENCHANGE = {
qmod.BreakType.START: 0.0,
qmod.BreakType.END: 0.0,
qmod.BreakType.PHRASE: 0.0,
qmod.BreakType.SOFT_PHRASE: 0.0,
qmod.BreakType.WORD: 0.1,
qmod.BreakType.PART: 0.2,
qmod.BreakType.TOKEN: 0.4

View File

@@ -8,4 +8,4 @@
Version information for the Nominatim API.
"""
NOMINATIM_API_VERSION = '4.5.0'
NOMINATIM_API_VERSION = '5.0.0'

View File

@@ -122,13 +122,16 @@ class SetupAll:
LOG.warning('Post-process tables')
with connect(args.config.get_libpq_dsn()) as conn:
conn.autocommit = True
await database_import.create_search_indices(conn, args.config,
drop=args.no_updates,
threads=num_threads)
LOG.warning('Create search index for default country names.')
conn.autocommit = False
country_info.create_country_names(conn, tokenizer,
args.config.get_str_list('LANGUAGES'))
if args.no_updates:
conn.autocommit = True
freeze.drop_update_tables(conn)
tokenizer.finalize_import(args.config)
@@ -183,6 +186,7 @@ class SetupAll:
from ..tools import database_import, refresh
with connect(config.get_libpq_dsn()) as conn:
conn.autocommit = True
LOG.warning('Create functions (1st pass)')
refresh.create_functions(conn, config, False, False)
LOG.warning('Create tables')

View File

@@ -25,6 +25,8 @@ class ICUTokenAnalysis:
def __init__(self, norm_rules: str, trans_rules: str,
analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule']):
# additional break signs are not relevant during name analysis
norm_rules += ";[[:Space:][-:]]+ > ' ';"
self.normalizer = Transliterator.createFromRules("icu_normalization",
norm_rules)
trans_rules += ";[:Space:]+ > ' '"
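
Editor's note (not part of the diff): a small sketch of what the appended normalization rule does in isolation, using the same PyICU call as the surrounding code; `strip_breaks` is just an arbitrary name for the illustration:

```python
from icu import Transliterator

# Fold the break signs '-' and ':' (and any whitespace runs) back into a
# single space, as name analysis does not care about them.
strip = Transliterator.createFromRules("strip_breaks", "[[:Space:][-:]]+ > ' ';")
print(strip.transliterate('main-street:12'))  # -> 'main street 12'
```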

View File

@@ -55,7 +55,7 @@ def parse_version(version: str) -> NominatimVersion:
return NominatimVersion(*[int(x) for x in parts[:2] + parts[2].split('-')])
NOMINATIM_VERSION = parse_version('4.5.0-0')
NOMINATIM_VERSION = parse_version('5.0.0-0')
POSTGRESQL_REQUIRED_VERSION = (12, 0)
POSTGIS_REQUIRED_VERSION = (3, 0)

View File

@@ -267,3 +267,34 @@ Feature: Rank assignment
| object | rank_search | rank_address |
| N23:amenity | 30 | 30 |
| N23:place | 16 | 16 |
Scenario: Address rank 25 is only used for addr:place
Given the grid
| 10 | 33 | 34 | 11 |
Given the places
| osm | class | type | name |
| N10 | place | village | vil |
| N11 | place | farm | farm |
And the places
| osm | class | type | name | geometry |
| W1 | highway | residential | RD | 33,11 |
And the places
| osm | class | type | name | addr+farm | geometry |
| W2 | highway | residential | RD2 | farm | 34,11 |
And the places
| osm | class | type | housenr |
| N33 | place | house | 23 |
And the places
| osm | class | type | housenr | addr+place |
| N34 | place | house | 23 | farm |
When importing
Then placex contains
| object | parent_place_id |
| N11 | N10 |
| N33 | W1 |
| N34 | N11 |
And place_addressline contains
| object | address |
| W1 | N10 |
| W2 | N10 |
| W2 | N11 |

View File

@@ -0,0 +1,34 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for japanese phrase splitting.
"""
from pathlib import Path
import pytest
from icu import Transliterator
import nominatim_api.search.query as qmod
from nominatim_api.query_preprocessing.config import QueryConfig
from nominatim_api.query_preprocessing import split_japanese_phrases
def run_preprocessor_on(query):
proc = split_japanese_phrases.create(QueryConfig().set_normalizer(None))
return proc(query)
@pytest.mark.parametrize('inp,outp', [('大阪府大阪市大阪', '大阪府:大阪市:大阪'),
('大阪府大阪', '大阪府:大阪'),
('大阪市大阪', '大阪市:大阪')])
def test_split_phrases(inp, outp):
query = [qmod.Phrase(qmod.PhraseType.NONE, inp)]
out = run_preprocessor_on(query)
assert out == [qmod.Phrase(qmod.PhraseType.NONE, outp)]

View File

@@ -23,14 +23,10 @@ def test_refresh_import_secondary_importance_non_existing(dsn):
def test_refresh_import_secondary_importance_testdb(dsn, src_dir, temp_db_conn, temp_db_cursor):
temp_db_cursor.execute('CREATE EXTENSION postgis')
temp_db_cursor.execute('CREATE EXTENSION postgis_raster')
assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') == 0
if postgis_version_tuple(temp_db_conn)[0] < 3:
assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') > 0
else:
temp_db_cursor.execute('CREATE EXTENSION postgis_raster')
assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') == 0
assert temp_db_cursor.table_exists('secondary_importance')
assert temp_db_cursor.table_exists('secondary_importance')
@pytest.mark.parametrize("replace", (True, False))
@@ -41,8 +37,7 @@ def test_refresh_import_wikipedia(dsn, src_dir, table_factory, temp_db_cursor, r
# use the small wikipedia file for the API testdb
assert refresh.import_wikipedia_articles(dsn, src_dir / 'test' / 'testdb') == 0
assert temp_db_cursor.table_rows('wikipedia_article') > 0
assert temp_db_cursor.table_rows('wikipedia_redirect') > 0
assert temp_db_cursor.table_rows('wikimedia_importance') > 0
def test_recompute_importance(placex_table, table_factory, temp_db_conn, temp_db_cursor):

Binary file not shown.