Mirror of https://github.com/osm-search/Nominatim.git (synced 2026-02-14 18:37:58 +00:00)

Compare commits: tokenizers...v5.0.0 (25 commits)
Commits:
ee8915f2b6, 5475bf7b9c, 95e2d8c846, 7552818866, db3991af74, 4523b9aaed,
8b1cabebd6, 0cf636a80c, c2cb6722fe, f8337bedb2, efc09a5cfc, 86ad9efa8a,
d984100e23, 499110f549, 267e5dac0d, 32d3eb46d5, c8a0dc8af1, 14ecfc7834,
cad44eb00c, f76dbb0a16, 8dd218a1d0, 501e13483e, b1d25e404f, 71fceb6854,
a06e123d70
Release checklist (docs):

@@ -87,7 +87,6 @@ Checklist for releases:
 * [ ] increase versions in
   * `src/nominatim_api/version.py`
   * `src/nominatim_db/version.py`
-  * CMakeLists.txt
 * [ ] update `ChangeLog` (copy information from patch releases from release branch)
 * [ ] complete `docs/admin/Migration.md`
 * [ ] update EOL dates in `SECURITY.md`
ChangeLog (25 changed lines):

@@ -1,3 +1,28 @@
+5.0.0
+ * increase required versions for PostgreSQL (12+), PostGIS (3.0+)
+ * remove installation via cmake and debundle osm2pgsql
+ * remove deprecated PHP frontend
+ * remove deprecated legacy tokenizer
+ * add configurable pre-processing of queries
+ * add query pre-processor to split up Japanese addresses
+ * rewrite of osm2pgsql style implementation
+   (also adds support for osm2pgsql-themepark)
+ * reduce the number of SQL queries needed to complete a 'lookup' call
+ * improve computation of centroid for lines with only two points
+ * improve bbox output for postcode areas
+ * improve result order by returning the largest object when other things are
+   equal
+ * add fallback for reverse geocoding to default country tables
+ * exclude postcode areas from reverse geocoding
+ * disable search endpoint when database is reverse-only (regression)
+ * minor performance improvements to area split algorithm
+ * switch table and index creation to use autocommit mode to avoid deadlocks
+ * drop overly long ways during import
+ * restrict automatic migrations to versions 4.3+
+ * switch linting from pylint to flake8
+ * switch tests to use a wikimedia test file in the new CSV style
+ * various fixes and improvements to documentation
+
 4.5.0
  * allow building Nominatim as a pip package
  * make osm2pgsql building optional
SECURITY.md:

@@ -9,10 +9,10 @@ versions.
 | Version | End of support for security updates |
 | ------- | ----------------------------------- |
+| 5.0.x   | 2027-02-06                          |
 | 4.5.x   | 2026-09-12                          |
 | 4.4.x   | 2026-03-07                          |
 | 4.3.x   | 2025-09-07                          |
-| 4.2.x   | 2024-11-24                          |

 ## Reporting a Vulnerability
docs/admin/Migration.md:

@@ -9,19 +9,15 @@ the following steps:
 * Update the frontend: `pip install -U nominatim-api`
 * (optionally) Restart updates

-If you are still using CMake for the installation of Nominatim, then you
-need to update the software in one step before migrating the database.
-It is not recommended to do this while the machine is serving requests.
-
 Below you find additional migrations and hints about other structural and
 breaking changes. **Please read them before running the migration.**

 !!! note
     If you are migrating from a version <4.3, you need to install 4.3
-    first and migrate to 4.3 first. Then you can migrate to the current
+    and migrate to 4.3 first. Then you can migrate to the current
     version. It is strongly recommended to do a reimport instead.

-## 4.5.0 -> master
+## 4.5.0 -> 5.0.0

 ### PHP frontend removed
@@ -33,6 +29,42 @@ needed. It currently emits a warning and does otherwise nothing. It will be
 removed in later versions of Nominatim. So make sure you remove it from your
 scripts.

+### CMake building removed
+
+Nominatim can now only be installed via pip. Please follow the installation
+instructions for the current version to change to pip.
+
+### osm2pgsql no longer vendored in
+
+Nominatim no longer ships its own version of osm2pgsql. Please install a
+stock version of osm2pgsql from your distribution. See the
+[installation instructions for osm2pgsql](https://osm2pgsql.org/doc/install.html)
+for details. A minimum version of 1.8 is required. The current stable versions
+of Ubuntu and Debian already ship with an appropriate version. For older
+installations, you may have to compile a newer osm2pgsql yourself.
+
+### Legacy tokenizer removed
+
+The `legacy` tokenizer is no longer available. This tokenizer was superseded
+by the `ICU` tokenizer a long time ago. In the unlikely case that your database
+still uses the `legacy` tokenizer, you must reimport your database.
+
+### osm2pgsql style overhauled
+
+There are some fundamental changes to how customized osm2pgsql styles should
+be written. The changes are mostly backwards compatible, i.e. custom styles
+should still work with the new implementation. The only exception is a
+customization of the `process_tags()` function. This function is no longer
+considered public and neither are the helper functions used in it.
+They currently still work but will be removed at some point. If you have
+been making changes to `process_tags()`, please review your style and try
+to switch to the new convenience functions.
+
+For more information on the changes, see the
+[pull request](https://github.com/osm-search/Nominatim/pull/3615)
+and read the new
+[customization documentation](https://nominatim.org/release-docs/latest/customize/Import-Styles/).
+
 ## 4.4.0 -> 4.5.0

 ### New structure for Python packages
Update documentation:

@@ -68,10 +68,10 @@ the update interval no new data has been published yet, it will go to sleep
 until the next expected update and only then attempt to download the next batch.

 The one-time mode is particularly useful if you want to run updates continuously
-but need to schedule other work in between updates. For example, the main
-service at osm.org uses it, to regularly recompute postcodes -- a process that
-must not be run while updates are in progress. Its update script
-looks like this:
+but need to schedule other work in between updates. For example, you might
+want to regularly recompute postcodes -- a process that
+must not be run while updates are in progress. An update script refreshing
+postcodes regularly might look like this:

 ```sh
 #!/bin/bash
@@ -109,17 +109,19 @@ Unit=nominatim-updates.service
 WantedBy=multi-user.target
 ```

-And then a similar service definition: `/etc/systemd/system/nominatim-updates.service`:
+`OnUnitActiveSec` defines how often the individual update command is run.
+
+Then add a service definition for the timer in `/etc/systemd/system/nominatim-updates.service`:

 ```
 [Unit]
 Description=Single updates of Nominatim

 [Service]
-WorkingDirectory=/srv/nominatim
-ExecStart=nominatim replication --once
-StandardOutput=append:/var/log/nominatim-updates.log
-StandardError=append:/var/log/nominatim-updates.error.log
+WorkingDirectory=/srv/nominatim-project
+ExecStart=/srv/nominatim-venv/bin/nominatim replication --once
+StandardOutput=journald
+StandardError=inherit
 User=nominatim
 Group=nominatim
 Type=simple
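For reference, a matching timer unit in `/etc/systemd/system/nominatim-updates.timer` could look like the following minimal sketch; the interval values are illustrative examples, not part of this changeset:

```
[Unit]
Description=Timer for regular Nominatim updates

[Timer]
OnBootSec=2min
OnUnitActiveSec=2h
Unit=nominatim-updates.service

[Install]
WantedBy=multi-user.target
```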
@@ -128,9 +130,9 @@ Type=simple
 WantedBy=multi-user.target
 ```

-Replace the `WorkingDirectory` with your project directory. Also adapt user and
-group names as required. `OnUnitActiveSec` defines how often the individual
-update command is run.
+Replace the `WorkingDirectory` with your project directory. `ExecStart` points
+to the nominatim binary that was installed in your virtualenv earlier.
+Finally, you might need to adapt user and group names as required.

 Now activate the service and start the updates:
@@ -140,12 +142,13 @@ sudo systemctl enable nominatim-updates.timer
 sudo systemctl start nominatim-updates.timer
 ```

-You can stop future data updates, while allowing any current, in-progress
+You can stop future data updates while allowing any current, in-progress
 update steps to finish, by running `sudo systemctl stop
 nominatim-updates.timer` and waiting until `nominatim-updates.service` isn't
-running (`sudo systemctl is-active nominatim-updates.service`). Current output
-from the update can be seen like above (`systemctl status
-nominatim-updates.service`).
+running (`sudo systemctl is-active nominatim-updates.service`).
+
+To check the output from the update process, use journalctl:
+`journalctl -u nominatim-updates.service`.

 #### Catch-up mode
@@ -155,13 +158,13 @@ all changes from the server until the database is up-to-date. The catch-up mode
 still respects the parameter `NOMINATIM_REPLICATION_MAX_DIFF`. It downloads and
 applies the changes in appropriate batches until all is done.

-The catch-up mode is foremost useful to bring the database up to speed after the
+The catch-up mode is foremost useful to bring the database up to date after the
 initial import. Given that the service usually is not in production at this
 point, you can temporarily be a bit more generous with the batch size and
 number of threads you use for the updates by running catch-up like this:

 ```
-cd /srv/nominatim
+cd /srv/nominatim-project
 NOMINATIM_REPLICATION_MAX_DIFF=5000 nominatim replication --catch-up --threads 15
 ```
@@ -173,13 +176,13 @@ replication catch-up at whatever interval you desire.
 When running scheduled updates with catch-up, it is a good idea to choose
 a replication source with an update frequency that is an order of magnitude
 lower. For example, if you want to update once a day, use an hourly updated
-source. This makes sure that you don't miss an entire day of updates when
+source. This ensures that you don't miss an entire day of updates when
 the source is unexpectedly late to publish its update.

 If you want to use the source with the same update frequency (e.g. a daily
 updated source with daily updates), use the
-continuous update mode. It ensures to re-request the newest update until it
-is published.
+once mode together with a frequently run systemd script as described above.
+This re-requests the newest update until it has been published.

 #### Continuous updates
@@ -197,36 +200,3 @@ parameters:
 The update application keeps running forever and retrieves and applies
 new updates from the server as they are published.
-
-You can run this command as a simple systemd service. Create a service
-description like that in `/etc/systemd/system/nominatim-updates.service`:
-
-```
-[Unit]
-Description=Continuous updates of Nominatim
-
-[Service]
-WorkingDirectory=/srv/nominatim
-ExecStart=nominatim replication
-StandardOutput=append:/var/log/nominatim-updates.log
-StandardError=append:/var/log/nominatim-updates.error.log
-User=nominatim
-Group=nominatim
-Type=simple
-
-[Install]
-WantedBy=multi-user.target
-```
-
-Replace the `WorkingDirectory` with your project directory. Also adapt user
-and group names as required.
-
-Now activate the service and start the updates:
-
-```
-sudo systemctl daemon-reload
-sudo systemctl enable nominatim-updates
-sudo systemctl start nominatim-updates
-```
Import style documentation:

@@ -60,7 +60,7 @@ The _main tags_ classify what kind of place the OSM object represents. One
 OSM object can have more than one main tag. In such case one database entry
 is created for each main tag. _Name tags_ represent searchable names of the
 place. _Address tags_ are used to compute the address hierarchy of the place.
-Address are used for searching and for creating a display name of the place.
+Address tags are used for searching and for creating a display name of the place.
 _Extra tags_ are any tags that are not directly related to search but
 contain interesting additional information.
@@ -76,7 +76,7 @@ in which category.
 The flex style offers a number of functions to set the classification of
 each OSM tag. Most of these functions can also take a preset string instead
-of a tag descriptions. These presets describe common configurations that
+of a tag description. These presets describe common configurations that
 are also used in the definition of the predefined styles. This section
 lists the configuration functions and the accepted presets.
@@ -95,7 +95,7 @@ Any other string is matched exactly against tag keys.
 takes a lua table parameter which defines for keys and key/value
 combinations, how they are classified.

-The following classifications are recognised:
+The following classifications are recognized:

 | classification | meaning |
 | :-------------- | :------ |
@@ -133,7 +133,7 @@ the same.
 In this example an object with a `boundary` tag will only be included
 when it has a value of `administrative`. Objects with `highway` tags are
 always included with two exceptions: the troll tag `highway=no` is
-deleted on the spot and when the value is `street_lamp` then the object
+deleted on the spot. And when the value is `street_lamp` then the object
 must have a name, too. Finally, if a `landuse` tag is present then
 it will be used independently of the concrete value when neither boundary
 nor highway tags were found and the object is named.
@@ -143,7 +143,7 @@ the same.
 | Name | Description |
 | :----- | :---------- |
 | admin | Basic tag set collecting places and administrative boundaries. This set is also needed to ensure proper address computation and should therefore always be present. You can disable selected place types like `place=locality` after adding this set, if they are not relevant for your use case. |
-| all_boundaries | Extends the set of recognised boundaries and places to all available ones. |
+| all_boundaries | Extends the set of recognized boundaries and places to all available ones. |
 | natural | Tags for natural features like rivers and mountain peaks. |
 | street/default | Tags for streets. Major streets are always included, minor ones only when they have a name. |
 | street/car | Tags for all streets that can be used by a motor vehicle. |
@@ -229,7 +229,7 @@ in turn take precedence over prefix matches.
 | Name | Description |
 | :----- | :---------- |
 | metatags | Tags with meta information about the OSM tag like source, notes and import sources. |
-| name | Non-names that describe in fact properties or name parts. These names can throw off search and should always be removed. |
+| name | Non-names that actually describe properties or name parts. These names can throw off search and should always be removed. |
 | address | Extra `addr:*` tags that are not useful for Nominatim. |
@@ -305,7 +305,7 @@ the database independently of the presence of other main tags.
 `set_name_tags()` overwrites the current configuration, while
 `modify_name_tags()` replaces the fields that are given. (Be aware that
 the fields are replaced as a whole. `main = {'foo_name'}` will cause
-`foo_name` to become the only recognised primary name. Any previously
+`foo_name` to become the only recognized primary name. Any previously
 defined primary names are forgotten.)

 !!! example
@@ -326,9 +326,9 @@ defined primary names are forgotten.)
 | Name | Description |
 | :----- | :---------- |
-| core | Basic set of recognised names for all places. |
+| core | Basic set of recognized names for all places. |
 | address | Additional names useful when indexing full addresses. |
-| poi | Extended set of recognised names for pois. Use on top of the core set. |
+| poi | Extended set of recognized names for pois. Use on top of the core set. |

 ### Address tags
@@ -376,8 +376,8 @@ the fields are replaced as a whole.)
 | Name | Description |
 | :----- | :---------- |
-| core | Basic set of tags needed to recognise address relationship for any place. Always include this. |
-| houses | Additional set of tags needed to recognise proper addresses |
+| core | Basic set of tags needed to recognize address relationship for any place. Always include this. |
+| houses | Additional set of tags needed to recognize proper addresses. |

 ### Handling of unclassified tags
@@ -514,7 +514,7 @@ Themepark topics offer two configuration options:
 The customization functions described in the
 [Changing recognized tags](#changing-the-recognized-tags) section
-are available from the theme. To access the theme you need to explicitly initialise it.
+are available from the theme. To access the theme you need to explicitly initialize it.

 !!! Example
     ``` lua
@@ -568,7 +568,7 @@ gazetteer output.
 ## Changing the style of existing databases

-There is normally no issue changing the style of a database that is already
+There is usually no issue changing the style of a database that is already
 imported and now kept up-to-date with change files. Just be aware that any
 change in the style applies to updates only. If you want to change the data
 that is already in the database, then a reimport is necessary.
Settings documentation:

@@ -336,7 +336,7 @@ NOMINATIM_TABLESPACE_SEARCH_INDEX
 NOMINATIM_TABLESPACE_OSM_DATA
 : Raw OSM data cache used for import and updates.

-NOMINATIM_TABLESPACE_OSM_DATA
+NOMINATIM_TABLESPACE_OSM_INDEX
 : Indexes on the raw OSM data cache.

 NOMINATIM_TABLESPACE_PLACE_DATA
Tokenizer documentation:

@@ -50,7 +50,7 @@ queries. This happens in two stages:
    as during the import process but may involve other processing like,
    for example, word break detection.
 2. The **token analysis** step breaks down the query parts into tokens,
-   looks them up in the database and assignes them possible functions and
+   looks them up in the database and assigns them possible functions and
    probabilities.

 Query processing can be further customized while the rest of the analysis
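The preprocessing hook described here has a small surface. As a sketch, modeled on the `split_japanese_phrases` module added later in this changeset (the `QueryConfig`, `QueryProcessingFunc` and `Phrase` imports are the ones that module uses; the whitespace-stripping step itself is a made-up example), a custom step looks roughly like this:

```python
from typing import List

from nominatim_api.query_preprocessing.config import QueryConfig
from nominatim_api.query_preprocessing.base import QueryProcessingFunc
from nominatim_api.search.query import Phrase


class _StripWhitespace:
    """ Illustrative preprocessor: trim each phrase, drop empty ones. """

    def __init__(self, config: QueryConfig) -> None:
        self.config = config

    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
        # A preprocessor receives the list of query phrases and must
        # return the (possibly rewritten) list.
        return [Phrase(p.ptype, stripped) for p in phrases
                if (stripped := p.text.strip())]


def create(config: QueryConfig) -> QueryProcessingFunc:
    """ Entry point called for each configured preprocessing step. """
    return _StripWhitespace(config)
```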
osm2pgsql flex style (Lua):

@@ -425,7 +427,6 @@ function Place:write_row(k, v)
     if self.geometry == nil then
         self.geometry = self.geom_func(self.object)
     end
-    if self.geometry:is_null() then
+    if self.geometry == nil or self.geometry:is_null() then
         return 0
     end
@@ -608,6 +608,9 @@ function module.process_way(object)
     if geom:is_null() then
         geom = o:as_linestring()
+        if geom:is_null() or geom:length() > 30 then
+            return nil
+        end
     end

     return geom
Partition functions (SQL):

@@ -17,28 +17,6 @@ CREATE TYPE nearfeaturecentr AS (
   centroid GEOMETRY
 );

--- feature intersects geometry
--- for areas and linestrings they must touch at least along a line
-CREATE OR REPLACE FUNCTION is_relevant_geometry(de9im TEXT, geom_type TEXT)
-RETURNS BOOLEAN
-AS $$
-BEGIN
-  IF substring(de9im from 1 for 2) != 'FF' THEN
-    RETURN TRUE;
-  END IF;
-
-  IF geom_type = 'ST_Point' THEN
-    RETURN substring(de9im from 4 for 1) = '0';
-  END IF;
-
-  IF geom_type in ('ST_LineString', 'ST_MultiLineString') THEN
-    RETURN substring(de9im from 4 for 1) = '1';
-  END IF;
-
-  RETURN substring(de9im from 4 for 1) = '2';
-END
-$$ LANGUAGE plpgsql IMMUTABLE;
-
 CREATE OR REPLACE function getNearFeatures(in_partition INTEGER, feature GEOMETRY,
                                            feature_centroid GEOMETRY,
                                            maxrank INTEGER)
@@ -59,7 +37,12 @@ BEGIN
       isguess, postcode, centroid
     FROM location_area_large_{{ partition }}
     WHERE geometry && feature
-      AND is_relevant_geometry(ST_Relate(geometry, feature), ST_GeometryType(feature))
+      AND CASE WHEN ST_Dimension(feature) = 0
+               THEN _ST_Covers(geometry, feature)
+               WHEN ST_Dimension(feature) = 2
+               THEN ST_Relate(geometry, feature, 'T********')
+               ELSE ST_NPoints(ST_Intersection(geometry, feature)) > 1
+          END
       AND rank_address < maxrank
       -- Postcodes currently still use rank_search to define for which
      -- features they are relevant.
@@ -142,14 +125,16 @@ BEGIN
   IF in_rank_search <= 4 and not in_estimate THEN
     INSERT INTO location_area_country (place_id, country_code, geometry)
-      values (in_place_id, in_country_code, in_geometry);
+      (SELECT in_place_id, in_country_code, geom
+         FROM split_geometry(in_geometry) as geom);
     RETURN TRUE;
   END IF;

 {% for partition in db.partitions %}
   IF in_partition = {{ partition }} THEN
     INSERT INTO location_area_large_{{ partition }} (partition, place_id, country_code, keywords, rank_search, rank_address, isguess, postcode, centroid, geometry)
-      values (in_partition, in_place_id, in_country_code, in_keywords, in_rank_search, in_rank_address, in_estimate, postcode, in_centroid, in_geometry);
+      (SELECT in_partition, in_place_id, in_country_code, in_keywords, in_rank_search, in_rank_address, in_estimate, postcode, in_centroid, geom
+         FROM split_geometry(in_geometry) as geom);
     RETURN TRUE;
   END IF;
 {% endfor %}
SQL utility functions:

@@ -348,8 +348,6 @@ CREATE OR REPLACE FUNCTION add_location(place_id BIGINT, country_code varchar(2)
 RETURNS BOOLEAN
 AS $$
 DECLARE
-  locationid INTEGER;
-  secgeo GEOMETRY;
   postcode TEXT;
 BEGIN
   PERFORM deleteLocationArea(partition, place_id, rank_search);

@@ -360,18 +358,19 @@ BEGIN
     postcode := upper(trim (in_postcode));
   END IF;

-  IF ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon') THEN
-    FOR secgeo IN select split_geometry(geometry) AS geom LOOP
-      PERFORM insertLocationAreaLarge(partition, place_id, country_code, keywords, rank_search, rank_address, false, postcode, centroid, secgeo);
-    END LOOP;
-
-  ELSEIF ST_GeometryType(geometry) = 'ST_Point' THEN
-    secgeo := place_node_fuzzy_area(geometry, rank_search);
-    PERFORM insertLocationAreaLarge(partition, place_id, country_code, keywords, rank_search, rank_address, true, postcode, centroid, secgeo);
-
+  IF ST_Dimension(geometry) = 2 THEN
+    RETURN insertLocationAreaLarge(partition, place_id, country_code, keywords,
+                                   rank_search, rank_address, false, postcode,
+                                   centroid, geometry);
   END IF;

-  RETURN true;
+  IF ST_Dimension(geometry) = 0 THEN
+    RETURN insertLocationAreaLarge(partition, place_id, country_code, keywords,
+                                   rank_search, rank_address, true, postcode,
+                                   centroid, place_node_fuzzy_area(geometry, rank_search));
+  END IF;
+
+  RETURN false;
 END;
 $$
 LANGUAGE plpgsql;
@@ -394,19 +393,21 @@ DECLARE
   geo RECORD;
   area FLOAT;
   remainingdepth INTEGER;
-  added INTEGER;
 BEGIN

   -- RAISE WARNING 'quad_split_geometry: maxarea=%, depth=%',maxarea,maxdepth;

-  IF (ST_GeometryType(geometry) not in ('ST_Polygon','ST_MultiPolygon') OR NOT ST_IsValid(geometry)) THEN
+  IF not ST_IsValid(geometry) THEN
+    RETURN;
+  END IF;
+
+  IF ST_Dimension(geometry) != 2 OR maxdepth <= 1 THEN
     RETURN NEXT geometry;
     RETURN;
   END IF;

   remainingdepth := maxdepth - 1;
   area := ST_AREA(geometry);
-  IF remainingdepth < 1 OR area < maxarea THEN
+  IF area < maxarea THEN
     RETURN NEXT geometry;
     RETURN;
   END IF;
@@ -426,7 +427,6 @@ BEGIN
   xmid := (xmin+xmax)/2;
   ymid := (ymin+ymax)/2;

-  added := 0;
   FOR seg IN 1..4 LOOP

     IF seg = 1 THEN
@@ -442,16 +442,13 @@ BEGIN
       secbox := ST_SetSRID(ST_MakeBox2D(ST_Point(xmid,ymid),ST_Point(xmax,ymax)),4326);
     END IF;

-    IF st_intersects(geometry, secbox) THEN
-      secgeo := st_intersection(geometry, secbox);
-      IF NOT ST_IsEmpty(secgeo) AND ST_GeometryType(secgeo) in ('ST_Polygon','ST_MultiPolygon') THEN
-        FOR geo IN select quad_split_geometry(secgeo, maxarea, remainingdepth) as geom LOOP
-          IF NOT ST_IsEmpty(geo.geom) AND ST_GeometryType(geo.geom) in ('ST_Polygon','ST_MultiPolygon') THEN
-            added := added + 1;
-            RETURN NEXT geo.geom;
-          END IF;
-        END LOOP;
-      END IF;
+    secgeo := st_intersection(geometry, secbox);
+    IF NOT ST_IsEmpty(secgeo) AND ST_Dimension(secgeo) = 2 THEN
+      FOR geo IN SELECT quad_split_geometry(secgeo, maxarea, remainingdepth) as geom LOOP
+        IF NOT ST_IsEmpty(geo.geom) AND ST_Dimension(geo.geom) = 2 THEN
+          RETURN NEXT geo.geom;
+        END IF;
+      END LOOP;
     END IF;
   END LOOP;
@@ -467,10 +464,22 @@ CREATE OR REPLACE FUNCTION split_geometry(geometry GEOMETRY)
 DECLARE
   geo RECORD;
 BEGIN
-  -- 10000000000 is ~~ 1x1 degree
-  FOR geo IN select quad_split_geometry(geometry, 0.25, 20) as geom LOOP
-    RETURN NEXT geo.geom;
-  END LOOP;
+  IF ST_GeometryType(geometry) = 'ST_MultiPolygon'
+     and ST_Area(geometry) * 10 > ST_Area(Box2D(geometry))
+  THEN
+    FOR geo IN
+      SELECT quad_split_geometry(g, 0.25, 20) as geom
+        FROM (SELECT (ST_Dump(geometry)).geom::geometry(Polygon, 4326) AS g) xx
+    LOOP
+      RETURN NEXT geo.geom;
+    END LOOP;
+  ELSE
+    FOR geo IN
+      SELECT quad_split_geometry(geometry, 0.25, 20) as geom
+    LOOP
+      RETURN NEXT geo.geom;
+    END LOOP;
+  END IF;
   RETURN;
 END;
 $$
Address rank configuration (address-levels.json):

@@ -23,8 +23,8 @@
   "allotments" : 22,
   "neighbourhood" : [20, 22],
   "quarter" : [20, 22],
-  "isolated_dwelling" : [22, 20],
-  "farm" : [22, 20],
+  "isolated_dwelling" : [22, 25],
+  "farm" : [22, 25],
   "city_block" : 25,
   "mountain_pass" : 25,
   "square" : 25,
ICU tokenizer configuration (settings/icu_tokenizer.yaml):

@@ -1,4 +1,5 @@
 query-preprocessing:
+    - step: split_japanese_phrases
     - step: normalize
 normalization:
     - ":: lower ()"

@@ -9,16 +10,17 @@ normalization:
     - "'nº' > 'no'"
     - "ª > a"
     - "º > o"
-    - "[[:Punctuation:][:Symbol:]\u02bc] > ' '"
+    - "[[:Punctuation:][:Symbol:][\u02bc] - [-:]]+ > '-'"
     - "ß > 'ss'" # German szet is unambiguously equal to double ss
-    - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >"
+    - "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:] [-:]] >"
     - "[:Lm:] >"
     - ":: [[:Number:]] Latin ()"
     - ":: [[:Number:]] Ascii ();"
     - ":: [[:Number:]] NFD ();"
     - "[[:Nonspacing Mark:] [:Cf:]] >;"
-    - "[:Space:]+ > ' '"
+    - "[-:]?[:Space:]+[-:]? > ' '"
 transliteration:
+    - "[-:] > ' '"
     - ":: Latin ()"
     - !include icu-rules/extended-unicode-to-asccii.yaml
     - ":: Ascii ()"
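To get a feel for what the new rules do, the normalization can be exercised standalone with PyICU (a sketch; the rule string below is a hand-picked subset of the list above and the exact output depends on the ICU version):

```python
from icu import Transliterator

# Subset of the normalization rules above: lowercase, turn punctuation
# other than '-' and ':' into '-', and collapse spaces around '-'/':'.
rules = (
    ":: lower ();"
    " [[:Punctuation:][:Symbol:][\u02bc] - [-:]]+ > '-';"
    " [-:]?[:Space:]+[-:]? > ' ';"
)
norm = Transliterator.createFromRules("norm-demo", rules)

for text in ("Main St. 5", "Tokyo: Shibuya", "foo - bar"):
    print(repr(text), "->", repr(norm.transliterate(text)))
```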
New file `split_japanese_phrases.py` in the `nominatim_api.query_preprocessing` package (@@ -0,0 +1,61 @@):

```python
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
This module divides Japanese addresses into three categories:
prefecture, municipality, and other.

The division is not strict; it is a simple split based on these keywords.
"""
from typing import List
import re

from .config import QueryConfig
from .base import QueryProcessingFunc
from ..search.query import Phrase

MATCH_PATTERNS = [
    r'''
        (...??[都都道府県縣])   # [group1] prefecture
        (.+?[市区區町村])       # [group2] municipalities (city/wards/towns/villages)
        (.+)                    # [group3] other words
    ''',
    r'''
        (...??[都都道府県縣])   # [group1] prefecture
        (.+)                    # [group3] other words
    ''',
    r'''
        (.+?[市区區町村])       # [group2] municipalities (city/wards/towns/villages)
        (.+)                    # [group3] other words
    '''
]


class _JapanesePreprocessing:

    def __init__(self, config: QueryConfig) -> None:
        self.config = config

    def split_phrase(self, phrase: Phrase) -> Phrase:
        """ Divide the given text using the first matching
            regular expression.
        """
        for pattern in MATCH_PATTERNS:
            result = re.match(pattern, phrase.text, re.VERBOSE)
            if result is not None:
                return Phrase(phrase.ptype, ':'.join(result.groups()))

        return phrase

    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
        """ Split Japanese addresses in the given phrases.
        """
        return [self.split_phrase(p) for p in phrases]


def create(config: QueryConfig) -> QueryProcessingFunc:
    """ Create a function for Japanese preprocessing.
    """
    return _JapanesePreprocessing(config)
```
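A usage sketch, mirroring the unit test added further down in this changeset:

```python
import nominatim_api.search.query as qmod
from nominatim_api.query_preprocessing.config import QueryConfig
from nominatim_api.query_preprocessing import split_japanese_phrases

proc = split_japanese_phrases.create(QueryConfig().set_normalizer(None))
out = proc([qmod.Phrase(qmod.PhraseType.NONE, '大阪府大阪市大阪')])
# out[0].text == '大阪府:大阪市:大阪' -- ':' marks the soft phrase breaks
```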
Penalty table for word changes:

@@ -433,6 +433,7 @@ PENALTY_WORDCHANGE = {
     BreakType.START: 0.0,
     BreakType.END: 0.0,
     BreakType.PHRASE: 0.0,
+    BreakType.SOFT_PHRASE: 0.0,
     BreakType.WORD: 0.1,
     BreakType.PART: 0.2,
     BreakType.TOKEN: 0.4
Forward geocoder result matching:

@@ -133,7 +133,7 @@ class ForwardGeocoder:
         """
         assert self.query_analyzer is not None
         qwords = [word for phrase in query.source
-                  for word in re.split('[, ]+', phrase.text) if word]
+                  for word in re.split('[-,: ]+', phrase.text) if word]
         if not qwords:
             return

@@ -146,7 +146,7 @@ class ForwardGeocoder:
             distance = 0.0
             norm = self.query_analyzer.normalize_text(' '.join((result.display_name,
                                                                 result.country_code or '')))
-            words = set((w for w in norm.split(' ') if w))
+            words = set((w for w in re.split('[-,: ]+', norm) if w))
             if not words:
                 continue
             for qword in qwords:
src/nominatim_api/search/icu_tokenizer.py:

@@ -7,10 +7,12 @@
 """
 Implementation of query analysis for the ICU tokenizer.
 """
-from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
+from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
 from collections import defaultdict
 import dataclasses
 import difflib
+import re
+from itertools import zip_longest

 from icu import Transliterator

@@ -34,17 +36,30 @@ DB_TO_TOKEN_TYPE = {
     'C': qmod.TokenType.COUNTRY
 }

-class QueryPart(NamedTuple):
+PENALTY_IN_TOKEN_BREAK = {
+    qmod.BreakType.START: 0.5,
+    qmod.BreakType.END: 0.5,
+    qmod.BreakType.PHRASE: 0.5,
+    qmod.BreakType.SOFT_PHRASE: 0.5,
+    qmod.BreakType.WORD: 0.1,
+    qmod.BreakType.PART: 0.0,
+    qmod.BreakType.TOKEN: 0.0
+}
+
+
+@dataclasses.dataclass
+class QueryPart:
     """ Normalized and transliterated form of a single term in the query.
         When the term came out of a split during the transliteration,
         the normalized string is the full word before transliteration.
         The word number keeps track of the word before transliteration
         and can be used to identify partial transliterated terms.
+        Penalty is the break penalty for the break following the token.
     """
     token: str
     normalized: str
     word_number: int
+    penalty: float


 QueryParts = List[QueryPart]
@@ -58,10 +73,12 @@ def yield_words(terms: List[QueryPart], start: int) -> Iterator[Tuple[str, qmod.
     total = len(terms)
     for first in range(start, total):
         word = terms[first].token
-        yield word, qmod.TokenRange(first, first + 1)
+        penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType.WORD]
+        yield word, qmod.TokenRange(first, first + 1, penalty=penalty)
         for last in range(first + 1, min(first + 20, total)):
             word = ' '.join((word, terms[last].token))
-            yield word, qmod.TokenRange(first, last + 1)
+            penalty += terms[last - 1].penalty
+            yield word, qmod.TokenRange(first, last + 1, penalty=penalty)


 @dataclasses.dataclass
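The accumulation is easiest to see in isolation. A standalone mock of the loop above (the `Part` class and the numbers merely stand in for `QueryPart` and `PENALTY_IN_TOKEN_BREAK`):

```python
from dataclasses import dataclass

@dataclass
class Part:
    token: str
    penalty: float      # break penalty for the break *after* this term

parts = [Part('rue', 0.1), Part('de', 0.1), Part('paris', 0.5)]

word = parts[0].token
penalty = 0.1           # stands in for PENALTY_IN_TOKEN_BREAK[BreakType.WORD]
for last in range(1, len(parts)):
    word = ' '.join((word, parts[last].token))
    penalty += parts[last - 1].penalty
    print(word, round(penalty, 2))   # 'rue de' 0.2, then 'rue de paris' 0.3
```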
@@ -94,25 +111,25 @@ class ICUToken(qmod.Token):
         self.penalty += (distance/len(self.lookup_word))

     @staticmethod
-    def from_db_row(row: SaRow) -> 'ICUToken':
+    def from_db_row(row: SaRow, base_penalty: float = 0.0) -> 'ICUToken':
         """ Create an ICUToken from the row of the word table.
         """
         count = 1 if row.info is None else row.info.get('count', 1)
         addr_count = 1 if row.info is None else row.info.get('addr_count', 1)

-        penalty = 0.0
+        penalty = base_penalty
         if row.type == 'w':
-            penalty = 0.3
+            penalty += 0.3
         elif row.type == 'W':
             if len(row.word_token) == 1 and row.word_token == row.word:
-                penalty = 0.2 if row.word.isdigit() else 0.3
+                penalty += 0.2 if row.word.isdigit() else 0.3
         elif row.type == 'H':
-            penalty = sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit())
+            penalty += sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit())
             if all(not c.isdigit() for c in row.word_token):
                 penalty += 0.2 * (len(row.word_token) - 1)
         elif row.type == 'C':
             if len(row.word_token) == 1:
-                penalty = 0.3
+                penalty += 0.3

         if row.info is None:
             lookup_word = row.word
@@ -202,7 +219,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         for row in await self.lookup_in_db(list(words.keys())):
             for trange in words[row.word_token]:
-                token = ICUToken.from_db_row(row)
+                token = ICUToken.from_db_row(row, trange.penalty or 0.0)
                 if row.type == 'S':
                     if row.info['op'] in ('in', 'near'):
                         if trange.start == 0:
@@ -242,16 +259,24 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         wordnr = 0
         for phrase in query.source:
             query.nodes[-1].ptype = phrase.ptype
-            for word in phrase.text.split(' '):
+            phrase_split = re.split('([ :-])', phrase.text)
+            # The zip construct will give us the pairs of word/break from
+            # the regular expression split. As the split array ends on the
+            # final word, we simply use the fillvalue to even out the list and
+            # add the phrase break at the end.
+            for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue=','):
+                if not word:
+                    continue
                 trans = self.transliterator.transliterate(word)
                 if trans:
                     for term in trans.split(' '):
                         if term:
-                            parts.append(QueryPart(term, word, wordnr))
+                            parts.append(QueryPart(term, word, wordnr,
+                                                   PENALTY_IN_TOKEN_BREAK[qmod.BreakType.TOKEN]))
                             query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
-                    query.nodes[-1].btype = qmod.BreakType.WORD
+                    query.nodes[-1].btype = qmod.BreakType(breakchar)
+                    parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType(breakchar)]
                 wordnr += 1
-            query.nodes[-1].btype = qmod.BreakType.PHRASE

             for word, wrange in yield_words(parts, phrase_start):
                 words[word].append(wrange)
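The pairing idiom in the new loop is compact; a standalone demonstration:

```python
import re
from itertools import zip_longest

phrase_split = re.split('([ :-])', 'hauptstr 34:hamburg')
# -> ['hauptstr', ' ', '34', ':', 'hamburg']
# The capturing group keeps the separators in the result.

for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue=','):
    print((word, breakchar))
# ('hauptstr', ' '), ('34', ':'), ('hamburg', ',')  -- the fillvalue
# supplies the trailing phrase break for the last word.
```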
@@ -272,7 +297,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         """ Add tokens to query that are not saved in the database.
         """
         for part, node, i in zip(parts, query.nodes, range(1000)):
-            if len(part.token) <= 4 and part[0].isdigit()\
+            if len(part.token) <= 4 and part.token.isdigit()\
                and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
                 query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
                                 ICUToken(penalty=0.5, token=0,
src/nominatim_api/search/query.py:

@@ -21,7 +21,13 @@ class BreakType(enum.Enum):
     END = '>'
     """ End of the query. """
     PHRASE = ','
-    """ Break between two phrases. """
+    """ Hard break between two phrases. Address parts cannot cross hard
+        phrase boundaries."""
+    SOFT_PHRASE = ':'
+    """ Likely break between two phrases. Address parts should not cross soft
+        phrase boundaries. Soft breaks can be inserted by a preprocessor
+        that is analysing the input string.
+    """
     WORD = ' '
     """ Break between words. """
     PART = '-'
@@ -116,6 +122,7 @@ class TokenRange:
     """
     start: int
     end: int
+    penalty: Optional[float] = None

     def __lt__(self, other: 'TokenRange') -> bool:
         return self.end <= other.start
Penalty table for token changes:

@@ -27,6 +27,7 @@ PENALTY_TOKENCHANGE = {
     qmod.BreakType.START: 0.0,
     qmod.BreakType.END: 0.0,
     qmod.BreakType.PHRASE: 0.0,
+    qmod.BreakType.SOFT_PHRASE: 0.0,
     qmod.BreakType.WORD: 0.1,
     qmod.BreakType.PART: 0.2,
     qmod.BreakType.TOKEN: 0.4
src/nominatim_api/version.py:

@@ -8,4 +8,4 @@
 Version information for the Nominatim API.
 """

-NOMINATIM_API_VERSION = '4.5.0'
+NOMINATIM_API_VERSION = '5.0.0'
Import command (SetupAll):

@@ -122,13 +122,16 @@ class SetupAll:
         LOG.warning('Post-process tables')
         with connect(args.config.get_libpq_dsn()) as conn:
+            conn.autocommit = True
             await database_import.create_search_indices(conn, args.config,
                                                         drop=args.no_updates,
                                                         threads=num_threads)
             LOG.warning('Create search index for default country names.')
+            conn.autocommit = False
             country_info.create_country_names(conn, tokenizer,
                                               args.config.get_str_list('LANGUAGES'))
             if args.no_updates:
+                conn.autocommit = True
                 freeze.drop_update_tables(conn)
         tokenizer.finalize_import(args.config)
@@ -183,6 +186,7 @@ class SetupAll:
         from ..tools import database_import, refresh

         with connect(config.get_libpq_dsn()) as conn:
+            conn.autocommit = True
             LOG.warning('Create functions (1st pass)')
             refresh.create_functions(conn, config, False, False)
             LOG.warning('Create tables')
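These hunks implement the ChangeLog entry about switching table and index creation to autocommit mode. Outside of Nominatim's own connection helper, the same pattern looks roughly like this with psycopg (a sketch; DSN, table and index names are made up for illustration):

```python
import psycopg

with psycopg.connect("dbname=nominatim") as conn:
    conn.autocommit = True
    with conn.cursor() as cur:
        # Each statement commits on its own, so no long-running
        # transaction holds locks while indexes are built.
        cur.execute("CREATE INDEX IF NOT EXISTS demo_idx"
                    " ON placex (rank_address)")
```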
ICU token analysis:

@@ -25,6 +25,8 @@ class ICUTokenAnalysis:
     def __init__(self, norm_rules: str, trans_rules: str,
                  analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule']):
+        # additional break signs are not relevant during name analysis
+        norm_rules += ";[[:Space:][-:]]+ > ' ';"
         self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                          norm_rules)
         trans_rules += ";[:Space:]+ > ' '"
src/nominatim_db/version.py:

@@ -55,7 +55,7 @@ def parse_version(version: str) -> NominatimVersion:
     return NominatimVersion(*[int(x) for x in parts[:2] + parts[2].split('-')])


-NOMINATIM_VERSION = parse_version('4.5.0-0')
+NOMINATIM_VERSION = parse_version('5.0.0-0')

 POSTGRESQL_REQUIRED_VERSION = (12, 0)
 POSTGIS_REQUIRED_VERSION = (3, 0)
BDD test for rank assignment:

@@ -267,3 +267,34 @@ Feature: Rank assignment
       | object      | rank_search | rank_address |
       | N23:amenity | 30          | 30           |
       | N23:place   | 16          | 16           |
+
+    Scenario: Address rank 25 is only used for addr:place
+        Given the grid
+            | 10 | 33 | 34 | 11 |
+        Given the places
+            | osm | class | type    | name |
+            | N10 | place | village | vil  |
+            | N11 | place | farm    | farm |
+        And the places
+            | osm | class   | type        | name | geometry |
+            | W1  | highway | residential | RD   | 33,11    |
+        And the places
+            | osm | class   | type        | name | addr+farm | geometry |
+            | W2  | highway | residential | RD2  | farm      | 34,11    |
+        And the places
+            | osm | class | type  | housenr |
+            | N33 | place | house | 23      |
+        And the places
+            | osm | class | type  | housenr | addr+place |
+            | N34 | place | house | 23      | farm       |
+        When importing
+        Then placex contains
+            | object | parent_place_id |
+            | N11    | N10             |
+            | N33    | W1              |
+            | N34    | N11             |
+        And place_addressline contains
+            | object | address |
+            | W1     | N10     |
+            | W2     | N10     |
+            | W2     | N11     |
New test file for Japanese phrase splitting (@@ -0,0 +1,34 @@):

```python
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for japanese phrase splitting.
"""
from pathlib import Path

import pytest

from icu import Transliterator

import nominatim_api.search.query as qmod
from nominatim_api.query_preprocessing.config import QueryConfig
from nominatim_api.query_preprocessing import split_japanese_phrases


def run_preprocessor_on(query):
    proc = split_japanese_phrases.create(QueryConfig().set_normalizer(None))

    return proc(query)


@pytest.mark.parametrize('inp,outp', [('大阪府大阪市大阪', '大阪府:大阪市:大阪'),
                                      ('大阪府大阪', '大阪府:大阪'),
                                      ('大阪市大阪', '大阪市:大阪')])
def test_split_phrases(inp, outp):
    query = [qmod.Phrase(qmod.PhraseType.NONE, inp)]

    out = run_preprocessor_on(query)

    assert out == [qmod.Phrase(qmod.PhraseType.NONE, outp)]
```
Tests for data refresh functions:

@@ -23,14 +23,10 @@ def test_refresh_import_secondary_importance_non_existing(dsn):
 def test_refresh_import_secondary_importance_testdb(dsn, src_dir, temp_db_conn, temp_db_cursor):
     temp_db_cursor.execute('CREATE EXTENSION postgis')
+    temp_db_cursor.execute('CREATE EXTENSION postgis_raster')
+    assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') == 0

-    if postgis_version_tuple(temp_db_conn)[0] < 3:
-        assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') > 0
-    else:
-        temp_db_cursor.execute('CREATE EXTENSION postgis_raster')
-        assert refresh.import_secondary_importance(dsn, src_dir / 'test' / 'testdb') == 0
-
-        assert temp_db_cursor.table_exists('secondary_importance')
+    assert temp_db_cursor.table_exists('secondary_importance')


 @pytest.mark.parametrize("replace", (True, False))

@@ -41,8 +37,7 @@ def test_refresh_import_wikipedia(dsn, src_dir, table_factory, temp_db_cursor, r
     # use the small wikipedia file for the API testdb
     assert refresh.import_wikipedia_articles(dsn, src_dir / 'test' / 'testdb') == 0

-    assert temp_db_cursor.table_rows('wikipedia_article') > 0
-    assert temp_db_cursor.table_rows('wikipedia_redirect') > 0
+    assert temp_db_cursor.table_rows('wikimedia_importance') > 0


 def test_recompute_importance(placex_table, table_factory, temp_db_conn, temp_db_cursor):
New binary file: test/testdb/wikimedia-importance.csv.gz (binary file not shown)