forked from hans/Nominatim

Compare commits: helm-chart...v4.0.2 (175 commits)
@@ -7,6 +7,8 @@ assignees: ''

---

<!-- Note: this template is for reporting problems with searching. If you have found an issue with the data, you need to report/fix the issue directly in OpenStreetMap. See https://www.openstreetmap.org/fixthemap for details. -->

## What did you search for?

<!-- Please try to provide a link to your search. You can go to https://nominatim.openstreetmap.org and repeat your search there. If you originally found the issue somewhere else, please tell us what software/website you were using. -->

@@ -15,11 +17,11 @@ assignees: ''

## What result did you expect?

**Is the result in the right place and just named wrongly?**
**When the result in the right place and just named wrongly:**

<!-- Please tell us the display name you expected. -->

**Is the result missing completely?**
**When the result missing completely:**

<!-- Make sure that the data you are looking for is in OpenStreetMap. Provide a link to the OpenStreetMap object or if you cannot get it, a link to the map on https://openstreetmap.org where you expect the result to be.
.github/workflows/ci-tests.yml (vendored, 237 lines changed)
@@ -3,7 +3,38 @@ name: CI Tests

on: [ push, pull_request ]

jobs:
  create-archive:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v2
        with:
          submodules: true

      - uses: actions/cache@v2
        with:
          path: |
            data/country_osm_grid.sql.gz
          key: nominatim-country-data-1

      - name: Package tarball
        run: |
          if [ ! -f data/country_osm_grid.sql.gz ]; then
            wget --no-verbose -O data/country_osm_grid.sql.gz https://www.nominatim.org/data/country_grid.sql.gz
          fi
          cd ..
          tar czf nominatim-src.tar.bz2 Nominatim
          mv nominatim-src.tar.bz2 Nominatim

      - name: 'Upload Artifact'
        uses: actions/upload-artifact@v2
        with:
          name: full-source
          path: nominatim-src.tar.bz2
          retention-days: 1

  tests:
    needs: create-archive
    strategy:
      matrix:
        ubuntu: [18, 20]

@@ -22,10 +53,12 @@ jobs:

    runs-on: ubuntu-${{ matrix.ubuntu }}.04

    steps:
      - uses: actions/checkout@v2
      - uses: actions/download-artifact@v2
        with:
          submodules: true
          path: Nominatim
          name: full-source

      - name: Unpack Nominatim
        run: tar xf nominatim-src.tar.bz2

      - name: Setup PHP
        uses: shivammathur/setup-php@v2

@@ -39,18 +72,6 @@ jobs:

          python-version: 3.6
        if: matrix.ubuntu == 18

      - name: Get Date
        id: get-date
        run: |
          echo "::set-output name=date::$(/bin/date -u "+%Y%W")"
        shell: bash

      - uses: actions/cache@v2
        with:
          path: |
            country_grid.sql.gz
          key: nominatim-country-data-${{ steps.get-date.outputs.date }}

      - uses: ./Nominatim/.github/actions/setup-postgresql
        with:
          postgresql-version: ${{ matrix.postgresql }}

@@ -65,8 +86,7 @@ jobs:

        if: matrix.ubuntu == 20

      - name: Install test prerequsites
        run: |
          pip3 install pylint==2.6.0 pytest pytest-cov behave==1.2.6
        run: pip3 install pylint==2.6.0 pytest pytest-cov behave==1.2.6
        if: matrix.ubuntu == 18

      - name: PHP linting

@@ -103,11 +123,6 @@ jobs:

        working-directory: Nominatim/test/bdd
        if: matrix.ubuntu == 18

      - name: BDD tests (legacy_icu tokenizer)
        run: |
          behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=legacy_icu --format=progress3
        working-directory: Nominatim/test/bdd

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v1
        with:

@@ -119,43 +134,35 @@ jobs:

          verbose: true
        if: matrix.ubuntu == 20

  import:

  icu-test:
    needs: create-archive
    strategy:
      matrix:
        ubuntu: [18, 20]
        ubuntu: [20]
        include:
          - ubuntu: 18
            postgresql: 9.5
            postgis: 2.5
          - ubuntu: 20
            postgresql: 13
            postgis: 3
            pytest: py.test-3
            php: 7.4

    runs-on: ubuntu-${{ matrix.ubuntu }}.04

    steps:
      - uses: actions/checkout@v2
      - uses: actions/download-artifact@v2
        with:
          submodules: true
          path: Nominatim
          name: full-source

      - name: Get Date
        id: get-date
        run: |
          echo "::set-output name=date::$(/bin/date -u "+%Y%W")"
        shell: bash
      - name: Unpack Nominatim
        run: tar xf nominatim-src.tar.bz2

      - uses: actions/cache@v2
      - name: Setup PHP
        uses: shivammathur/setup-php@v2
        with:
          path: |
            country_grid.sql.gz
          key: nominatim-country-data-${{ steps.get-date.outputs.date }}

      - uses: actions/cache@v2
        with:
          path: |
            monaco-latest.osm.pbf
          key: nominatim-test-data-${{ steps.get-date.outputs.date }}
          php-version: ${{ matrix.php }}
          coverage: xdebug
          tools: phpunit, phpcs, composer

      - uses: actions/setup-python@v2
        with:

@@ -166,52 +173,148 @@ jobs:

        with:
          postgresql-version: ${{ matrix.postgresql }}
          postgis-version: ${{ matrix.postgis }}

      - uses: ./Nominatim/.github/actions/build-nominatim
        with:
          ubuntu: ${{ matrix.ubuntu }}

      - name: Clean installation
        run: rm -rf Nominatim build
      - name: Install test prerequsites
        run: sudo apt-get install -y -qq python3-behave
        if: matrix.ubuntu == 20

      - name: Install test prerequsites
        run: pip3 install behave==1.2.6
        if: matrix.ubuntu == 18

      - name: BDD tests (icu tokenizer)
        run: |
          behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3
        working-directory: Nominatim/test/bdd


  install:
    runs-on: ubuntu-latest
    needs: create-archive

    strategy:
      matrix:
        name: [Ubuntu-18, Ubuntu-20, Centos-8]
        include:
          - name: Ubuntu-18
            flavour: ubuntu
            image: "ubuntu:18.04"
            ubuntu: 18
            install_mode: install-nginx
          - name: Ubuntu-20
            flavour: ubuntu
            image: "ubuntu:20.04"
            ubuntu: 20
            install_mode: install-apache
          - name: Centos-8
            flavour: centos
            image: "centos:8"

    container:
      image: ${{ matrix.image }}
      env:
        LANG: en_US.UTF-8

    defaults:
      run:
        shell: sudo -Hu nominatim bash --noprofile --norc -eo pipefail {0}

    steps:
      - name: Prepare container (Ubuntu)
        run: |
          export APT_LISTCHANGES_FRONTEND=none
          export DEBIAN_FRONTEND=noninteractive
          apt-get update -qq
          apt-get install -y git sudo wget
          ln -snf /usr/share/zoneinfo/$CONTAINER_TIMEZONE /etc/localtime && echo $CONTAINER_TIMEZONE > /etc/timezone
        shell: bash
        if: matrix.flavour == 'ubuntu'

      - name: Prepare container (CentOS)
        run: |
          dnf update -y
          dnf install -y sudo glibc-langpack-en
        shell: bash
        if: matrix.flavour == 'centos'

      - name: Setup import user
        run: |
          useradd -m nominatim
          echo 'nominatim ALL=(ALL:ALL) NOPASSWD: ALL' > /etc/sudoers.d/nominiatim
          echo "/home/nominatim/Nominatim/vagrant/Install-on-${OS}.sh no $INSTALL_MODE" > /home/nominatim/vagrant.sh
        shell: bash
        env:
          OS: ${{ matrix.name }}
          INSTALL_MODE: ${{ matrix.install_mode }}

      - uses: actions/download-artifact@v2
        with:
          name: full-source
          path: /home/nominatim

      - name: Install Nominatim
        run: |
          export USERNAME=nominatim
          export USERHOME=/home/nominatim
          export NOSYSTEMD=yes
          export HAVE_SELINUX=no
          tar xf nominatim-src.tar.bz2
          . vagrant.sh
        working-directory: /home/nominatim

      - name: Prepare import environment
        run: |
          if [ ! -f monaco-latest.osm.pbf ]; then
            wget --no-verbose https://download.geofabrik.de/europe/monaco-latest.osm.pbf
          fi
          mkdir data-env
          cd data-env
        shell: bash
          mv Nominatim/test/testdb/apidb-test-data.pbf test.pbf
          rm -rf Nominatim
          mkdir data-env-reverse
        working-directory: /home/nominatim

      - name: Prepare import environment (CentOS)
        run: |
          sudo ln -s /usr/local/bin/nominatim /usr/bin/nominatim
          echo NOMINATIM_DATABASE_WEBUSER="apache" > nominatim-project/.env
          cp nominatim-project/.env data-env-reverse/.env
        working-directory: /home/nominatim
        if: matrix.flavour == 'centos'

      - name: Import
        run: nominatim import --osm-file ../monaco-latest.osm.pbf
        shell: bash
        working-directory: data-env
        run: nominatim import --osm-file ../test.pbf
        working-directory: /home/nominatim/nominatim-project

      - name: Import special phrases
        run: nominatim special-phrases --import-from-wiki
        working-directory: data-env
        working-directory: /home/nominatim/nominatim-project

      - name: Check full import
        run: nominatim admin --check-database
        working-directory: data-env
        working-directory: /home/nominatim/nominatim-project

      - name: Warm up database
        run: nominatim admin --warm
        working-directory: data-env
        working-directory: /home/nominatim/nominatim-project

      - name: Prepare update (Ubuntu)
        run: apt-get install -y python3-pip
        shell: bash
        if: matrix.flavour == 'ubuntu'

      - name: Run update
        run: |
          nominatim replication --init
          nominatim replication --once
        working-directory: data-env
          pip3 install --user osmium
          nominatim replication --init
          NOMINATIM_REPLICATION_MAX_DIFF=1 nominatim replication --once
        working-directory: /home/nominatim/nominatim-project

      - name: Run reverse-only import
        run : nominatim import --osm-file ../monaco-latest.osm.pbf --reverse-only --no-updates
        working-directory: data-env
        env:
          NOMINATIM_DATABASE_DSN: pgsql:dbname=reverse
        run : |
          echo 'NOMINATIM_DATABASE_DSN="pgsql:dbname=reverse"' >> .env
          nominatim import --osm-file ../test.pbf --reverse-only --no-updates
        working-directory: /home/nominatim/data-env-reverse

      - name: Check reverse import
        run: nominatim admin --check-database
        working-directory: data-env
        working-directory: /home/nominatim/data-env-reverse
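The BDD invocation used by these jobs can also be run by hand against a local build; a minimal sketch, assuming the sources are checked out in `Nominatim` with a finished build in a sibling `build` directory (the paths here are illustrative, not taken from the workflow):

```sh
pip3 install behave==1.2.6
cd Nominatim/test/bdd
behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$PWD/../../../build -DTOKENIZER=icu --format=progress3
```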
.gitignore (vendored, 7 lines changed)
@@ -1,12 +1,9 @@

*.log
*.pyc

build
settings/local.php
docs/develop/*.png

data/wiki_import.sql
data/wiki_specialphrases.sql
data/osmosischange.osc
build

.vagrant
data/country_osm_grid.sql.gz
@@ -18,9 +18,9 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")

project(nominatim)

set(NOMINATIM_VERSION_MAJOR 3)
set(NOMINATIM_VERSION_MINOR 7)
set(NOMINATIM_VERSION_PATCH 0)
set(NOMINATIM_VERSION_MAJOR 4)
set(NOMINATIM_VERSION_MINOR 0)
set(NOMINATIM_VERSION_PATCH 2)

set(NOMINATIM_VERSION "${NOMINATIM_VERSION_MAJOR}.${NOMINATIM_VERSION_MINOR}.${NOMINATIM_VERSION_PATCH}")

@@ -38,6 +38,7 @@ set(BUILD_TESTS on CACHE BOOL "Build test suite")

set(BUILD_DOCS on CACHE BOOL "Build documentation")
set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page")
set(BUILD_OSM2PGSQL on CACHE BOOL "Build osm2pgsql (expert only)")
set(INSTALL_MUNIN_PLUGINS on CACHE BOOL "Install Munin plugins for supervising Nominatim")

#-----------------------------------------------------------------------------
# osm2pgsql (imports/updates only)

@@ -153,7 +154,7 @@ if (BUILD_TESTS)

    if (PHPCS)
        message(STATUS "Using phpcs binary ${PHPCS}")
        add_test(NAME phpcs
                 COMMAND ${PHPCS} --report-width=120 --colors lib website utils
                 COMMAND ${PHPCS} --report-width=120 --colors lib-php
                 WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
    else()
        message(WARNING "phpcs not found. PHP linting tests disabled." )

@@ -199,7 +200,7 @@ endif()

#-----------------------------------------------------------------------------

if (BUILD_MANPAGE)
   add_subdirectory(manual)
   add_subdirectory(man)
endif()

#-----------------------------------------------------------------------------

@@ -211,6 +212,7 @@ include(GNUInstallDirs)

set(NOMINATIM_DATADIR ${CMAKE_INSTALL_FULL_DATADIR}/${PROJECT_NAME})
set(NOMINATIM_LIBDIR ${CMAKE_INSTALL_FULL_LIBDIR}/${PROJECT_NAME})
set(NOMINATIM_CONFIGDIR ${CMAKE_INSTALL_FULL_SYSCONFDIR}/${PROJECT_NAME})
set(NOMINATIM_MUNINDIR ${CMAKE_INSTALL_FULL_DATADIR}/munin/plugins)

if (BUILD_IMPORTER)
    configure_file(${PROJECT_SOURCE_DIR}/cmake/tool-installed.tmpl installed.bin)

@@ -258,6 +260,16 @@ install(FILES settings/env.defaults

              settings/import-address.style
              settings/import-full.style
              settings/import-extratags.style
              settings/legacy_icu_tokenizer.yaml
              settings/icu-rules/extended-unicode-to-asccii.yaml
              settings/icu_tokenizer.yaml
              settings/country_settings.yaml
        DESTINATION ${NOMINATIM_CONFIGDIR})

install(DIRECTORY settings/icu-rules
        DESTINATION ${NOMINATIM_CONFIGDIR})

if (INSTALL_MUNIN_PLUGINS)
    install(FILES munin/nominatim_importlag
                  munin/nominatim_query_speed
                  munin/nominatim_requests
            DESTINATION ${NOMINATIM_MUNINDIR})
endif()
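For reference, the version bump above lands in the usual out-of-source CMake build; a minimal sketch of building and installing this release (standard CMake workflow, directory names are examples):

```sh
# build next to the checked-out Nominatim sources
mkdir build && cd build
cmake ../Nominatim
make
sudo make install
```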
ChangeLog (63 lines changed)
@@ -1,3 +1,65 @@

4.0.2

 * fix XSS vulnerability in debug view

4.0.1

 * fix initialisation error in replication script
 * ICU tokenizer: avoid any special characters in word tokens
 * better error message when API php script does not exist
 * fix quoting of house numbers in SQL queries
 * small fixes and improvements in search query parsing
 * add documentation for moving the database to a different machine

4.0.0

 * refactor name token computation and introduce ICU tokenizer
   * name processing now happens in the indexer outside the DB
   * reorganizes abbreviation handling and moves it to the indexing phases
   * adds preprocessing of names
 * add country-specific ranking for Spain, Slovakia
 * partially switch to using SP-GIST indexes
 * better updating of dependent addresses for name changes in streets
 * remove unused/broken tables for external housenumbers
 * move external postcodes to CSV format and no longer save them in tables
   (adds support for postcodes for arbitrary countries)
 * remove postcode helper entries from placex (thanks @AntoJvlt)
 * change required format for TIGER data to CSV
 * move configuration of default languages from wiki into config file
 * expect customized configuration files in project directory by default
 * disable search API for reverse-only import (thanks @darkshredder)
 * port most of maintenance/import code to Python and remove PHP utils
 * add catch-up mode for replication
 * add updating of special phrases (thanks @AntoJvlt)
 * add support for special phrases in CSV files (thanks @AntoJvlt)
 * switch to case-independent matching between place and boundary names
 * remove disabling of reverse query parsing
 * minor tweaks to search algorithm to avoid more false positives
 * major overhaul of the administrator and developer documentation
 * add security disclosure policy
 * add testing of installation scripts via CI
 * drop support for Python < 3.6 and Postgresql < 9.5

3.7.3

 * fix XSS vulnerability in debug view

3.7.2

 * fix database check for reverse-only imports
 * do not error out in status API result when import date is missing
 * add array_key_last function for PHP < 7.3 (thanks to @woodpeck)
 * fix more url when server name is unknown (thanks to @mogita)
 * commit changes to replication log table

3.7.1

 * fix smaller issues with special phrases import (thanks @AntoJvlt)
 * add index to speed up continued indexing during import
 * fix index on location_property_tiger(parent_place_id) (thanks @changpingc)
 * make sure Python code is backward-compatible with Python 3.5
 * various documentation fixes

3.7.0

 * switch to dotenv for configuration file

@@ -20,7 +82,6 @@

 * add non-key indexes to speed up housenumber + street searches
 * switch housenumber field in placex to save transliterated names


3.6.0

 * add full support for searching by and displaying of addr:* tags
@@ -20,14 +20,6 @@ https://nominatim.org/release-docs/develop/ .

Installation
============

**Nominatim is a complex piece of software and runs in a complex environment.
Installing and running Nominatim is something for experienced system
administrators only who can do some trouble-shooting themselves. We are sorry,
but we can not provide installation support. We are all doing this in our free
time and there is just so much of that time to go around. Do not open issues in
our bug tracker if you need help. Use the discussions forum
or ask for help on [help.openstreetmap.org](https://help.openstreetmap.org/).**

The latest stable release can be downloaded from https://nominatim.org.
There you can also find [installation instructions for the release](https://nominatim.org/release-docs/latest/admin/Installation), as well as an extensive [Troubleshooting/FAQ section](https://nominatim.org/release-docs/latest/admin/Faq/).
File diff suppressed because one or more lines are too long
@@ -10,6 +10,7 @@ set (DOC_SOURCES

     admin
     develop
     api
     customize
     index.md
     extra.css
     styles.css

@@ -26,7 +27,10 @@ ADD_CUSTOM_TARGET(doc

    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Centos-8.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Centos-8.md
    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Ubuntu-18.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Ubuntu-18.md
    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Ubuntu-20.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Ubuntu-20.md
    COMMAND mkdocs build -d ${CMAKE_CURRENT_BINARY_DIR}/../site-html -f ${CMAKE_CURRENT_BINARY_DIR}/../mkdocs.yml
    COMMAND PYTHONPATH=${PROJECT_SOURCE_DIR} mkdocs build -d ${CMAKE_CURRENT_BINARY_DIR}/../site-html -f ${CMAKE_CURRENT_BINARY_DIR}/../mkdocs.yml
)

ADD_CUSTOM_TARGET(serve-doc
    COMMAND PYTHONPATH=${PROJECT_SOURCE_DIR} mkdocs serve
    WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
)
@@ -5,9 +5,34 @@ your Nominatim database. It is assumed that you have already successfully

installed the Nominatim software itself, if not return to the
[installation page](Installation.md).

## Importing multiple regions
## Importing multiple regions (without updates)

To import multiple regions in your database, you need to configure and run `utils/import_multiple_regions.sh` file. This script will set up the update directory which has the following structure:
To import multiple regions in your database you can simply give multiple
OSM files to the import command:

```
nominatim import --osm-file file1.pbf --osm-file file2.pbf
```

If you already have imported a file and want to add another one, you can
use the add-data function to import the additional data as follows:

```
nominatim add-data --file <FILE>
nominatim refresh --postcodes
nominatim index -j <NUMBER OF THREADS>
```

Please note that adding additional data is always significantly slower than
the original import.
## Importing multiple regions (with updates)

If you want to import multiple regions _and_ be able to keep them up-to-date
with updates, then you can use the scripts provided in the `utils` directory.

These scripts will set up an `update` directory in your project directory,
which has the following structure:

```bash
update
@@ -17,7 +42,6 @@ update
 │   └── monaco
 │       └── sequence.state
 └── tmp
     ├── combined.osm.pbf
     └── europe
         ├── andorra-latest.osm.pbf
         └── monaco-latest.osm.pbf
@@ -25,87 +49,59 @@ update

```

The `sequence.state` files will contain the sequence ID, which will be used by pyosmium to get updates. The tmp folder is used for import dump.
The `sequence.state` files contain the sequence ID for each region. They will
be used by pyosmium to get updates. The `tmp` folder is used for import dump and
can be deleted once the import is complete.

### Configuring multiple regions

The file `import_multiple_regions.sh` needs to be edited as per your requirement:

1. List of countries. eg:

        COUNTRIES="europe/monaco europe/andorra"

2. Path to Build directory. eg:

        NOMINATIMBUILD="/srv/nominatim/build"

3. Path to Update directory. eg:

        UPDATEDIR="/srv/nominatim/update"

4. Replication URL. eg:

        BASEURL="https://download.geofabrik.de"
        DOWNCOUNTRYPOSTFIX="-latest.osm.pbf"

### Setting up multiple regions

!!! tip
    If your database already exists and you want to add more countries,
    replace the setting up part
    `${SETUPFILE} --osm-file ${UPDATEDIR}/tmp/combined.osm.pbf --all 2>&1`
    with `${UPDATEFILE} --import-file ${UPDATEDIR}/tmp/combined.osm.pbf --index --index-instances N 2>&1`
    where N is the numbers of CPUs in your system.

Create a project directory as described for the
[simple import](Import.md#creating-the-project-directory). If necessary,
you can also add an `.env` configuration with customized options. In particular,
you need to make sure that `NOMINATIM_REPLICATION_UPDATE_INTERVAL` and
`NOMINATIM_REPLICATION_RECHECK_INTERVAL` are set according to the update
interval of the extract server you use.

Run the following command from your Nominatim directory after configuring the file.
Copy the scripts `utils/import_multiple_regions.sh` and `utils/update_database.sh`
into the project directory.

    bash ./utils/import_multiple_regions.sh

Now customize both files as per your requirements

!!! danger "Important"
    This file uses osmium-tool. It must be installed before executing the import script.
    Installation instructions can be found [here](https://osmcode.org/osmium-tool/manual.html#installation).
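The osmium-tool dependency is used to merge the downloaded extracts into the `combined.osm.pbf` shown in the directory layout above; the equivalent manual step looks roughly like this (file names follow that layout):

```sh
cd update/tmp
# merge the per-country extracts into a single input file for the import
osmium merge europe/monaco-latest.osm.pbf europe/andorra-latest.osm.pbf -o combined.osm.pbf
```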
### Updating multiple regions

To import multiple regions in your database, you need to configure and run ```utils/update_database.sh```.
This uses the update directory set up while setting up the DB.

### Configuring multiple regions

The file `update_database.sh` needs to be edited as per your requirement:

1. List of countries. eg:
1. List of countries. e.g.

        COUNTRIES="europe/monaco europe/andorra"

2. Path to Build directory. eg:
2. URL to the service providing the extracts and updates. eg:

        NOMINATIMBUILD="/srv/nominatim/build"

3. Path to Update directory. eg:

        UPDATEDIR="/srv/nominatim/update"

4. Replication URL. eg:

        BASEURL="https://download.geofabrik.de"
        DOWNCOUNTRYPOSTFIX="-updates"
        DOWNCOUNTRYPOSTFIX="-latest.osm.pbf"

5. Followup can be set according to your installation. eg: For Photon,
5. Followup in the update script can be set according to your installation.
   E.g. for Photon,

        FOLLOWUP="curl http://localhost:2322/nominatim-update"

   will handle the indexing.

To start the initial import, change into the project directory and run

```
bash import_multiple_regions.sh
```

### Updating the database

Run the following command from your Nominatim directory after configuring the file.
Change into the project directory and run the following command:

    bash ./utils/update_database.sh
    bash update_database.sh

This will get diffs from the replication server, import diffs and index the database. The default replication server in the script([Geofabrik](https://download.geofabrik.de)) provides daily updates.
This will get diffs from the replication server, import diffs and index
the database. The default replication server in the
script([Geofabrik](https://download.geofabrik.de)) provides daily updates.
## Importing Nominatim to an external PostgreSQL database
## Using an external PostgreSQL database

You can install Nominatim using a database that runs on a different server when
you have physical access to the file system on the other server. Nominatim

@@ -113,6 +109,11 @@ uses a custom normalization library that needs to be made accessible to the

PostgreSQL server. This section explains how to set up the normalization
library.

!!! note
    The external module is only needed when using the legacy tokenizer.
    If you have choosen the ICU tokenizer, then you can ignore this section
    and follow the standard import documentation.

### Option 1: Compiling the library on the database server

The most sure way to get a working library is to compile it on the database
@@ -170,4 +171,45 @@ NOMINATIM_DATABASE_MODULE_PATH="<directory on the database server where nominati

```

Now change the `NOMINATIM_DATABASE_DSN` to point to your remote server and continue
to follow the [standard instructions for importing](/admin/Import).
to follow the [standard instructions for importing](Import.md).


## Moving the database to another machine

For some configurations it may be useful to run the import on one machine, then
move the database to another machine and run the Nominatim service from there.
For example, you might want to use a large machine to be able to run the import
quickly but only want a smaller machine for production because there is not so
much load. Or you might want to do the import once and then replicate the
database to many machines.

The important thing to keep in mind when transferring the Nominatim installation
is that you need to transfer the database _and the project directory_. Both
parts are essential for your installation.

The Nominatim database can be transferred using the `pg_dump`/`pg_restore` tool.
Make sure to use the same version of PostgreSQL and PostGIS on source and
target machine.
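A minimal dump-and-restore sketch for this transfer, assuming the database uses the default name `nominatim` on both machines:

```sh
# on the source machine: create a custom-format dump
pg_dump -Fc -f nominatim.dump nominatim

# on the target machine: create an empty database and restore into it
createdb nominatim
pg_restore -d nominatim nominatim.dump
```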
!!! note
    Before creating a dump of your Nominatim database, consider running
    `nominatim freeze` first. Your database looses the ability to receive further
    data updates but the resulting database is only about a third of the size
    of a full database.

Next install Nominatim on the target machine by following the standard installation
instructions. Again make sure to use the same version as the source machine.

You can now copy the project directory from the source machine to the new machine.
If necessary, edit the `.env` file to point it to the restored database.
Finally run

    nominatim refresh --website

to make sure that the local installation of Nominatim will be used.

If you are using the legacy tokenizer you might also have to switch to the
PostgreSQL module that was compiled on your target machine. If you get errors
that PostgreSQL cannot find or access `nominatim.so` then copy the installed
version into the `module` directory of your project directory. The installed
copy can usually be found under `/usr/local/lib/nominatim/module/nominatim.so`.
@@ -1,101 +0,0 @@

# Customization of the Database

This section explains in detail how to configure a Nominatim import and
the various means to use external data.

## External postcode data

Nominatim creates a table of known postcode centroids during import. This table
is used for searches of postcodes and for adding postcodes to places where the
OSM data does not provide one. These postcode centroids are mainly computed
from the OSM data itself. In addition, Nominatim supports reading postcode
information from an external CSV file, to supplement the postcodes that are
missing in OSM.

To enable external postcode support, simply put one CSV file per country into
your project directory and name it `<CC>_postcodes.csv`. `<CC>` must be the
two-letter country code for which to apply the file. The file may also be
gzipped. Then it must be called `<CC>_postcodes.csv.gz`.

The CSV file must use commas as a delimiter and have a header line. Nominatim
expects three columns to be present: `postcode`, `lat` and `lon`. All other
columns are ignored. `lon` and `lat` must describe the x and y coordinates of the
postcode centroids in WGS84.
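For illustration, a hypothetical `mc_postcodes.csv` for Monaco following the format just described could be created like this (the single entry is made-up example data):

```sh
cat > mc_postcodes.csv <<'EOF'
postcode,lat,lon
98000,43.7384,7.4246
EOF
```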
The postcode files are loaded only when there is data for the given country
in your database. For example, if there is a `us_postcodes.csv` file in your
project directory but you import only an excerpt of Italy, then the US postcodes
will simply be ignored.

As a rule, the external postcode data should be put into the project directory
**before** starting the initial import. Still, you can add, remove and update the
external postcode data at any time. Simply run:

```
nominatim refresh --postcodes
```

to make the changes visible in your database. Be aware, however, that the changes
only have an immediate effect on searches for postcodes. Postcodes that were
added to places are only updated, when they are reindexed. That usually happens
only during replication updates.
## Installing Tiger housenumber data for the US

Nominatim is able to use the official [TIGER](https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
address set to complement the OSM house number data in the US. You can add
TIGER data to your own Nominatim instance by following these steps. The
entire US adds about 10GB to your database.

1. Get preprocessed TIGER 2020 data:

        cd $PROJECT_DIR
        wget https://nominatim.org/data/tiger2020-nominatim-preprocessed.csv.tar.gz

2. Import the data into your Nominatim database:

        nominatim add-data --tiger-data tiger2020-nominatim-preprocessed.csv.tar.gz

3. Enable use of the Tiger data in your `.env` by adding:

        echo NOMINATIM_USE_US_TIGER_DATA=yes >> .env

4. Apply the new settings:

        nominatim refresh --functions

See the [developer's guide](../develop/data-sources.md#us-census-tiger) for more
information on how the data got preprocessed.
## Special phrases import

As described in the [Importation chapter](Import.md), it is possible to
import special phrases from the wiki with the following command:

```sh
nominatim special-phrases --import-from-wiki
```

But, it is also possible to import some phrases from a csv file.
To do so, you have access to the following command:

```sh
nominatim special-phrases --import-from-csv <csv file>
```

Note that the two previous import commands will update the phrases from your database.
This means that if you import some phrases from a csv file, only the phrases
present in the csv file will be kept into the database. All other phrases will
be removed.

If you want to only add new phrases and not update the other ones you can add
the argument `--no-replace` to the import command. For example:

```sh
nominatim special-phrases --import-from-csv <csv file> --no-replace
```

This will add the phrases present in the csv file into the database without
removing the other ones.
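A sketch of such a CSV import; the column layout used here (`phrase,class,type,operator`) is an assumption and should be verified against the release documentation for the CSV importer before use:

```sh
# hypothetical input file; the column layout is assumed, not confirmed
cat > my_phrases.csv <<'EOF'
phrase,class,type,operator
bakery,shop,bakery,-
pub,amenity,pub,-
EOF
nominatim special-phrases --import-from-csv my_phrases.csv --no-replace
```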
@@ -134,7 +134,7 @@ On CentOS v7 the PostgreSQL server is started with `systemd`. Check if

`/usr/lib/systemd/system/httpd.service` contains a line `PrivateTmp=true`. If
so then Apache cannot see the `/tmp/.s.PGSQL.5432` file. It's a good security
feature, so use the
[preferred solution](../appendix/Install-on-Centos-7/#adding-selinux-security-settings).
[preferred solution](../appendix/Install-on-Centos-7.md#adding-selinux-security-settings).

However, you can solve this the quick and dirty way by commenting out that line and then run

@@ -182,7 +182,7 @@ by everybody, e.g.

Try `chmod a+r nominatim.so; chmod a+x nominatim.so`.

When running SELinux, make sure that the
[context is set up correctly](../appendix/Install-on-Centos-7/#adding-selinux-security-settings).
[context is set up correctly](../appendix/Install-on-Centos-7.md#adding-selinux-security-settings).

When you recently updated your operating system, updated PostgreSQL to
a new version or moved files (e.g. the build directory) you should
@@ -47,8 +47,9 @@ You can also set the same configuration via environment variables. All

settings have a `NOMINATIM_` prefix to avoid conflicts with other environment
variables.

There are lots of configuration settings you can tweak. Have a look
at `Nominatim/settings/env.default` for a full list. Most should have a sensible default.
There are lots of configuration settings you can tweak. A full reference
can be found in the chapter [Configuration Settings](../customize/Settings.md).
Most should have a sensible default.

#### Flatnode files

@@ -95,7 +96,7 @@ This data can be optionally downloaded into the project directory:

    wget https://www.nominatim.org/data/us_postcodes.csv.gz

You can also add your own custom postcode sources, see
[Customization of postcodes](Customization.md#external-postcode-data).
[Customization of postcodes](../customize/Postcodes.md).
## Choosing the data to import

@@ -111,7 +112,7 @@ If you only need geocoding for a smaller region, then precomputed OSM extracts

are a good way to reduce the database size and import time.
[Geofabrik](https://download.geofabrik.de) offers extracts for most countries.
They even have daily updates which can be used with the update process described
[in the next section](../Update). There are also
[in the next section](Update.md). There are also
[other providers for extracts](https://wiki.openstreetmap.org/wiki/Planet.osm#Downloading).

Please be aware that some extracts are not cut exactly along the country

@@ -137,6 +138,14 @@ Note that you still need to provide for sufficient disk space for the initial

import. So this option is particularly interesting if you plan to transfer the
database or reuse the space later.

!!! warning
    The datastructure for updates are also required when adding additional data
    after the import, for example [TIGER housenumber data](../customize/Tiger.md).
    If you plan to use those, you must not use the `--no-updates` parameter.
    Do a normal import, add the external data and once you are done with
    everything run `nominatim freeze`.
### Reverse-only Imports

If you only want to use the Nominatim database for reverse lookups or

@@ -152,15 +161,15 @@ Nominatim normally sets up a full search database containing administrative

boundaries, places, streets, addresses and POI data. There are also other
import styles available which only read selected data:

* **settings/import-admin.style**
* **admin**
  Only import administrative boundaries and places.
* **settings/import-street.style**
* **street**
  Like the admin style but also adds streets.
* **settings/import-address.style**
* **address**
  Import all data necessary to compute addresses down to house number level.
* **settings/import-full.style**
* **full**
  Default style that also includes points of interest.
* **settings/import-extratags.style**
* **extratags**
  Like the full style but also adds most of the OSM tags into the extratags
  column.
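Which of these styles is used is controlled by a configuration setting; a minimal sketch, assuming the `NOMINATIM_IMPORT_STYLE` setting accepts the style names listed above:

```sh
# select the address style for the following import (run in the project directory)
echo NOMINATIM_IMPORT_STYLE=address >> .env
nominatim import --osm-file monaco-latest.osm.pbf 2>&1 | tee setup.log
```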
@@ -183,8 +192,8 @@ full | 54h | 640 GB | 330 GB

extratags | 54h | 650 GB | 340 GB

You can also customize the styles further.
A [description of the style format](../develop/Import.md#configuring-the-import)
can be found in the development section.
A [description of the style format](../customize/Import-Styles.md)
can be found in the customization guide.

## Initial import of the data

@@ -200,7 +209,7 @@ nominatim import --osm-file <data file> 2>&1 | tee setup.log

```

The **project directory** is the one that you have set up at the beginning.
See [creating the project directory](Import#creating-the-project-directory).
See [creating the project directory](#creating-the-project-directory).

### Notes on full planet imports

@@ -219,7 +228,7 @@ to load the OSM data into the PostgreSQL database. This step is very demanding

in terms of RAM usage. osm2pgsql and PostgreSQL are running in parallel at
this point. PostgreSQL blocks at least the part of RAM that has been configured
with the `shared_buffers` parameter during
[PostgreSQL tuning](Installation#postgresql-tuning)
[PostgreSQL tuning](Installation.md#postgresql-tuning)
and needs some memory on top of that. osm2pgsql needs at least 2GB of RAM for
its internal data structures, potentially more when it has to process very large
relations. In addition it needs to maintain a cache for node locations. The size
@@ -238,7 +247,8 @@ reduce the cache size or even consider using a flatnode file.

### Testing the installation

Run this script to verify all required tables and indices got created successfully.
Run this script to verify that all required tables and indices got created
successfully.

```sh
nominatim admin --check-database

@@ -261,23 +271,10 @@ reverse query, e.g. `http://localhost:8088/reverse.php?lat=27.1750090510034&lon=

To run Nominatim via webservers like Apache or nginx, please read the
[Deployment chapter](Deployment.md).

## Tuning the database

Accurate word frequency information for search terms helps PostgreSQL's query
planner to make the right decisions. Recomputing them can improve the performance
of forward geocoding in particular under high load. To recompute word counts run:

```sh
nominatim refresh --word-counts
```

This will take a couple of hours for a full planet installation. You can
also defer that step to a later point in time when you realise that
performance becomes an issue. Just make sure that updates are stopped before
running this function.

## Adding search through category phrases

If you want to be able to search for places by their type through
[special key phrases](https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases)
[special phrases](https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases)
you also need to import these key phrases like this:

```sh

@@ -288,4 +285,4 @@ Note that this command downloads the phrases from the wiki link above. You

need internet access for the step.

You can also import special phrases from a csv file, for more
information please read the [Customization chapter](Customization.md).
information please see the [Customization part](../customize/Special-Phrases.md).
@@ -24,6 +24,10 @@ and can't offer support.

### Software

!!! Warning
    For larger installations you **must have** PostgreSQL 11+ and Postgis 3+
    otherwise import and queries will be slow to the point of being unusable.

For compiling:

* [cmake](https://cmake.org/)

@@ -39,7 +43,7 @@ For compiling:

For running Nominatim:

* [PostgreSQL](https://www.postgresql.org) (9.5+ will work, 11+ strongly recommended)
* [PostGIS](https://postgis.net) (2.2+)
* [PostGIS](https://postgis.net) (2.2+ will work, 3.0+ strongly recommended)
* [Python 3](https://www.python.org/) (3.6+)
* [Psycopg2](https://www.psycopg.org) (2.7+)
* [Python Dotenv](https://github.com/theskumar/python-dotenv)
docs/admin/Maintenance.md (new file, 51 lines)
@@ -0,0 +1,51 @@

This chapter describes the various operations the Nominatim database administrator
may use to clean and maintain the database. None of these operations is mandatory
but they may help improve the performance and accuracy of results.


## Updating postcodes

Command: `nominatim refresh --postcodes`

Postcode centroids (aka 'calculated postcodes') are generated by looking at all
postcodes of a country, grouping them and calculating the geometric centroid.
There is currently no logic to deal with extreme outliers (typos or other
mistakes in OSM data). There is also no check if a postcodes adheres to a
country's format, e.g. if Swiss postcodes are 4 digits.

When running regular updates, postcodes results can be improved by running
this command on a regular basis. Note that only the postcode table and the
postcode search terms are updated. The postcode that is assigned to each place
is only updated when the place is updated.

The command takes around 70min to run on the planet and needs ca. 40GB of
temporary disk space.


## Updating word counts

Command: `nominatim refresh --word-counts`

Nominatim keeps frequency statistics about all search terms it indexes. These
statistics are currently used to optimise queries to the database. Thus better
statistics mean better performance. Word counts are created once after import
and are usually sufficient even when running regular updates. You might want
to rerun the statistics computation when adding larger amounts of new data,
for example, when adding an additional country via `nominatim add-data`.


## Removing large deleted objects

Nominatim refuses to delete very large areas because often these deletions are
accidental and are reverted within hours. Instead the deletions are logged in
the `import_polygon_delete` table and left to the administrator to clean up.

There is currently no command to do that. You can use the following SQL
query to force a deletion on all objects that have been deleted more than
a certain timespan ago (here: 1 month):

```sql
SELECT place_force_delete(p.place_id) FROM import_polygon_delete d, placex p
WHERE p.osm_type = d.osm_type and p.osm_id = d.osm_id
and age(p.indexed_date) > '1 month'::interval
```
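One way to run the query above non-interactively, assuming the database uses the default name `nominatim`:

```sh
# force-delete all objects whose deletion was logged more than a month ago
psql -d nominatim -c "
  SELECT place_force_delete(p.place_id)
    FROM import_polygon_delete d, placex p
   WHERE p.osm_type = d.osm_type AND p.osm_id = d.osm_id
     AND age(p.indexed_date) > '1 month'::interval"
```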
@@ -15,6 +15,27 @@ breaking changes. **Please read them before running the migration.**

If you are migrating from a version <3.6, then you still have to follow
the manual migration steps up to 3.6.

## 3.7.0 -> 4.0.0

### NOMINATIM_PHRASE_CONFIG removed

Custom blacklist configurations for special phrases now need to be handed
with the `--config` parameter to `nominatim special-phrases`. Alternatively
you can put your custom configuration in the project directory in a file
named `phrase-settings.json`.

Version 3.8 also removes the automatic converter for the php format of
the configuration in older versions. If you are updating from Nominatim < 3.7
and still work with a custom `phrase-settings.php`, you need to manually
convert it into a json format.

### PHP utils removed

The old PHP utils have now been removed completely. You need to switch to
the appropriate functions of the nominatim command line tool. See
[Introducing `nominatim` command line tool](#introducing-nominatim-command-line-tool)
below.

## 3.6.0 -> 3.7.0

### New format and name of configuration file

@@ -80,7 +101,7 @@ done

The debugging UI is no longer directly provided with Nominatim. Instead we
now provide a simple Javascript application. Please refer to
[Setting up the Nominatim UI](../Setup-Nominatim-UI) for details on how to
[Setting up the Nominatim UI](Setup-Nominatim-UI.md) for details on how to
set up the UI.

The icons served together with the API responses have been moved to the
@@ -16,13 +16,14 @@ and run it. Grab the latest release from

[nominatim-ui's Github release page](https://github.com/osm-search/nominatim-ui/releases)
and unpack it. You can use `nominatim-ui-x.x.x.tar.gz` or `nominatim-ui-x.x.x.zip`.

Copy the example configuration into the right place:
Next you need to adapt the UI yo your installation. Custom settings need to be
put into `dist/theme/config.theme.js`. At a minimum you need to
set `Nominatim_API_Endpoint` to point to your Nominatim installation:

    cd nominatim-ui
    cp dist/config.example.js dist/config.js
    echo "Nominatim_Config.Nominatim_API_Endpoint='https:\\myserver.org\nominatim';" > dist/theme/config.theme.js

Now adapt the configuration to your needs. You need at least
to change the `Nominatim_API_Endpoint` to point to your Nominatim installation.
For the full set of available settings, have a look at `dist/config.defaults.js`.

Then you can just test it locally by spinning up a webserver in the `dist`
directory. For example, with Python:
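A simple local test server for this, assuming Python 3 is installed (port number is an example):

```sh
cd nominatim-ui/dist
python3 -m http.server 8000
# then browse to http://localhost:8000/
```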
@@ -10,18 +10,21 @@ For a list of other methods to add or update data see the output of

If you have configured a flatnode file for the import, then you
need to keep this flatnode file around for updates.

#### Installing the newest version of Pyosmium
### Installing the newest version of Pyosmium

It is recommended to install Pyosmium via pip. Make sure to use python3.
The replication process uses
[Pyosmium](https://docs.osmcode.org/pyosmium/latest/updating_osm_data.html)
to download update data from the server.
It is recommended to install Pyosmium via pip.
Run (as the same user who will later run the updates):

```sh
pip3 install --user osmium
```

#### Setting up the update process
### Setting up the update process

Next the update needs to be initialised. By default Nominatim is configured
Next the update process needs to be initialised. By default Nominatim is configured
to update using the global minutely diffs.

If you want a different update source you will need to add some settings

@@ -45,12 +48,119 @@ what you expect.

The `replication --init` command needs to be rerun whenever the replication
service is changed.

#### Updating Nominatim
### Updating Nominatim

The following command will keep your database constantly up to date:
Nominatim supports different modes how to retrieve the update data from the
server. Which one you want to use depends on your exact setup and how often you
want to retrieve updates.

These instructions are for using a single source of updates. If you have
imported multiple country extracts and want to keep them
up-to-date, [Advanced installations section](Advanced-Installations.md)
contains instructions to set up and update multiple country extracts.

#### Continuous updates

This is the easiest mode. Simply run the replication command without any
parameters:

    nominatim replication

If you have imported multiple country extracts and want to keep them
up-to-date, [Advanced installations section](Advanced-Installations.md) contains instructions
to set up and update multiple country extracts.
The update application keeps running forever and retrieves and applies
new updates from the server as they are published.

You can run this command as a simple systemd service. Create a service
description like that in `/etc/systemd/system/nominatim-update.service`:

```
[Unit]
Description=Continuous updates of Nominatim

[Service]
WorkingDirectory=/srv/nominatim
ExecStart=nominatim replication
StandardOutput=append:/var/log/nominatim-updates.log
StandardError=append:/var/log/nominatim-updates.error.log
User=nominatim
Group=nominatim
Type=simple

[Install]
WantedBy=multi-user.target
```

Replace the `WorkingDirectory` with your project directory. Also adapt user
and group names as required.

Now activate the service and start the updates:

```
sudo systemctl daemon-reload
sudo systemctl enable nominatim-updates
sudo systemctl start nominatim-updates
```
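Once the service is running, its state and progress can be checked with the standard tools (log paths as configured in the unit above):

```sh
sudo systemctl status nominatim-updates
tail -f /var/log/nominatim-updates.log
```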
#### One-time mode

When the `--once` parameter is given, then Nominatim will download exactly one
batch of updates and then exit. This one-time mode still respects the
`NOMINATIM_REPLICATION_UPDATE_INTERVAL` that you have set. If according to
the update interval no new data has been published yet, it will go to sleep
until the next expected update and only then attempt to download the next batch.

The one-time mode is particularly useful if you want to run updates continuously
but need to schedule other work in between updates. For example, the main
service at osm.org uses it, to regularly recompute postcodes -- a process that
must not be run while updates are in progress. Its update script
looks like this:

```sh
#!/bin/bash

# Switch to your project directory.
cd /srv/nominatim

while true; do
  nominatim replication --once
  if [ -f "/srv/nominatim/schedule-mainenance" ]; then
    rm /srv/nominatim/schedule-mainenance
    nominatim refresh --postcodes
  fi
done
```

A cron job then creates the file `/srv/nominatim/need-mainenance` once per night.
#### Catch-up mode
|
||||
|
||||
With the `--catch-up` parameter, Nominatim will immediately try to download
|
||||
all changes from the server until the database is up-to-date. The catch-up mode
|
||||
still respects the parameter `NOMINATIM_REPLICATION_MAX_DIFF`. It downloads and
|
||||
applies the changes in appropriate batches until all is done.
|
||||
|
||||
The catch-up mode is foremost useful to bring the database up to speed after the
|
||||
initial import. Give that the service usually is not in production at this
|
||||
point, you can temporarily be a bit more generous with the batch size and
|
||||
number of threads you use for the updates by running catch-up like this:
|
||||
|
||||
```
|
||||
cd /srv/nominatim
|
||||
NOMINATIM_REPLICATION_MAX_DIFF=5000 nominatim replication --catch-up --threads 15
|
||||
```
|
||||
|
||||
The catch-up mode is also useful when you want to apply updates at a lower
|
||||
frequency than what the source publishes. You can set up a cron job to run
|
||||
replication catch-up at whatever interval you desire.
|
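
As a sketch, a nightly catch-up run could be scheduled with a crontab entry
like the following (the schedule, project directory and log file are
placeholder choices to adapt to your setup):

```sh
# Run a replication catch-up every night at 03:15.
15 3 * * * cd /srv/nominatim && nominatim replication --catch-up >> /var/log/nominatim-updates.log 2>&1
```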

!!! hint
    When running scheduled updates with catch-up, it is a good idea to choose
    a replication source with an update frequency that is an order of magnitude
    lower. For example, if you want to update once a day, use an hourly updated
    source. This makes sure that you don't miss an entire day of updates when
    the source is unexpectedly late to publish its update.

    If you want to use a source with the same update frequency (e.g. a daily
    updated source with daily updates), use the continuous update mode. It
    keeps re-requesting the newest update until it is published.

@@ -35,7 +35,7 @@ it contains the county/state/country across the border.

#### 3. I get different counties/states/countries when I change the zoom parameter in the reverse query. How is that possible?

This is basically the same problem as in the previous answer.
The zoom level influences at which [search rank](../customize/Ranking.md#search-rank) Nominatim starts looking
for the closest object. So the closest house number may be on one side of the
border while the closest street is on the other. As the address details contain
the address of the closest object found, you might sometimes get one result,

@@ -290,6 +290,7 @@ with a designation label. By default the following labels may appear:

* emergency, historic, military, natural, landuse, place, railway,
  man_made, aerialway, boundary, amenity, aeroway, club, craft, leisure,
  office, mountain_pass, shop, tourism, bridge, tunnel, waterway
* postcode

They roughly correspond to the classification of the OpenStreetMap data
according to either the `place` tag or the main key of the object.

@@ -27,8 +27,8 @@ The search term may be specified with two different sets of parameters:

Free-form query string to search for.
Free-form queries are processed first left-to-right and then right-to-left if that fails. So you may search for
[pilkington avenue, birmingham](https://nominatim.openstreetmap.org/search?q=pilkington+avenue,birmingham) as well as for
[birmingham, pilkington avenue](https://nominatim.openstreetmap.org/search?q=birmingham,+pilkington+avenue).
Commas are optional, but improve performance by reducing the complexity of the search.

@@ -1,38 +1,24 @@
# OSM Data Import

## Configuring the Import

Which OSM objects are added to the database and which of the tags are used
can be configured via the import style configuration file. This
is a JSON file which contains a list of rules that are matched against every
tag of every object and assign each matched tag its specific role.

The style to use is given by the `NOMINATIM_IMPORT_STYLE` configuration
option. There are a number of default styles, which are explained in detail
in the [Import section](../admin/Import.md#filtering-imported-data). These
standard styles may be referenced by their name.

You can also create your own custom style. Put the style file into your
project directory and then set `NOMINATIM_IMPORT_STYLE` to the name of the file.
It is always recommended to start with one of the standard styles and customize
those. You find the standard styles under the name `import-<stylename>.style`
in the standard Nominatim configuration path (usually `/etc/nominatim` or
`/usr/local/etc/nominatim`).

The remainder of the page describes the format of the file.

### Configuration Rules

A single rule looks like this:

@@ -159,9 +145,6 @@ A rule can define as many of these properties for one match as it likes. For
example, if the property is `"main,extra"` then the tag will open a new row
but also have the tag appear in the list of extra tags.

### Changing the Style of Existing Databases

There is normally no issue changing the style of a database that is already

docs/customize/Overview.md (new file, 20 lines)
@@ -0,0 +1,20 @@

Nominatim comes with a predefined set of configuration options that should
work for most standard installations. If you have special requirements, there
are many places where the configuration can be adapted. This chapter describes
the following configurable parts:

* [Global Settings](Settings.md) has a detailed description of all parameters that
  can be set in your local `.env` configuration
* [Import styles](Import-Styles.md) explains how to write your own import style
  in order to control what kind of OSM data will be imported
* [Place ranking](Ranking.md) describes the configuration around classifying
  places in terms of their importance and their role in an address
* [Tokenizers](Tokenizers.md) describes the configuration of the module
  responsible for analysing and indexing names
* [Special Phrases](Special-Phrases.md) are common nouns or phrases that
  can be used in search to identify a class of places

There are also guides for adding the following external data:

* [US house numbers from the TIGER dataset](Tiger.md)
* [External postcodes](Postcodes.md)

docs/customize/Postcodes.md (new file, 37 lines)
@@ -0,0 +1,37 @@

# External postcode data

Nominatim creates a table of known postcode centroids during import. This table
is used for searches of postcodes and for adding postcodes to places where the
OSM data does not provide one. These postcode centroids are mainly computed
from the OSM data itself. In addition, Nominatim supports reading postcode
information from an external CSV file, to supplement the postcodes that are
missing in OSM.

To enable external postcode support, simply put one CSV file per country into
your project directory and name it `<CC>_postcodes.csv`. `<CC>` must be the
two-letter country code for which to apply the file. The file may also be
gzipped. Then it must be called `<CC>_postcodes.csv.gz`.

The CSV file must use commas as a delimiter and have a header line. Nominatim
expects three columns to be present: `postcode`, `lat` and `lon`. All other
columns are ignored. `lon` and `lat` must describe the x and y coordinates of the
postcode centroids in WGS84.
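
For illustration, a minimal `de_postcodes.csv` might look like this (the
coordinates are made-up example values, not authoritative centroids):

```
postcode,lat,lon
10117,52.5170,13.3889
20095,53.5503,10.0006
```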

The postcode files are loaded only when there is data for the given country
in your database. For example, if there is a `us_postcodes.csv` file in your
project directory but you import only an excerpt of Italy, then the US postcodes
will simply be ignored.

As a rule, the external postcode data should be put into the project directory
**before** starting the initial import. Still, you can add, remove and update the
external postcode data at any time. Simply run:

```
nominatim refresh --postcodes
```

to make the changes visible in your database. Be aware, however, that the changes
only have an immediate effect on searches for postcodes. Postcodes that were
added to places are only updated when they are reindexed. That usually happens
only during replication updates.

@@ -1,8 +1,7 @@
# Place Ranking in Nominatim

This chapter explains what place ranking means and how it can be customized.

## Search rank

docs/customize/Settings.md (new file, 649 lines)
@@ -0,0 +1,649 @@

# Configuring Nominatim

This section provides a reference of all configuration parameters that can
be used with Nominatim.

Nominatim uses [dotenv](https://github.com/theskumar/python-dotenv) to manage
its configuration settings. There are two means to set configuration
variables: through an `.env` configuration file or through an environment
variable.

The `.env` configuration file needs to be placed into the
[project directory](../admin/Import.md#creating-the-project-directory). It
must contain configuration parameters in `<parameter>=<value>` format.
Please refer to the dotenv documentation for details.

The configuration options may also be set in the form of shell environment
variables. This is particularly useful when you want to temporarily change
a configuration option. For example, to force the replication service to
download the next change, you can temporarily disable the update interval:

    NOMINATIM_REPLICATION_UPDATE_INTERVAL=0 nominatim replication --once

If a configuration option is defined both through the .env file and through an
environment variable, then the latter takes precedence.

## Configuration Parameter Reference

### Import and Database Settings

#### NOMINATIM_DATABASE_DSN

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Database connection string |
| **Format:** | string: `pgsql:<param1>=<value1>;<param2>=<value2>;...` |
| **Default:** | pgsql:dbname=nominatim |
| **After Changes:** | run `nominatim refresh --website` |

Sets the connection parameters for the Nominatim database. At a minimum
the name of the database (`dbname`) is required. You can set any additional
parameter that is understood by libpq. See the
[Postgres documentation](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-PARAMKEYWORDS)
for a full list.

!!! note
    It is usually recommended not to set the password directly in this
    configuration parameter. Use a
    [password file](https://www.postgresql.org/docs/current/libpq-pgpass.html)
    instead.
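
For example, an `.env` entry for a database on another server might look like
this (host and user are placeholder values):

```
NOMINATIM_DATABASE_DSN="pgsql:dbname=nominatim;host=db.example.com;user=nominatim"
```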

#### NOMINATIM_DATABASE_WEBUSER

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Database query user |
| **Format:** | string |
| **Default:** | www-data |
| **After Changes:** | cannot be changed after import |

Defines the name of the database user that will run search queries. Usually
this is the user under which the webserver is executed. When running Nominatim
via php-fpm, you can also define a separate query user. The Postgres user
needs to be set up before starting the import.

Nominatim grants this user the minimal rights on all tables that are needed
for running geocoding queries.


#### NOMINATIM_DATABASE_MODULE_PATH

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Directory where to find the PostgreSQL server module |
| **Format:** | path |
| **Default:** | _empty_ (use `<project_directory>/module`) |
| **After Changes:** | run `nominatim refresh --functions` |
| **Comment:** | Legacy tokenizer only |

Defines the directory in which the PostgreSQL server module `nominatim.so`
is stored. The directory and module must be accessible by the PostgreSQL
server.

For information on how to use this setting when working with external databases,
see [Advanced Installations](../admin/Advanced-Installations.md).

The option is only used by the Legacy tokenizer and ignored otherwise.


#### NOMINATIM_TOKENIZER

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Tokenizer used for normalizing and parsing queries and names |
| **Format:** | string |
| **Default:** | legacy |
| **After Changes:** | cannot be changed after import |

Sets the tokenizer type to use for the import. For more information on
available tokenizers and how they are configured, see
[Tokenizers](../customize/Tokenizers.md).


#### NOMINATIM_TOKENIZER_CONFIG

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Configuration file for the tokenizer |
| **Format:** | path |
| **Default:** | _empty_ (default file depends on tokenizer) |
| **After Changes:** | see documentation for each tokenizer |

Points to the file with additional configuration for the tokenizer.
See the [Tokenizer](../customize/Tokenizers.md) descriptions for details
on the file format.

If a relative path is given, then the file is searched for first relative to the
project directory and then in the global settings directory.

#### NOMINATIM_MAX_WORD_FREQUENCY

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Number of occurrences before a word is considered frequent |
| **Format:** | int |
| **Default:** | 50000 |
| **After Changes:** | cannot be changed after import |
| **Comment:** | Legacy tokenizer only |

The word frequency count is used by the Legacy tokenizer to automatically
identify _stop words_. Any partial term that occurs more often than what
is defined in this setting is effectively ignored during search.


#### NOMINATIM_LIMIT_REINDEXING

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Avoid invalidating large areas |
| **Format:** | bool |
| **Default:** | yes |

Nominatim computes the address of each place at indexing time. This has the
advantage of making search faster but also means that more objects need to
be invalidated when the data changes. For example, changing the name of
the state of Florida would require recomputing every single address point
in the state to make the new name searchable in conjunction with addresses.

Setting this option to 'yes' means that Nominatim skips reindexing of contained
objects when the area becomes too large.


#### NOMINATIM_LANGUAGES

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Restrict search languages |
| **Format:** | string: comma-separated list of language codes |
| **Default:** | _empty_ |

Normally Nominatim will include all language variants of name:XX
in the search index. Set this to a comma-separated list of language
codes to restrict the import to a subset of languages.
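
For instance, to restrict the import to English, French and German name
variants:

```
NOMINATIM_LANGUAGES=en,fr,de
```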

Currently this only affects the initial import of country names and special phrases.


#### NOMINATIM_TERM_NORMALIZATION

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Rules for normalizing terms for comparisons |
| **Format:** | string: semicolon-separated list of ICU rules |
| **Default:** | :: NFD (); [[:Nonspacing Mark:] [:Cf:]] >; :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC (); |
| **Comment:** | Legacy tokenizer only |

[Special phrases](Special-Phrases.md) have stricter matching requirements than
normal search terms. They must appear exactly in the query after this term
normalization has been applied.

This only has an effect on the Legacy tokenizer. For the ICU tokenizer the rules
defined in the
[normalization section](Tokenizers.md#normalization-and-transliteration)
will be used.


#### NOMINATIM_USE_US_TIGER_DATA

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Enable searching for Tiger house number data |
| **Format:** | boolean |
| **Default:** | no |
| **After Changes:** | run `nominatim refresh --functions` |

When this setting is enabled, search and reverse queries also take data
from [Tiger house number data](Tiger.md) into account.


#### NOMINATIM_USE_AUX_LOCATION_DATA

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Enable searching in external house number tables |
| **Format:** | boolean |
| **Default:** | no |
| **After Changes:** | run `nominatim refresh --functions` |
| **Comment:** | Do not use. |

When this setting is enabled, search queries also take data from external
house number tables into account.

*Warning:* This feature is currently unmaintained and should not be used.


#### NOMINATIM_HTTP_PROXY

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Use HTTP proxy when downloading data |
| **Format:** | boolean |
| **Default:** | no |

When this setting is enabled and at least
[NOMINATIM_HTTP_PROXY_HOST](#nominatim_http_proxy_host) and
[NOMINATIM_HTTP_PROXY_PORT](#nominatim_http_proxy_port) are set, the
configured proxy will be used when downloading external data like
replication diffs.


#### NOMINATIM_HTTP_PROXY_HOST

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Host name of the proxy to use |
| **Format:** | string |
| **Default:** | _empty_ |

When [NOMINATIM_HTTP_PROXY](#nominatim_http_proxy) is enabled, this setting
configures the proxy host name.


#### NOMINATIM_HTTP_PROXY_PORT

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Port number of the proxy to use |
| **Format:** | integer |
| **Default:** | 3128 |

When [NOMINATIM_HTTP_PROXY](#nominatim_http_proxy) is enabled, this setting
configures the port number to use with the proxy.


#### NOMINATIM_HTTP_PROXY_LOGIN

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Username for proxies that require login |
| **Format:** | string |
| **Default:** | _empty_ |

When [NOMINATIM_HTTP_PROXY](#nominatim_http_proxy) is enabled, use this
setting to define the username for proxies that require a login.


#### NOMINATIM_HTTP_PROXY_PASSWORD

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Password for proxies that require login |
| **Format:** | string |
| **Default:** | _empty_ |

When [NOMINATIM_HTTP_PROXY](#nominatim_http_proxy) is enabled, use this
setting to define the password for proxies that require a login.


#### NOMINATIM_OSM2PGSQL_BINARY

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Location of the osm2pgsql binary |
| **Format:** | path |
| **Default:** | _empty_ (use binary shipped with Nominatim) |
| **Comment:** | EXPERT ONLY |

Nominatim uses [osm2pgsql](https://osm2pgsql.org) to load the OSM data
initially into the database. Nominatim comes bundled with a version of
osm2pgsql that is guaranteed to be compatible. Use this setting to use
a different binary instead. You should do this only when you know exactly
what you are doing. If the osm2pgsql version is not compatible, then the
result is undefined.


#### NOMINATIM_WIKIPEDIA_DATA_PATH

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Directory with the wikipedia importance data |
| **Format:** | path |
| **Default:** | _empty_ (project directory) |

Set a custom location for the
[wikipedia ranking file](../admin/Import.md#wikipediawikidata-rankings). When
unset, Nominatim expects the data to be saved in the project directory.

#### NOMINATIM_ADDRESS_LEVEL_CONFIG

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Configuration file for rank assignments |
| **Format:** | path |
| **Default:** | address-levels.json |

The _address level configuration_ defines the rank assignments for places. See
[Place Ranking](Ranking.md) for a detailed explanation of what rank assignments
are and what the configuration file must look like.

When a relative path is given, then the file is searched for first relative to the
project directory and then in the global settings directory.


#### NOMINATIM_IMPORT_STYLE

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Configuration to use for the initial OSM data import |
| **Format:** | string or path |
| **Default:** | extratags |

The _style configuration_ describes which OSM objects and tags are taken
into consideration for the search database. Nominatim comes with a set
of pre-configured styles that may be selected here.

You can also write your own custom style and point the setting to the file
with the style. When a relative path is given, then the style file is searched
for first relative to the project directory and then in the global settings
directory.

See [Import Styles](Import-Styles.md)
for more information on the available internal styles and the format of the
configuration file.
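
For example, to select the smaller `street` standard style instead of the
default:

```
NOMINATIM_IMPORT_STYLE=street
```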

#### NOMINATIM_FLATNODE_FILE

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Location of osm2pgsql flatnode file |
| **Format:** | path |
| **Default:** | _empty_ (do not use a flatnode file) |
| **After Changes:** | Only change when moving the file physically. |

The osm2pgsql `flatnode file` is a file that efficiently stores the geographic
location of OSM nodes. For larger imports it can significantly speed up
the import. When this option is unset, osm2pgsql uses a PostgreSQL table
to store the locations.

When a relative path is given, then the flatnode file is created/searched
relative to the project directory.
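
For example, to store the node cache in a file called `flatnode.file` inside
the project directory (the file name is an arbitrary choice):

```
NOMINATIM_FLATNODE_FILE=flatnode.file
```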

!!! warning

    The flatnode file is not only used during the initial import but also
    when adding new data with `nominatim add-data` or `nominatim replication`.
    Make sure you keep the flatnode file around and this setting unmodified
    if you plan to add more data or run regular updates.


#### NOMINATIM_TABLESPACE_*

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Group of settings for distributing the database over tablespaces |
| **Format:** | string |
| **Default:** | _empty_ (do not use tablespaces) |
| **After Changes:** | no effect after initial import |

Nominatim allows distributing the search database over up to 10 different
[PostgreSQL tablespaces](https://www.postgresql.org/docs/current/manage-ag-tablespaces.html).
If you use this option, make sure that the tablespaces exist before starting
the import.

The available tablespace groups are:

NOMINATIM_TABLESPACE_SEARCH_DATA
: Data used by the geocoding frontend.

NOMINATIM_TABLESPACE_SEARCH_INDEX
: Indexes used by the geocoding frontend.

NOMINATIM_TABLESPACE_OSM_DATA
: Raw OSM data cache used for import and updates.

NOMINATIM_TABLESPACE_OSM_INDEX
: Indexes on the raw OSM data cache.

NOMINATIM_TABLESPACE_PLACE_DATA
: Data table with the pre-filtered but still unprocessed OSM data.
  Used only during imports and updates.

NOMINATIM_TABLESPACE_PLACE_INDEX
: Indexes on the raw data table. Used only during imports and updates.

NOMINATIM_TABLESPACE_ADDRESS_DATA
: Data tables used for computing search terms and addresses of places
  during import and updates.

NOMINATIM_TABLESPACE_ADDRESS_INDEX
: Indexes on the data tables for search term and address computation.
  Used only for import and updates.

NOMINATIM_TABLESPACE_AUX_DATA
: Auxiliary data tables for non-OSM data, e.g. for Tiger house number data.

NOMINATIM_TABLESPACE_AUX_INDEX
: Indexes on auxiliary data tables.
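
As a sketch, distributing the frontend data and indexes over two tablespaces
might look like this in `.env`, assuming tablespaces named `ssd_data` and
`ssd_index` have already been created in PostgreSQL:

```
NOMINATIM_TABLESPACE_SEARCH_DATA=ssd_data
NOMINATIM_TABLESPACE_SEARCH_INDEX=ssd_index
```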

### Replication Update Settings

#### NOMINATIM_REPLICATION_URL

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Base URL of the replication service |
| **Format:** | url |
| **Default:** | https://planet.openstreetmap.org/replication/minute |
| **After Changes:** | run `nominatim replication --init` |

Replication services deliver updates to OSM data. Use this setting to choose
which replication service to use. See [Updates](../admin/Update.md) for more
information on how to set up regular updates.

#### NOMINATIM_REPLICATION_MAX_DIFF

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Maximum amount of data to download per update cycle (in MB) |
| **Format:** | integer |
| **Default:** | 50 |
| **After Changes:** | restart the replication process |

At each update cycle Nominatim downloads diffs until either no more diffs
are available on the server (i.e. the database is up-to-date) or the limit
given in this setting is exceeded. Nominatim guarantees to download at least
one diff, if one is available, no matter how small the setting.

The default for this setting is fairly conservative because Nominatim keeps
all data downloaded in one cycle in RAM. Using large values on a production
server may interfere badly with the search frontend because it evicts data
from RAM that is needed for speedy answers to incoming requests. It is usually
a better idea to keep this setting lower and run multiple update cycles
to catch up with updates.

When catching up in non-production mode, for example after the initial import,
the setting can easily be changed temporarily on the command line:

    NOMINATIM_REPLICATION_MAX_DIFF=3000 nominatim replication


#### NOMINATIM_REPLICATION_UPDATE_INTERVAL

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Publication interval of the replication service (in seconds) |
| **Format:** | integer |
| **Default:** | 75 |
| **After Changes:** | restart the replication process |

This setting determines when Nominatim will next attempt to download a new
update. The time is computed from the publication date of the last diff
downloaded. Setting this to a slightly higher value than the actual
publication interval avoids unnecessary rechecks.
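
For example, a setup following the publicly available daily diffs of the main
OSM planet server could combine the two settings like this (86400 seconds
correspond to the daily publication interval):

```
NOMINATIM_REPLICATION_URL=https://planet.openstreetmap.org/replication/day
NOMINATIM_REPLICATION_UPDATE_INTERVAL=86400
```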

#### NOMINATIM_REPLICATION_RECHECK_INTERVAL

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Wait time to recheck for a pending update (in seconds) |
| **Format:** | integer |
| **Default:** | 60 |
| **After Changes:** | restart the replication process |

When replication updates are run in continuous mode (using `nominatim replication`),
this setting determines how long Nominatim waits until it looks for updates
again when no updates were available on the server.

Note that this is different from
[NOMINATIM_REPLICATION_UPDATE_INTERVAL](#nominatim_replication_update_interval).
Nominatim will never attempt to query for new updates for UPDATE_INTERVAL
seconds after the current database date. Only after the update interval has
passed does it ask for new data. If no new data is found then, it waits for
RECHECK_INTERVAL seconds before attempting again.

### API Settings

#### NOMINATIM_CORS_NOACCESSCONTROL

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Send permissive CORS access headers |
| **Format:** | boolean |
| **Default:** | yes |
| **After Changes:** | run `nominatim refresh --website` |

When this setting is enabled, API HTTP responses include the HTTP
[CORS](https://en.wikipedia.org/wiki/CORS) headers
`access-control-allow-origin: *` and `access-control-allow-methods: OPTIONS,GET`.

#### NOMINATIM_MAPICON_URL

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | URL prefix for static icon images |
| **Format:** | url |
| **Default:** | _empty_ |
| **After Changes:** | run `nominatim refresh --website` |

When a mapicon URL is configured, then Nominatim includes an additional `icon`
field in the responses, pointing to an appropriate icon for the place type.

Map icons used to be included in Nominatim itself but have now moved to the
[nominatim-ui](https://github.com/osm-search/nominatim-ui/) project. If you
want the URL to be included in API responses, make the `/mapicon`
directory of the project available under a public URL and point this setting
to the directory.


#### NOMINATIM_DEFAULT_LANGUAGE

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Language of responses when no language is requested |
| **Format:** | language code |
| **Default:** | _empty_ (use the local language of the feature) |
| **After Changes:** | run `nominatim refresh --website` |

Nominatim localizes the place names in responses when the corresponding
translation is available. Users can request a custom language setting through
the HTTP accept-language header or through the explicit parameter
[accept-language](../api/Search.md#language-of-results). If neither is
given, it falls back to this setting. If the setting is also empty, then
the local language (in OSM: the name tag without any language suffix) is
used.


#### NOMINATIM_SEARCH_BATCH_MODE

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Enable a special batch query mode |
| **Format:** | boolean |
| **Default:** | no |
| **After Changes:** | run `nominatim refresh --website` |

This feature is currently undocumented and potentially broken.


#### NOMINATIM_SEARCH_NAME_ONLY_THRESHOLD

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Threshold for switching the search index lookup strategy |
| **Format:** | integer |
| **Default:** | 500 |
| **After Changes:** | run `nominatim refresh --website` |

This setting defines the threshold over which a name is no longer considered
rare. When searching for places with rare names, only the name is used
for place lookups. Otherwise the name and any address information are used.

This setting only has an effect after `nominatim refresh --word-counts` has
been called to compute the word frequencies.


#### NOMINATIM_LOOKUP_MAX_COUNT

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Maximum number of OSM ids accepted by /lookup |
| **Format:** | integer |
| **Default:** | 50 |
| **After Changes:** | run `nominatim refresh --website` |

The /lookup endpoint accepts a list of ids to look up address details for. This
setting restricts the number of places a user may look up with a single
request.


#### NOMINATIM_POLYGON_OUTPUT_MAX_TYPES

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Number of different geometry formats that may be returned |
| **Format:** | integer |
| **Default:** | 1 |
| **After Changes:** | run `nominatim refresh --website` |

Nominatim supports returning full geometries of places. The geometries may
be requested in different formats with one of the
[`polygon_*` parameters](../api/Search.md#polygon-output). Use this
setting to restrict the number of geometry types that may be requested
with a single query.

Setting this parameter to 0 disables polygon output completely.

### Logging Settings

#### NOMINATIM_LOG_DB

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Log requests into the database |
| **Format:** | boolean |
| **Default:** | no |
| **After Changes:** | run `nominatim refresh --website` |

Enable logging of requests into a database table with this setting. The logs
can be found in the table `new_query_log`.

When using this logging method, it is advisable to set up a job that
regularly clears out old logging information. Nominatim will not do that
on its own.

Can be used at the same time as NOMINATIM_LOG_FILE.

#### NOMINATIM_LOG_FILE

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Log requests into a file |
| **Format:** | path |
| **Default:** | _empty_ (logging disabled) |
| **After Changes:** | run `nominatim refresh --website` |

Enable logging of requests into a file by setting this option to the name of
the file to log to. A relative file name is assumed to be relative to
the project directory.
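
For example, to write the log into the project directory (the file name is an
arbitrary choice):

```
NOMINATIM_LOG_FILE=query.log
```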

The entries in the log file have the following format:

    <request time> <execution time in s> <number of results> <type> "<query string>"

Request time is the time when the request was started. The execution time is
given in seconds and corresponds to the time the query took executing in PHP.
type contains the name of the endpoint used.

Can be used at the same time as NOMINATIM_LOG_DB.

docs/customize/Special-Phrases.md (new file, 34 lines)
@@ -0,0 +1,34 @@

# Special phrases

## Importing OSM user-maintained special phrases

As described in the [Import section](../admin/Import.md), it is possible to
import special phrases from the wiki with the following command:

```sh
nominatim special-phrases --import-from-wiki
```

## Importing custom special phrases

It is also possible to import phrases from a CSV file.
To do so, you have access to the following command:

```sh
nominatim special-phrases --import-from-csv <csv file>
```

Note that the two previous import commands will update the phrases in your database.
This means that if you import some phrases from a CSV file, only the phrases
present in the CSV file will be kept in the database. All other phrases will
be removed.
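
As a sketch of what such a file might contain, assuming the importer expects
the columns `phrase`, `class`, `type` and `operator` (check the command's
documentation for the authoritative column list):

```
phrase,class,type,operator
Car wash,amenity,car_wash,-
Car washes,amenity,car_wash,-
```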

If you want to only add new phrases and not update the other ones, you can add
the argument `--no-replace` to the import command. For example:

```sh
nominatim special-phrases --import-from-csv <csv file> --no-replace
```

This will add the phrases present in the CSV file into the database without
removing the other ones.

docs/customize/Tiger.md (new file, 28 lines)
@@ -0,0 +1,28 @@

# Installing TIGER housenumber data for the US

Nominatim is able to use the official [TIGER](https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
address set to complement the OSM house number data in the US. You can add
TIGER data to your own Nominatim instance by following these steps. The
entire US adds about 10GB to your database.

1. Get the preprocessed TIGER 2021 data:

        cd $PROJECT_DIR
        wget https://nominatim.org/data/tiger2021-nominatim-preprocessed.csv.tar.gz

2. Import the data into your Nominatim database:

        nominatim add-data --tiger-data tiger2021-nominatim-preprocessed.csv.tar.gz

3. Enable use of the Tiger data in your `.env` by adding:

        echo NOMINATIM_USE_US_TIGER_DATA=yes >> .env

4. Apply the new settings:

        nominatim refresh --functions

See the [TIGER-data project](https://github.com/osm-search/TIGER-data) for more
information on how the data got preprocessed.

@@ -37,39 +37,42 @@ NOMINATIM_DATABASE_MODULE_PATH=<path to directory where nominatim.so resides>
```

This is particularly useful when the database runs on a different server.
See [Advanced installations](../admin/Advanced-Installations.md#importing-nominatim-to-an-external-postgresql-database) for details.

There are no other configuration options for the legacy tokenizer. All
normalization functions are hard-coded.

## ICU tokenizer

!!! danger
    This tokenizer is currently in active development and still subject
    to backwards-incompatible changes.

The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to
normalize names and queries. It also offers configurable decomposition and
abbreviation handling.

To enable the tokenizer add the following line to your project configuration:

```
NOMINATIM_TOKENIZER=icu
```

### How it works

On import the tokenizer processes names in the following three stages:

1. During the **Sanitizer step** incoming names are cleaned up and converted to
   **full names**. This step can be used to regularize spelling, split multi-name
   tags into their parts and tag names with additional attributes. See the
   [Sanitizers section](#sanitizers) below for available cleaning routines.
2. The **Normalization** part removes all information from the full names
   that is not relevant for search.
3. The **Token analysis** step takes the normalized full names and creates
   all transliterated variants under which the name should be searchable.
   See the [Token analysis](#token-analysis) section below for more
   information.

During query time, only normalization and transliteration are relevant.
An incoming query is first split into name chunks (this usually means splitting
the string at the commas) and then each part is normalized and transliterated.
The result is used to look up places in the search index.

### Configuration

@@ -87,21 +90,36 @@ normalization:
transliteration:
    - !include /etc/nominatim/icu-rules/extended-unicode-to-asccii.yaml
    - ":: Ascii ()"
sanitizers:
    - step: split-name-list
token-analysis:
    - analyzer: generic
      variants:
          - !include icu-rules/variants-ca.yaml
          - words:
              - road -> rd
              - bridge -> bdge,br,brdg,bri,brg
```

The configuration file contains four sections:
`normalization`, `transliteration`, `sanitizers` and `token-analysis`.

#### Normalization and Transliteration

The normalization and transliteration sections each define a set of
ICU rules that are applied to the names.

The **normalization** rules are applied after sanitation. They should remove
any information that is not relevant for search at all. Usual rules to be
applied here are: lower-casing, removal of special characters, cleanup of
spaces.

The **transliteration** rules are applied at the end of the tokenization
process to transfer the name into an ASCII representation. Transliteration can
be useful to allow for further fuzzy matching, especially between different
scripts.

Each section must contain a list of
[ICU transformation rules](https://unicode-org.github.io/icu/userguide/transforms/general/rules.html).
The rules are applied in the order in which they appear in the file.
You can also include additional rules from an external yaml file using the

@@ -113,6 +131,85 @@ and may again include other files.
YAML syntax. You should therefore always enclose the ICU rules in
double-quotes.

#### Sanitizers

The sanitizers section defines an ordered list of functions that are applied
to the name and address tags before they are further processed by the tokenizer.
They allow cleaning up the tagging and bringing it to a standardized form more
suitable for building the search index.

!!! hint
    Sanitizers only have an effect on how the search index is built. They
    do not change the information about each place that is saved in the
    database. In particular, they have no influence on how the results are
    displayed. The returned results always show the original information as
    stored in the OpenStreetMap database.

Each entry contains information about a sanitizer to be applied. It has a
mandatory parameter `step` which gives the name of the sanitizer. Depending
on the type, it may have additional parameters to configure its operation.

The order of the list matters. The sanitizers are applied exactly in the order
that is configured. Each sanitizer works on the results of the previous one.
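
For instance, a minimal sanitizers section that first strips terms in braces
and then splits multi-name values might look like this (both steps are among
the shipped sanitizers listed below):

```
sanitizers:
    - step: strip-brace-terms
    - step: split-name-list
```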

The following is a list of sanitizers that are shipped with Nominatim.

##### split-name-list

::: nominatim.tokenizer.sanitizers.split_name_list
    selection:
        members: False
    rendering:
        heading_level: 6

##### strip-brace-terms

::: nominatim.tokenizer.sanitizers.strip_brace_terms
    selection:
        members: False
    rendering:
        heading_level: 6

##### tag-analyzer-by-language

::: nominatim.tokenizer.sanitizers.tag_analyzer_by_language
    selection:
        members: False
    rendering:
        heading_level: 6


#### Token Analysis

Token analyzers take a full name and transform it into one or more normalized
forms that are then saved in the search index. In its simplest form, the
analyzer only applies the transliteration rules. More complex analyzers
create additional spelling variants of a name. This is useful to handle
decomposition and abbreviation.

The ICU tokenizer may use different analyzers for different names. To select
the analyzer to be used, the name must be tagged with the `analyzer` attribute
by a sanitizer (see for example the
[tag-analyzer-by-language sanitizer](#tag-analyzer-by-language)).

The token-analysis section contains the list of configured analyzers. Each
analyzer must have an `id` parameter that uniquely identifies the analyzer.
The only exception is the default analyzer that is used when no special
analyzer was selected.

Different analyzer implementations may exist. To select the implementation,
the `analyzer` parameter must be set. Currently there is only one
implementation, `generic`, which is described in the following.

##### Generic token analyzer

The generic analyzer is able to create variants from a list of given
abbreviation and decomposition replacements. It takes one optional parameter
`variants` which lists the replacements to apply. If the section is
omitted, then the generic analyzer becomes a simple analyzer that only
applies the transliteration.

The variants section defines lists of replacements which create alternative
spellings of a name. To create the variants, a name is scanned from left to
right and the longest matching replacement is applied until the end of the

@@ -138,7 +235,7 @@ term.
words in the configuration because then it is possible to change the
rules for normalization later without having to adapt the variant rules.

###### Decomposition

In its standard form, only full words match against the source. There
is a special notation to match the prefix and suffix of a word:

@@ -165,7 +262,7 @@ To avoid automatic decomposition, use the '|' notation:

simply changes "hauptstrasse" to "hauptstr" and "rote strasse" to "rote str".

###### Initial and final terms

It is also possible to restrict replacements to the beginning and end of a
name:

@@ -178,7 +275,7 @@ name:
So the first example would trigger a replacement for "south 45th street" but
not for "the south beach restaurant".

###### Replacements vs. variants

The replacement syntax `source => target` works as a pure replacement. It changes
the name instead of creating a variant. To create an additional version, you'd

docs/develop/Database-Layout.md (new file, 167 lines)
@@ -0,0 +1,167 @@

# Database Layout

## Import tables

OSM data is initially imported using [osm2pgsql](https://osm2pgsql.org).
Nominatim uses its own data output style 'gazetteer', which differs from the
output style created for map rendering.

The import process creates the following tables:

*(diagram of the import tables omitted)*

The `planet_osm_*` tables are the usual backing tables for OSM data. Note
that Nominatim uses them to look up special relations and to find nodes on
ways.

The gazetteer style produces a single table `place` as output with the following
columns:

* `osm_type` - kind of OSM object (**N** - node, **W** - way, **R** - relation)
* `osm_id` - original OSM ID
* `class` - key of the principal tag defining the object type
* `type` - value of the principal tag defining the object type
* `name` - collection of tags that contain a name or reference
* `admin_level` - numerical value of the tagged administrative level
* `address` - collection of tags defining the address of an object
* `extratags` - collection of additional interesting tags that are not
  directly relevant for searching
* `geometry` - geometry of the object (in WGS84)

A single OSM object may appear multiple times in this table when it is tagged
with multiple tags that may constitute a principal tag. Take for example a
motorway bridge. In OSM, this would be a way which is tagged with
`highway=motorway` and `bridge=yes`. This way would appear in the `place` table
once with a `class` of `highway` and once with a `class` of `bridge`. Thus the
*unique key* for `place` is (`osm_type`, `osm_id`, `class`).

How raw OSM tags are mapped to the columns in the place table is to a certain
degree configurable. See [Customizing Import Styles](../customize/Import-Styles.md)
for more information.

## Search tables

The following tables carry all information needed to do the search:

*(diagram of the search tables omitted)*

The **placex** table is the central table that saves all information about the
searchable places in Nominatim. The basic columns are the same as for the
place table and have the same meaning. The placex table adds the following
additional columns:

* `place_id` - the internal unique ID identifying the place
* `partition` - the id to use with partitioned tables (see below)
* `geometry_sector` - a location hash used for geographically close ordering
* `parent_place_id` - the next higher place in the address hierarchy, only
  relevant for POI-type places (with rank 30)
* `linked_place_id` - place ID of the place this object has been merged with.
  When this ID is set, then the place is invisible for search.
* `importance` - measure of how well known the place is
* `rank_search`, `rank_address` - search and address rank (see [Customizing ranking](../customize/Ranking.md))
* `wikipedia` - the wikipedia page used for computing the importance of the place
* `country_code` - the country the place is located in
* `housenumber` - normalized housenumber, if the place has one
* `postcode` - computed postcode for the place
* `indexed_status` - processing status of the place (0 - ready, 1 - freshly inserted, 2 - needs updating, 100 - needs deletion)
* `indexed_date` - timestamp when the place was processed last
* `centroid` - a point feature for the place
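
To get a feel for the table, a single place can be inspected directly with
`psql` (the way id here is an arbitrary example; all columns are described
above):

```sh
# Look up one way in placex and show its classification and ranks.
psql -d nominatim -c "SELECT place_id, osm_type, osm_id, class, type,
                             rank_search, rank_address, indexed_status
                        FROM placex
                       WHERE osm_type = 'W' AND osm_id = 243794313"
```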
||||
|
||||
The **location_property_osmline** table is a special table for
|
||||
[address interpolations](https://wiki.openstreetmap.org/wiki/Addresses#Using_interpolation).
|
||||
The columns have the same meaning and use as the columns with the same name in
|
||||
the placex table. Only three columns are special:
|
||||
|
||||
* `startnumber` and `endnumber` - beginning and end of the number range
|
||||
for the interpolation
|
||||
* `interpolationtype` - a string `odd`, `even` or `all` to indicate
|
||||
the interval between the numbers
|
||||
|
||||
Address interpolations are always ways in OSM, which is why there is no column
|
||||
`osm_type`.
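
These three columns are enough to decide whether a concrete house number is
covered by an interpolation; a sketch, using the illustrative house number 42:

```sql
-- Does any interpolation line cover house number 42?
SELECT place_id
  FROM location_property_osmline
 WHERE 42 BETWEEN startnumber AND endnumber
   AND (interpolationtype = 'all'
        OR (interpolationtype = 'even' AND 42 % 2 = 0)
        OR (interpolationtype = 'odd'  AND 42 % 2 = 1));
```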

The **location_postcode** table holds computed centroids of all postcodes that
can be found in the OSM data. The meaning of the columns is again the same
as that of the placex table.

Every place needs an address, a set of surrounding places that describe the
location of the place. The set of address places is made up of OSM places
themselves. The **place_addressline** table cross-references for each place
all the places that make up its address. Two columns define the address
relation:

* `place_id` - reference to the place being addressed
* `address_place_id` - reference to the place serving as an address part

Most of the columns cache information from the placex entry of the address
part. The exceptions are:

* `fromarea` - is true if the address part has an area geometry and can
  therefore be considered precise
* `isaddress` - is true if the address part should show up in the address
  output. Sometimes there are multiple places competing for the same address
  type (e.g. multiple cities) and this field resolves the tie.
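
Putting the two tables together, the displayed address of a place can be
reconstructed with a join along these lines (a sketch; `:place_id` stands for
the place in question):

```sql
-- List the address parts of a place, most specific part last.
SELECT p.name->'name' AS part, a.cached_rank_address, a.isaddress
  FROM place_addressline a
  JOIN placex p ON p.place_id = a.address_place_id
 WHERE a.place_id = :place_id
 ORDER BY a.cached_rank_address;
```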

The **search_name** table contains the search index proper. It saves for each
place the terms with which the place can be found. The terms are split into
the name itself and all terms that make up the address. The table mirrors some
of the columns from placex for faster lookup.

Search terms are not saved as strings. Each term is assigned an integer and those
integers are saved in the name and address vectors of the search_name table. The
**word** table serves as the lookup table from string to such a word ID. The
exact content of the word table depends on the [tokenizer](Tokenizers.md) used.
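
Conceptually, a name search therefore goes through the word table first; a
sketch (the token string format is tokenizer-specific, so the literal below is
purely illustrative):

```sql
-- Find all places whose name contains the token for 'bergstrasse'.
SELECT place_id
  FROM search_name
 WHERE name_vector @> ARRAY[(SELECT word_id
                               FROM word
                              WHERE word_token = 'bergstrasse'
                              LIMIT 1)];
```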

## Address computation tables

Next to the main search tables, there is a set of secondary helper tables used
to compute the address relations between places. These tables are partitioned.
Each country is assigned a partition number in the country_name table (see
below) and the data is then split between a set of tables, one for each
partition. Note that Nominatim still manually manages partitioned tables.
Native support for partitions in PostgreSQL only became usable with version 13.
It will be a little while before Nominatim drops support for older versions.

![](address-tables.svg)

The **search_name_X** tables are used to look up streets that appear in the
`addr:street` tag.

The **location_area_large_X** tables are used to look up larger areas
(administrative boundaries and place nodes) either through their geographic
closeness or through `addr:*` entries.

The **location_road_X** tables are used to find the closest street for a
dependent place.

All three tables cache specific information from the placex table for their
selected subset of places:

* `keywords` and `name_vector` contain lists of term ids (from the word table)
  that the full name of the place should match against
* `isguess` is true for places that are not described by an area

All other columns reflect their counterpart in the placex table.

## Static data tables

Nominatim also creates a number of static tables at import:

* `nominatim_properties` saves settings that must not be changed after
  import
* `address_levels` saves the rank information from the
  [ranking configuration](../customize/Ranking.md)
* `country_name` contains a fallback of names for all countries, their
  default languages and saves the assignment of countries to partitions.
* `country_osm_grid` provides a fallback for country geometries

## Auxiliary data tables

Finally there are some tables for auxiliary data:

* `location_property_tiger` - saves house numbers from the TIGER import. Its
  layout is similar to that of `location_property_osmline`.
* `place_class_*` tables are helper tables to facilitate lookup of POIs
  by their class and type. They exist because it is not possible to create
  combined indexes with geometries.
@@ -38,6 +38,7 @@ It has the following additional requirements:
 The documentation is built with mkdocs:
 
 * [mkdocs](https://www.mkdocs.org/) >= 1.1.2
+* [mkdocstrings](https://mkdocstrings.github.io/)
 
 ### Installing prerequisites on Ubuntu/Debian
 
@@ -51,7 +52,7 @@ To install all necessary packages run:
     sudo apt install php-cgi phpunit php-codesniffer \
                      python3-pip python3-setuptools python3-dev pylint
 
-    pip3 install --user behave mkdocs pytest
+    pip3 install --user behave mkdocs mkdocstrings pytest
 ```
 
 The `mkdocs` executable will be located in `.local/bin`. You may have to add
@@ -113,7 +114,7 @@ symlinks (see `CMakeLists.txt` for the exact steps).
 Now you can start webserver for local testing
 
 ```
-build> mkdocs serve
+build> make serve-doc
 [server:296] Serving on http://127.0.0.1:8000
 [handlers:62] Start watching changes
 ```
@@ -122,7 +123,7 @@ If you develop inside a Vagrant virtual machine, use a port that is forwarded
 to your host:
 
 ```
-build> mkdocs serve --dev-addr 0.0.0.0:8088
+build> PYTHONPATH=$SRCDIR mkdocs serve --dev-addr 0.0.0.0:8088
 [server:296] Serving on http://0.0.0.0:8088
 [handlers:62] Start watching changes
 ```

docs/develop/Indexing.md (new file, 152 lines)

# Indexing Places

In Nominatim, the word __indexing__ refers to the process that takes the raw
OpenStreetMap data from the place table, enriches it with address information
and creates the search indexes. This section explains the basic data flow.

## Initial import

After osm2pgsql has loaded the raw OSM data into the place table,
the data is copied to the final search tables placex and location_property_osmline.
While they are copied, some basic properties are added:

* country_code, geometry_sector and partition
* initial search and address rank

In addition the column `indexed_status` is set to `1`, marking the place as one
that needs to be indexed.

All this happens in the triggers `placex_insert` and `osmline_insert`.

## Indexing

The main workhorse of the data import is the indexing step, where Nominatim
takes every place from the placex and location_property_osmline tables where
the indexed_status != 0 and computes the search terms and the address parts
of the place.

The indexing happens in three major steps:

1. **Data preparation** - The indexer gets the data for the place to be indexed
   from the database.

2. **Search name processing** - The prepared data is given to the
   tokenizer which computes the search terms from the names
   and potentially other information.

3. **Address processing** - The indexer then hands the prepared data and the
   tokenizer information back to the database via an `INSERT` statement which
   also sets the indexed_status to `0`. This triggers the update triggers
   `placex_update`/`osmline_update` which do the work of computing address
   parts and filling all the search tables.

When computing the address terms of a place, Nominatim relies on the processed
search names of all the address parts. That is why places are processed in rank
order, from smallest rank to largest. To ensure correct handling of linked
place nodes, administrative boundaries are processed before all other places.

Apart from these restrictions, each place can be indexed independently
from the others. This allows a large degree of parallelization during the indexing.
It also means that the indexing process can be interrupted at any time and
will simply pick up where it left off when restarted.
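
The restartability comes for free from the status column: anything with a
non-zero `indexed_status` is still pending. A sketch of how a work batch might
be selected (the real indexer additionally processes rank by rank):

```sql
-- Fetch a batch of unprocessed places; ordering by geometry_sector
-- keeps geographically close places together within a batch.
SELECT place_id
  FROM placex
 WHERE indexed_status > 0
 ORDER BY geometry_sector
 LIMIT 1000;
```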

### Data preparation

The data preparation step computes and retrieves all data for a place that
might be needed for the next step of processing the search name. That includes

* location information (country code)
* place classification (class, type, ranks)
* names (including names of linked places)
* address information (`addr:*` tags)

Data preparation is implemented in PL/pgSQL, mostly in the functions
`placex_indexing_prepare()` and `get_interpolation_address()`.

#### `addr:*` tag inheritance

Nominatim has limited support for inheriting address tags from a building
to POIs inside the building. This only works when the address tags are on the
building outline. Any rank 30 object inside such a building or on its outline
inherits all address tags when it does not have any address tags of its own.

The inheritance is computed in the data preparation step.
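
The lookup behind this inheritance is essentially a containment test; a sketch
of its shape, not the actual implementation in `placex_indexing_prepare()`:

```sql
-- Which building outline would a given rank-30 POI inherit its address from?
-- :poi_geometry stands for the geometry of the POI being prepared.
SELECT b.address
  FROM placex b
 WHERE b.class = 'building'
   AND b.address IS NOT NULL
   AND ST_Covers(b.geometry, :poi_geometry)
 LIMIT 1;
```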

### Search name processing

The prepared place information is handed to the tokenizer next. This is a
Python module responsible for processing the names from both name and address
terms and building up the word index from them. The process is explained in
more detail in the [Tokenizer chapter](Tokenizers.md).

### Address processing

Finally, the preprocessed place information and the results of the search name
processing are written back to the database. At this point the update triggers
of the placex/location_property_osmline tables take over and fill all the
dependent tables. This makes up the most work-intensive part of the indexing.

Nominatim distinguishes between dependent and independent places.
**Dependent places** are all places on rank 30: house numbers, POIs etc. These
places don't have a full address of their own. Instead they are attached to
a parent street or place and use the information of the parent for searching
and displaying information. Everything else is an **independent place**: streets,
parks, water bodies, suburbs, cities, states etc. They receive a full address
of their own.

The address processing for the two types of places is very different.

#### Independent places

To compute the address of an independent place, Nominatim searches for all
places that at least partially cover the place whose address is being computed.
For places with an area, that area is used to check for coverage. For place
nodes an artificial square area is computed according to the rank of
the place. The lower the rank, the larger the area. The `location_area_large_X`
tables are there to facilitate the lookup. All places that can function as
the address of another place are saved in those tables.
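
In SQL terms the coverage search is a spatial containment query over the
partitioned area tables; a sketch, where X stands for the partition of the
place's country:

```sql
-- Candidate address parts: all stored areas that intersect the place.
SELECT place_id, rank_address, isguess
  FROM location_area_large_X
 WHERE ST_Intersects(geometry, :place_geometry)
 ORDER BY rank_address DESC;
```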

`addr:*` and `isin:*` tags are taken into account to compute the address, too.
Nominatim will give preference to places with the same name as in these tags
when looking for places in the vicinity. If there are no matching place names
at all, then the tags are at least added to the search index. That means that
the names will not be shown in the result as the 'address' of the place, but
searching by them still works.

Independent places are always added to the global search index `search_name`.

#### Dependent places

Dependent places skip the full address computation for performance reasons.
Instead they just find a parent place to attach themselves to.

![](parenting-flow.svg)

By default a POI
or house number will be attached to the closest street. That can be any major
or minor street indexed by Nominatim. In the default configuration that means
that it can attach itself to a footway, but only when it has a name.

When the dependent place has an `addr:street` tag, then Nominatim will first
try to find a street with the same name before falling back to the closest
street.
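
The closest-street fallback is a nearest-neighbour query on the partitioned
road table; a sketch (X again stands for the country partition, and the
candidate radius is illustrative):

```sql
-- Closest indexed street for a POI centroid.
SELECT place_id
  FROM location_road_X
 WHERE ST_DWithin(geometry, :poi_centroid, 0.01)
 ORDER BY ST_Distance(geometry, :poi_centroid)
 LIMIT 1;
```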

There are also addresses in OSM where the house number does not belong
to a street at all. These have an `addr:place` tag. For these places, Nominatim
tries to find a place with the given name in the indexed places with an
address rank between 16 and 25. If none is found, then the dependent place
is attached to the closest place in that category and the addr:place name is
added as an *unlisted* place, which indicates to Nominatim that it needs to add
it to the address output, no matter what. This special case is necessary to
cover addresses that don't really refer to an existing object.

When an address has both the `addr:street` and `addr:place` tags, then Nominatim
assumes that the `addr:place` tag in fact should be the city part of the address
and gives the POI the usual street number address.

Dependent places are only added to the global search index `search_name` when
they either have a name themselves or have address tags that are not
covered by the places that make up their address. The latter ensures that
addresses are always searchable by those address tags.

docs/develop/Postcodes.md (deleted file, 45 lines)

@@ -1,45 +0,0 @@
# Postcodes in Nominatim

The blog post
[Nominatim and Postcodes](https://www.openstreetmap.org/user/lonvia/diary/43143)
describes the handling implemented since Nominatim 3.1.

Postcode centroids (aka 'calculated postcodes') are generated by looking at all
postcodes of a country, grouping them and calculating the geometric centroid.
There is currently no logic to deal with extreme outliers (typos or other
mistakes in OSM data). There is also no check if a postcode adheres to a
country's format, e.g. if Swiss postcodes are 4 digits.

## Regularly updating calculated postcodes

The script to rerun the calculation is
`nominatim refresh --postcodes`
and runs once per night on nominatim.openstreetmap.org.

## Finding places that share a specific postcode

In the Nominatim database run

```sql
SELECT address->'postcode' as pc,
       osm_type, osm_id, class, type,
       st_x(centroid) as lon, st_y(centroid) as lat
  FROM placex
 WHERE country_code='fr'
   AND upper(trim(both ' ' from address->'postcode')) = '33210';
```

Alternatively, on [Overpass](https://overpass-turbo.eu/) run the following query

```
[out:json][timeout:250];
area["name"="France"]->.boundaryarea;
(
nwr(area.boundaryarea)["addr:postcode"="33210"];
);
out body;
>;
out skel qt;
```

docs/develop/Tokenizers.md (new file, 332 lines)

# Tokenizers

The tokenizer is the component of Nominatim that is responsible for
analysing names of OSM objects and queries. Nominatim provides different
tokenizers that use different strategies for normalisation. This page describes
how tokenizers are expected to work and the public API that needs to be
implemented when creating a new tokenizer. For information on how to configure
a specific tokenizer for a database see the
[tokenizer chapter in the Customization Guide](../customize/Tokenizers.md).

## Generic Architecture

### About Search Tokens

Search in Nominatim is organised around search tokens. Such a token represents
a string that can be part of the search query. Tokens are used so that the search
index does not need to be organised around strings. Instead the database saves
for each place which tokens match this place's name, address, house number etc.
To be able to distinguish between these different types of information stored
with the place, a search token also always has a certain type: name, house number,
postcode etc.

During search an incoming query is transformed into an ordered list of such
search tokens (or rather many lists, see below) and this list is then converted
into a database query to find the right place.

It is the core task of the tokenizer to create, manage and assign the search
tokens. The tokenizer is involved in two distinct operations:

* __at import time__: scanning names of OSM objects, normalizing them and
  building up the list of search tokens.
* __at query time__: scanning the query and returning the appropriate search
  tokens.

### Importing

The indexer is responsible for enriching an OSM object (or place) with all data
required for geocoding. It is split into two parts: the collector gathers
the places that require updating, enriches the place information as required
and hands the place to PostgreSQL. The collector is part of the Nominatim
library written in Python. Within PostgreSQL, the `placex_update`
trigger is responsible for filling out all secondary tables with extra geocoding
information. This part is written in PL/pgSQL.

The tokenizer is involved in both parts. When the indexer prepares a place,
it hands it over to the tokenizer to inspect the names and create all the
search tokens applicable for the place. This usually involves updating the
tokenizer's internal token lists and creating a list of all token IDs for
the specific place. This list is later needed in the PL/pgSQL part where the
indexer needs to add the token IDs to the appropriate search tables. To be
able to communicate the list between the Python part and the PL/pgSQL trigger,
the `placex` table contains a special JSONB column `token_info` which is there
for the exclusive use of the tokenizer.

The Python part of the tokenizer returns structured information about the
tokens of a place to the indexer, which converts it to JSON and inserts it into
the `token_info` column. The content of the column is then handed to the PL/pgSQL
callbacks of the tokenizer which extract the required information. Usually
the tokenizer then removes all information from the `token_info` structure,
so that no information is ever persistently saved in the table. All information
that went in should have been processed after all and put into secondary tables.
This is however not a hard requirement. If the tokenizer needs to store
additional information about a place permanently, it may do so in the
`token_info` column. It just may never execute searches over it and
consequently not create any special indexes on it.

### Querying

At query time, Nominatim builds up multiple _interpretations_ of the search
query. Each of these interpretations is tried against the database in order
of the likelihood with which they match the search query. The first
interpretation that yields results wins.

The interpretations are encapsulated in the `SearchDescription` class. An
instance of this class is created by applying a sequence of
_search tokens_ to an initially empty SearchDescription. It is the
responsibility of the tokenizer to parse the search query and derive all
possible sequences of search tokens. To that end the tokenizer needs to parse
the search query and look up matching words in its own data structures.

## Tokenizer API

The following section describes the functions that need to be implemented
for a custom tokenizer implementation.

!!! warning
    This API is currently in early alpha status. While this API is meant to
    be a public API on which other tokenizers may be implemented, the API is
    far away from being stable at the moment.

### Directory Structure

Nominatim expects two files for a tokenizer:

* `nominatim/tokenizer/<NAME>_tokenizer.py` containing the Python part of the
  implementation
* `lib-php/tokenizer/<NAME>_tokenizer.php` with the PHP part of the
  implementation

where `<NAME>` is a unique name for the tokenizer consisting of only lower-case
letters, digits and underscores. A tokenizer also needs to install some SQL
functions. By convention, these should be placed in `lib-sql/tokenizer`.

If the tokenizer has a default configuration file, this should be saved as
`settings/<NAME>_tokenizer.<SUFFIX>`.

### Configuration and Persistence

Tokenizers may define custom settings for their configuration. All settings
must be prefixed with `NOMINATIM_TOKENIZER_`. Settings may be transient or
persistent. Transient settings are loaded from the configuration file when
Nominatim is started and may thus be changed at any time. Persistent settings
are tied to a database installation and must only be read during installation
time. If they are needed for the runtime then they must be saved into the
`nominatim_properties` table and later loaded from there.
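
A sketch of how a persistent setting could be written at installation time,
assuming the usual two-column (property, value) layout of
`nominatim_properties`; the property name is made up for the example:

```sql
-- Save a tokenizer setting so that it survives later configuration changes.
INSERT INTO nominatim_properties (property, value)
     VALUES ('tokenizer_example_setting', 'some-value');
```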

### The Python module

The Python module is expected to export a single factory function:

```python
def create(dsn: str, data_dir: Path) -> AbstractTokenizer
```

The `dsn` parameter contains the DSN of the Nominatim database. The `data_dir`
is a directory in the project directory that the tokenizer may use to save
database-specific data. The function must return the instance of the tokenizer
class as defined below.

### Python Tokenizer Class

All tokenizers must inherit from `nominatim.tokenizer.base.AbstractTokenizer`
and implement the abstract functions defined there.

::: nominatim.tokenizer.base.AbstractTokenizer
    rendering:
        heading_level: 4

### Python Analyzer Class

::: nominatim.tokenizer.base.AbstractAnalyzer
    rendering:
        heading_level: 4

### PL/pgSQL Functions

The tokenizer must provide access functions for the `token_info` column
to the indexer which extracts the necessary information for the global
search tables. If the tokenizer needs additional SQL functions for private
use, then these functions must be prefixed with `token_` in order to ensure
that there are no naming conflicts with the SQL indexer code.

The following functions are expected:

```sql
FUNCTION token_get_name_search_tokens(info JSONB) RETURNS INTEGER[]
```

Return an array of token IDs of search terms that should match
the name(s) for the given place. These tokens are used to look up the place
by name and, where the place functions as part of an address for another place,
by address. Must return NULL when the place has no name.
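
For illustration, inside the update trigger such a function might be used
roughly like this (a sketch, not actual Nominatim trigger code):

```sql
-- Copy the name tokens of the place being indexed into the search table.
UPDATE search_name
   SET name_vector = token_get_name_search_tokens(NEW.token_info)
 WHERE place_id = NEW.place_id;
```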

```sql
FUNCTION token_get_name_match_tokens(info JSONB) RETURNS INTEGER[]
```

Return an array of token IDs of full names of the place that should be used
to match addresses. The list of match tokens is usually more strict than
search tokens as it is used to find a match between two OSM tag values which
are expected to contain matching full names. Partial terms should not be
used for match tokens. Must return NULL when the place has no name.

```sql
FUNCTION token_get_housenumber_search_tokens(info JSONB) RETURNS INTEGER[]
```

Return an array of token IDs of house number tokens that apply to the place.
Note that a place may have multiple house numbers, for example when apartments
each have their own number. Must be NULL when the place has no house numbers.

```sql
FUNCTION token_normalized_housenumber(info JSONB) RETURNS TEXT
```

Return the house number(s) in the normalized form that can be matched against
a house number token text. If a place has multiple house numbers they must
be listed with a semicolon as delimiter. Must be NULL when the place has no
house numbers.

```sql
FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) RETURNS BOOLEAN
```

Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
match against the `addr:street` tag name. Must return either NULL or FALSE
when the place has no `addr:street` tag.

```sql
FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) RETURNS BOOLEAN
```

Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
match against the `addr:place` tag name. Must return either NULL or FALSE
when the place has no `addr:place` tag.

```sql
FUNCTION token_addr_place_search_tokens(info JSONB) RETURNS INTEGER[]
```

Return the search token IDs extracted from the `addr:place` tag. These tokens
are used for searches by address when no matching place can be found in the
database. Must be NULL when the place has no `addr:place` tag.

```sql
FUNCTION token_get_address_keys(info JSONB) RETURNS SETOF TEXT
```

Return the set of keys for which address information is provided. This
should correspond to the list of (relevant) `addr:*` tags with the `addr:`
prefix removed or the keys used in the `address` dictionary of the place info.

```sql
FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) RETURNS INTEGER[]
```

Return the array of search tokens for the given address part. `key` can be
expected to be one of those returned with `token_get_address_keys()`. The
search tokens are added to the address search vector of the place when no
corresponding OSM object could be found for the given address part from which
to copy the name information.

```sql
FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
```

Check if the given tokens match against the address part `key`.

__Warning:__ the tokens that are handed in are the lists previously saved
from `token_get_name_search_tokens()`, _not_ from the match token list. This
is a historical oddity which will be fixed at some point in the future.
Currently, tokenizers are encouraged to make sure that matching works against
both the search token list and the match token list.

```sql
FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT
```

Return the normalized version of the given postcode. This function must return
the same value as the Python function `AbstractAnalyzer->normalize_postcode()`.

```sql
FUNCTION token_strip_info(info JSONB) RETURNS JSONB
```

Return the part of the `token_info` field that should be stored in the database
permanently. The indexer calls this function when all processing is done and
replaces the content of the `token_info` column with the returned value before
the trigger stores the information in the database. May return NULL if no
information should be stored permanently.
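
For a tokenizer that keeps nothing around after indexing, this can be trivial;
a minimal sketch:

```sql
-- A token_strip_info() for a tokenizer that persists no data at all.
CREATE OR REPLACE FUNCTION token_strip_info(info JSONB) RETURNS JSONB
AS $$
    SELECT NULL::JSONB;
$$ LANGUAGE SQL IMMUTABLE;
```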

### PHP Tokenizer class

The PHP tokenizer class is instantiated once per request and is responsible for
analyzing the incoming query. Multiple requests may be in flight in
parallel.

The class is expected to be found under the
name of `\Nominatim\Tokenizer`. To find the class, the PHP code includes the file
`tokenizer/tokenizer.php` in the project directory. This file must be created
when the tokenizer is first set up on import. The file should initialize any
configuration variables by setting PHP constants and then require the file
with the actual implementation of the tokenizer.

The tokenizer class must implement the following functions:

```php
public function __construct(object &$oDB)
```

The constructor of the class receives a database connection that can be used
to query persistent data in the database.

```php
public function checkStatus()
```

Check that the tokenizer can access its persistent data structures. If there
is an issue, throw an `\Exception`.

```php
public function normalizeString(string $sTerm) : string
```

Normalize a string to the form used for comparisons when reordering results.
Nominatim reweights results by how well the final display string matches the
actual query. Before comparing result and query, names and query are normalised
against this function. The tokenizer can thus remove all properties that should
not be taken into account for reweighting, e.g. special characters or case.

```php
public function tokensForSpecialTerm(string $sTerm) : array
```

Return the list of special term tokens that match the given term.

```php
public function extractTokensFromPhrases(array &$aPhrases) : TokenList
```

Parse the given phrases, splitting them into word lists and retrieving the
matching tokens.

The phrase array may take on two forms. In unstructured searches (using the `q=`
parameter) the search query is split at the commas and the elements are
put into a sorted list. For structured searches the phrase array is an
associative array where the key designates the type of the term (street, city,
county etc.). The tokenizer may ignore the phrase type at this stage in parsing.
Matching phrase type and appropriate search token type will be done later
when the SearchDescription is built.

For each phrase in the list of phrases, the function must analyse the phrase
string and then call `setWordSets()` to communicate the result of the analysis.
A word set is a list of strings, where each string refers to a search token.
A phrase may have multiple interpretations. Therefore a list of word sets is
usually attached to the phrase. The search tokens themselves are returned
by the function in an associative array, where the key corresponds to the
strings given in the word sets. The value is a list of search tokens. Thus
a single string in the list of word sets may refer to multiple search tokens.

docs/develop/address-tables.plantuml (new file, 35 lines)

@startuml
skinparam monochrome true
skinparam ObjectFontStyle bold

map search_name_X {
    place_id => BIGINT
    address_rank => SMALLINT
    name_vector => INT[]
    centroid => GEOMETRY
}

map location_area_large_X {
    place_id => BIGINT
    keywords => INT[]
    partition => SMALLINT
    rank_search => SMALLINT
    rank_address => SMALLINT
    country_code => VARCHAR(2)
    isguess => BOOLEAN
    postcode => TEXT
    centroid => POINT
    geometry => GEOMETRY
}

map location_road_X {
    place_id => BIGINT
    partition => SMALLINT
    country_code => VARCHAR(2)
    geometry => GEOMETRY
}

search_name_X -[hidden]> location_area_large_X
location_area_large_X -[hidden]> location_road_X

@enduml

docs/develop/address-tables.svg (new file, 47 lines; image diff suppressed, 11 KiB)
docs/develop/osm2pgsql-tables.plantuml (new file, 44 lines)

@startuml
skinparam monochrome true
skinparam ObjectFontStyle bold

map planet_osm_nodes #eee {
    id => BIGINT
    lat => INT
    lon => INT
}

map planet_osm_ways #eee {
    id => BIGINT
    nodes => BIGINT[]
    tags => TEXT[]
}

map planet_osm_rels #eee {
    id => BIGINT
    parts => BIGINT[]
    members => TEXT[]
    tags => TEXT[]
    way_off => SMALLINT
    rel_off => SMALLINT
}

map place {
    osm_type => CHAR(1)
    osm_id => BIGINT
    class => TEXT
    type => TEXT
    name => HSTORE
    address => HSTORE
    extratags => HSTORE
    admin_level => SMALLINT
    geometry => GEOMETRY
}

planet_osm_nodes -[hidden]> planet_osm_ways
planet_osm_ways -[hidden]> planet_osm_rels
planet_osm_ways -[hidden]-> place

planet_osm_nodes::id <- planet_osm_ways::nodes

@enduml

docs/develop/osm2pgsql-tables.svg (new file, 58 lines; image diff suppressed, 13 KiB)
docs/develop/parenting-flow.plantuml (new file, 31 lines)

@startuml
skinparam monochrome true

start

if (has 'addr:street'?) then (yes)
  if (street with that name\n nearby?) then (yes)
    :**Use closest street**
    **with same name**;
    kill
  else (no)
    :**Use closest**\n**street**;
    kill
  endif
elseif (has 'addr:place'?) then (yes)
  if (place with that name\n nearby?) then (yes)
    :**Use closest place**
    **with same name**;
    kill
  else (no)
    :add addr:place to address;
    :**Use closest place**\n**rank 16 to 25**;
    kill
  endif
else (otherwise)
  :**Use closest**\n**street**;
  kill
endif

@enduml

docs/develop/parenting-flow.svg (new file, 41 lines; image diff suppressed, 9.8 KiB)
docs/develop/search-tables.plantuml (new file, 99 lines)

@startuml
skinparam monochrome true
skinparam ObjectFontStyle bold

left to right direction

map placex {
    place_id => BIGINT
    osm_type => CHAR(1)
    osm_id => BIGINT
    class => TEXT
    type => TEXT
    name => HSTORE
    address => HSTORE
    extratags => HSTORE
    admin_level => SMALLINT
    partition => SMALLINT
    geometry_sector => INT
    parent_place_id => BIGINT
    linked_place_id => BIGINT
    importance => DOUBLE
    rank_search => SMALLINT
    rank_address => SMALLINT
    wikipedia => TEXT
    country_code => VARCHAR(2)
    housenumber => TEXT
    postcode => TEXT
    indexed_status => SMALLINT
    indexed_date => TIMESTAMP
    centroid => GEOMETRY
    geometry => GEOMETRY
}

map search_name {
    place_id => BIGINT
    importance => DOUBLE
    search_rank => SMALLINT
    address_rank => SMALLINT
    name_vector => INT[]
    nameaddress_vector => INT[]
    country_code => VARCHAR(2)
    centroid => GEOMETRY
}

map word {
    word_id => INT
    word_token => TEXT
    ... =>
}

map location_property_osmline {
    place_id => BIGINT
    osm_id => BIGINT
    startnumber => INT
    endnumber => INT
    interpolationtype => TEXT
    address => HSTORE
    partition => SMALLINT
    geometry_sector => INT
    parent_place_id => BIGINT
    country_code => VARCHAR(2)
    postcode => TEXT
    indexed_status => SMALLINT
    indexed_date => TIMESTAMP
    linegeo => GEOMETRY
}

map place_addressline {
    place_id => BIGINT
    address_place_id => BIGINT
    distance => DOUBLE
    cached_rank_address => SMALLINT
    fromarea => BOOLEAN
    isaddress => BOOLEAN
}

map location_postcode {
    place_id => BIGINT
    postcode => TEXT
    parent_place_id => BIGINT
    rank_search => SMALLINT
    rank_address => SMALLINT
    indexed_status => SMALLINT
    indexed_date => TIMESTAMP
    geometry => GEOMETRY
}

placex::place_id <-- search_name::place_id
placex::place_id <-- place_addressline::place_id
placex::place_id <-- place_addressline::address_place_id

search_name::name_vector --> word::word_id
search_name::nameaddress_vector --> word::word_id

place_addressline -[hidden]> location_property_osmline
search_name -[hidden]> place_addressline
location_property_osmline -[hidden]-> location_postcode

@enduml

docs/develop/search-tables.svg (new file, 117 lines; image diff suppressed, 35 KiB)
@@ -13,3 +13,11 @@ th, td {
 th {
     background-color: #eee;
 }
+
+/* Indentation for mkdocstrings.
+div.doc-contents:not(.first) {
+    padding-left: 25px;
+    border-left: 4px solid rgba(230, 230, 230);
+    margin-bottom: 60px;
+}*/
+
@@ -1,8 +1,10 @@
 Nominatim (from the Latin, 'by name') is a tool to search OSM data by name and address and to generate synthetic addresses of OSM points (reverse geocoding).
 
-This guide comes in three parts:
+This guide comes in four parts:
 
 * __[API reference](api/Overview.md)__ for users of Nominatim
 * __[Administration Guide](admin/Installation.md)__ for those who want
   to install their own Nominatim server
+* __[Customization Guide](customize/Overview.md)__ for those who want to
+  adapt their own installation to their special requirements
 * __[Developer's Guide](develop/overview.md)__ for developers of the software
@@ -19,18 +19,26 @@ pages:
     - 'Import' : 'admin/Import.md'
     - 'Update' : 'admin/Update.md'
     - 'Deploy' : 'admin/Deployment.md'
-    - 'Customize Imports' : 'admin/Customization.md'
-    - 'Tokenizers' : 'admin/Tokenizers.md'
     - 'Nominatim UI' : 'admin/Setup-Nominatim-UI.md'
     - 'Advanced Installations' : 'admin/Advanced-Installations.md'
     - 'Maintenance' : 'admin/Maintenance.md'
     - 'Migration from older Versions' : 'admin/Migration.md'
     - 'Troubleshooting' : 'admin/Faq.md'
+- 'Customization Guide':
+    - 'Overview': 'customize/Overview.md'
+    - 'Import Styles': 'customize/Import-Styles.md'
+    - 'Configuration Settings': 'customize/Settings.md'
+    - 'Place Ranking' : 'customize/Ranking.md'
+    - 'Tokenizers' : 'customize/Tokenizers.md'
+    - 'Special Phrases': 'customize/Special-Phrases.md'
+    - 'External data: US housenumbers from TIGER': 'customize/Tiger.md'
+    - 'External data: Postcodes': 'customize/Postcodes.md'
 - 'Developers Guide':
-    - 'Setup for Development' : 'develop/Development-Environment.md'
-    - 'OSM Data Import' : 'develop/Import.md'
-    - 'Place Ranking' : 'develop/Ranking.md'
-    - 'Postcodes' : 'develop/Postcodes.md'
+    - 'Architecture Overview' : 'develop/overview.md'
+    - 'Database Layout' : 'develop/Database-Layout.md'
+    - 'Indexing' : 'develop/Indexing.md'
+    - 'Tokenizers' : 'develop/Tokenizers.md'
+    - 'Setup for Development' : 'develop/Development-Environment.md'
     - 'Testing' : 'develop/Testing.md'
     - 'External Data Sources': 'develop/data-sources.md'
 - 'Appendix':
@@ -41,6 +49,15 @@ pages:
 markdown_extensions:
     - codehilite
     - admonition
+    - def_list
     - toc:
         permalink:
 extra_css: [extra.css, styles.css]
+plugins:
+    - search
+    - mkdocstrings:
+        handlers:
+          python:
+            rendering:
+              show_source: false
+              show_signature_annotations: false
@@ -127,7 +127,7 @@ class Debug
 
     public static function printSQL($sSQL)
     {
-        echo '<p><tt><font color="#aaa">'.$sSQL.'</font></tt></p>'."\n";
+        echo '<p><tt><font color="#aaa">'.htmlspecialchars($sSQL, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401).'</font></tt></p>'."\n";
     }
 
     private static function outputVar($mVar, $sPreNL)
@@ -170,11 +170,12 @@ class Debug
         }
 
         if (is_string($mVar)) {
-            echo "'$mVar'";
-            return strlen($mVar) + 2;
+            $sOut = "'$mVar'";
+        } else {
+            $sOut = (string)$mVar;
         }
 
-        echo (string)$mVar;
-        return strlen((string)$mVar);
+        echo htmlspecialchars($sOut, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401);
+        return strlen($sOut);
     }
 }
@@ -498,7 +498,6 @@ class Geocode
         if ($this->aCountryCodes) {
             $oCtx->setCountryList($this->aCountryCodes);
         }
-        $this->oTokenizer->setCountryRestriction($this->aCountryCodes);
 
         Debug::newSection('Query Preprocessing');
 
@@ -507,13 +506,6 @@ class Geocode
             userError('Query string is not UTF-8 encoded.');
         }
 
-        // Conflicts between US state abreviations and various words for 'the' in different languages
-        if (isset($this->aLangPrefOrder['name:en'])) {
-            $sQuery = preg_replace('/(^|,)\s*il\s*(,|$)/i', '\1illinois\2', $sQuery);
-            $sQuery = preg_replace('/(^|,)\s*al\s*(,|$)/i', '\1alabama\2', $sQuery);
-            $sQuery = preg_replace('/(^|,)\s*la\s*(,|$)/i', '\1louisiana\2', $sQuery);
-        }
-
         // Do we have anything that looks like a lat/lon pair?
         $sQuery = $oCtx->setNearPointFromQuery($sQuery);
@@ -9,29 +9,14 @@ namespace Nominatim;
  */
 class Phrase
 {
-    const MAX_WORDSET_LEN = 20;
-    const MAX_WORDSETS = 100;
-
-    // Complete phrase as a string.
+    // Complete phrase as a string (guaranteed to have no leading or trailing
+    // spaces).
     private $sPhrase;
     // Element type for structured searches.
     private $sPhraseType;
     // Possible segmentations of the phrase.
     private $aWordSets;
 
-    public static function cmpByArraylen($aA, $aB)
-    {
-        $iALen = count($aA);
-        $iBLen = count($aB);
-
-        if ($iALen == $iBLen) {
-            return 0;
-        }
-
-        return ($iALen < $iBLen) ? -1 : 1;
-    }
-
-
     public function __construct($sPhrase, $sPhraseType)
     {
         $this->sPhrase = trim($sPhrase);
@@ -57,6 +42,11 @@ class Phrase
         return $this->sPhraseType;
     }
 
+    public function setWordSets($aWordSets)
+    {
+        $this->aWordSets = $aWordSets;
+    }
+
     /**
      * Return the array of possible segmentations of the phrase.
      *
@@ -80,61 +70,6 @@ class Phrase
         }
     }
 
-    public function computeWordSets($aWords, $oTokens)
-    {
-        $iNumWords = count($aWords);
-
-        if ($iNumWords == 0) {
-            $this->aWordSets = null;
-            return;
-        }
-
-        // Caches the word set for the partial phrase up to word i.
-        $aSetCache = array_fill(0, $iNumWords, array());
-
-        // Initialise first element of cache. There can only be the word.
-        if ($oTokens->containsAny($aWords[0])) {
-            $aSetCache[0][] = array($aWords[0]);
-        }
-
-        // Now do the next elements using what we already have.
-        for ($i = 1; $i < $iNumWords; $i++) {
-            for ($j = $i; $j > 0; $j--) {
-                $sPartial = $j == $i ? $aWords[$j] : $aWords[$j].' '.$sPartial;
-                if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
-                    $aPartial = array($sPartial);
-                    foreach ($aSetCache[$j - 1] as $aSet) {
-                        if (count($aSet) < Phrase::MAX_WORDSET_LEN) {
-                            $aSetCache[$i][] = array_merge($aSet, $aPartial);
-                        }
-                    }
-                    if (count($aSetCache[$i]) > 2 * Phrase::MAX_WORDSETS) {
-                        usort(
-                            $aSetCache[$i],
-                            array('\Nominatim\Phrase', 'cmpByArraylen')
-                        );
-                        $aSetCache[$i] = array_slice(
-                            $aSetCache[$i],
-                            0,
-                            Phrase::MAX_WORDSETS
-                        );
-                    }
-                }
-            }
-
-            // finally the current full phrase
-            $sPartial = $aWords[0].' '.$sPartial;
-            if ($oTokens->containsAny($sPartial)) {
-                $aSetCache[$i][] = array($sPartial);
-            }
-        }
-
-        $this->aWordSets = $aSetCache[$iNumWords - 1];
-        usort($this->aWordSets, array('\Nominatim\Phrase', 'cmpByArraylen'));
-        $this->aWordSets = array_slice($this->aWordSets, 0, Phrase::MAX_WORDSETS);
-    }
-
-
     public function debugInfo()
     {
         return array(
@@ -111,6 +111,7 @@ class ReverseGeocode
         $sSQL .= ' FROM placex';
         $sSQL .= ' WHERE osm_type = \'N\'';
         $sSQL .= ' AND country_code = \''.$sCountryCode.'\'';
+        $sSQL .= ' AND rank_search < 26 '; // needed to select right index
         $sSQL .= ' AND rank_search between 5 and ' .min(25, $iMaxRank);
         $sSQL .= ' AND class = \'place\' AND type != \'postcode\'';
         $sSQL .= ' AND name IS NOT NULL ';
@@ -206,6 +207,7 @@ class ReverseGeocode
             // for place nodes at rank_address 16
             $sSQL .= ' AND rank_search > '.$iRankSearch;
             $sSQL .= ' AND rank_search <= '.$iMaxRank;
+            $sSQL .= ' AND rank_search < 26 '; // needed to select right index
             $sSQL .= ' AND rank_address > 0';
             $sSQL .= ' AND class = \'place\'';
            $sSQL .= ' AND type != \'postcode\'';
@@ -28,6 +28,8 @@ class SearchContext
     public $sqlViewboxLarge = '';
     /// Reference along a route (as SQL).
     public $sqlViewboxCentre = '';
+    /// List of countries to restrict search to (as array).
+    public $aCountryList = null;
     /// List of countries to restrict search to (as SQL).
     public $sqlCountryList = '';
     /// List of place IDs to exclude (as SQL).
@@ -187,6 +189,7 @@ class SearchContext
     public function setCountryList($aCountries)
     {
         $this->sqlCountryList = '('.join(',', array_map('addQuotes', $aCountries)).')';
+        $this->aCountryList = $aCountries;
     }
 
     /**
@@ -279,6 +282,19 @@ class SearchContext
         return '';
     }
 
+    /**
+     * Check if the given country is covered by the search context.
+     *
+     * @param string $sCountryCode  Country code of the country to check.
+     *
+     * @return True, if no country code restrictions are set or the
+     *         country is included in the country list.
+     */
+    public function isCountryApplicable($sCountryCode)
+    {
+        return $this->aCountryList === null || in_array($sCountryCode, $this->aCountryList);
+    }
+
     public function debugInfo()
     {
         return array(
@@ -19,6 +19,8 @@ class SearchDescription
     private $aName = array();
     /// True if the name is rare enough to force index use on name.
     private $bRareName = false;
+    /// True if the name requires to be accompanied by address terms.
+    private $bNameNeedsAddress = false;
     /// List of word ids making up the address of the object.
     private $aAddress = array();
     /// List of word ids that appear in the name but should be ignored.
@@ -113,6 +115,9 @@ class SearchDescription
                 return false;
             }
         }
+        if ($this->bNameNeedsAddress && empty($this->aAddress)) {
+            return false;
+        }
 
         return true;
     }
@@ -231,6 +236,7 @@ class SearchDescription
     {
         $this->aName[$iId] = $iId;
         $this->bRareName = $bRareName;
+        $this->bNameNeedsAddress = false;
     }
 
     /**
@@ -240,11 +246,19 @@ class SearchDescription
      * @param integer iID  ID of term to add.
      * @param bool bSearchable  Term should be used to search for result
      *                          (i.e. term is not a stop word).
+     * @param bool bNeedsAddress  True if the term is too unspecific to be used
+     *                            in a stand-alone search without an address
+     *                            to narrow down the search.
      * @param integer iPhraseNumber  Index of phrase, where the partial term
      *                               appears.
      */
-    public function addPartialNameToken($iId, $bSearchable, $iPhraseNumber)
+    public function addPartialNameToken($iId, $bSearchable, $bNeedsAddress, $iPhraseNumber)
     {
+        if (empty($this->aName)) {
+            $this->bNameNeedsAddress = $bNeedsAddress;
+        } else {
+            $this->bNameNeedsAddress &= $bNeedsAddress;
+        }
         if ($bSearchable) {
             $this->aName[$iId] = $iId;
         } else {
@@ -310,6 +324,7 @@ class SearchDescription
     {
         $this->aAddress = array_merge($this->aAddress, $this->aName);
         $this->bRareName = false;
+        $this->bNameNeedsAddress = true;
         $this->aName = array($iId => $iId);
         $this->iNamePhrase = -1;
     }
@@ -566,32 +581,37 @@ class SearchDescription
 
         // Sort by existence of the requested house number but only if not
         // too many results are expected for the street, i.e. if the result
-        // will be narrowed down by an address. Remeber that with ordering
+        // will be narrowed down by an address. Remember that with ordering
         // every single result has to be checked.
         if ($this->sHouseNumber && ($this->bRareName || !empty($this->aAddress) || $this->sPostcode)) {
-            $sHouseNumberRegex = '\\\\m'.$this->sHouseNumber.'\\\\M';
-            $aOrder[] = ' (';
-            $aOrder[0] .= 'EXISTS(';
-            $aOrder[0] .= ' SELECT place_id';
-            $aOrder[0] .= ' FROM placex';
-            $aOrder[0] .= ' WHERE parent_place_id = search_name.place_id';
-            $aOrder[0] .= " AND housenumber ~* E'".$sHouseNumberRegex."'";
-            $aOrder[0] .= ' LIMIT 1';
-            $aOrder[0] .= ') ';
-            // also housenumbers from interpolation lines table are needed
-            if (preg_match('/[0-9]+/', $this->sHouseNumber)) {
-                $iHouseNumber = intval($this->sHouseNumber);
-                $aOrder[0] .= 'OR EXISTS(';
-                $aOrder[0] .= ' SELECT place_id ';
-                $aOrder[0] .= ' FROM location_property_osmline ';
-                $aOrder[0] .= ' WHERE parent_place_id = search_name.place_id';
-                $aOrder[0] .= ' AND startnumber is not NULL';
-                $aOrder[0] .= ' AND '.$iHouseNumber.'>=startnumber ';
-                $aOrder[0] .= ' AND '.$iHouseNumber.'<=endnumber ';
-                $aOrder[0] .= ' LIMIT 1';
-                $aOrder[0] .= ')';
-            }
-            $aOrder[0] .= ') DESC';
+            $sHouseNumberRegex = $oDB->getDBQuoted('\\\\m'.$this->sHouseNumber.'\\\\M');
+
+            // Housenumbers on streets and places.
+            $sChildHnr = 'SELECT * FROM placex WHERE parent_place_id = search_name.place_id';
+            $sChildHnr .= ' AND housenumber ~* E'.$sHouseNumberRegex;
+            // Interpolations on streets and places.
+            if (preg_match('/^[0-9]+$/', $this->sHouseNumber)) {
+                $sIpolHnr = 'SELECT * FROM location_property_osmline ';
+                $sIpolHnr .= 'WHERE parent_place_id = search_name.place_id ';
+                $sIpolHnr .= ' AND startnumber is not NULL';
+                $sIpolHnr .= ' AND '.$this->sHouseNumber.'>=startnumber ';
+                $sIpolHnr .= ' AND '.$this->sHouseNumber.'<=endnumber ';
+            } else {
+                $sIpolHnr = false;
+            }
+            // Housenumbers on the object iteself for unlisted places.
+            $sSelfHnr = 'SELECT * FROM placex WHERE place_id = search_name.place_id';
+            $sSelfHnr .= ' AND housenumber ~* E'.$sHouseNumberRegex;
+
+            $sSql = '(CASE WHEN address_rank = 30 THEN EXISTS('.$sSelfHnr.') ';
+            $sSql .= ' ELSE EXISTS('.$sChildHnr.') ';
+            if ($sIpolHnr) {
+                $sSql .= 'OR EXISTS('.$sIpolHnr.') ';
+            }
+            $sSql .= 'END) DESC';
+
+            $aOrder[] = $sSql;
         }
 
         if (!empty($this->aName)) {
@@ -624,7 +644,7 @@ class SearchDescription
             $aOrder[] = $this->oContext->distanceSQL('centroid');
         } elseif ($this->sPostcode) {
             if (empty($this->aAddress)) {
-                $aTerms[] = "EXISTS(SELECT place_id FROM location_postcode p WHERE p.postcode = '".$this->sPostcode."' AND ST_DWithin(search_name.centroid, p.geometry, 0.1))";
+                $aTerms[] = "EXISTS(SELECT place_id FROM location_postcode p WHERE p.postcode = '".$this->sPostcode."' AND ST_DWithin(search_name.centroid, p.geometry, 0.12))";
             } else {
                 $aOrder[] = "(SELECT min(ST_Distance(search_name.centroid, p.geometry)) FROM location_postcode p WHERE p.postcode = '".$this->sPostcode."')";
             }
@@ -719,9 +739,9 @@ class SearchDescription
             return $aResults;
         }
 
-        $sHouseNumberRegex = '\\\\m'.$this->sHouseNumber.'\\\\M';
+        $sHouseNumberRegex = $oDB->getDBQuoted('\\\\m'.$this->sHouseNumber.'\\\\M');
         $sSQL = 'SELECT place_id FROM placex WHERE';
-        $sSQL .= " housenumber ~* E'".$sHouseNumberRegex."'";
+        $sSQL .= ' housenumber ~* E'.$sHouseNumberRegex;
         $sSQL .= ' AND ('.join(' OR ', $aIDCondition).')';
         $sSQL .= $this->oContext->excludeSQL(' AND place_id');
lib-php/SimpleWordList.php (new file, 131 lines)

<?php
|
||||
|
||||
namespace Nominatim;
|
||||
|
||||
/**
|
||||
* A word list creator based on simple splitting by space.
|
||||
*
|
||||
* Creates possible permutations of split phrases by finding all combination
|
||||
* of splitting the phrase on space boundaries.
|
||||
*/
|
||||
class SimpleWordList
|
||||
{
|
||||
const MAX_WORDSET_LEN = 20;
|
||||
const MAX_WORDSETS = 100;
|
||||
|
||||
// The phrase as a list of simple terms (without spaces).
|
||||
private $aWords;
|
||||
|
||||
/**
|
||||
* Create a new word list
|
||||
*
|
||||
* @param string sPhrase Phrase to create the word list from. The phrase is
|
||||
* expected to be normalised, so that there are no
|
||||
* subsequent spaces.
|
||||
*/
|
||||
public function __construct($sPhrase)
|
||||
{
|
||||
if (strlen($sPhrase) > 0) {
|
||||
$this->aWords = explode(' ', $sPhrase);
|
||||
} else {
|
||||
$this->aWords = array();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all possible tokens that are present in this word list.
|
||||
*
|
||||
* @return array The list of string tokens in the word list.
|
||||
*/
|
||||
public function getTokens()
|
||||
{
|
||||
$aTokens = array();
|
||||
$iNumWords = count($this->aWords);
|
||||
|
||||
for ($i = 0; $i < $iNumWords; $i++) {
|
||||
$sPhrase = $this->aWords[$i];
|
||||
$aTokens[$sPhrase] = $sPhrase;
|
||||
|
||||
for ($j = $i + 1; $j < $iNumWords; $j++) {
|
||||
$sPhrase .= ' '.$this->aWords[$j];
|
||||
$aTokens[$sPhrase] = $sPhrase;
|
||||
}
|
||||
}
|
||||
|
||||
return $aTokens;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute all possible permutations of phrase splits that result in
|
||||
* words which are in the token list.
|
||||
*/
|
||||
public function getWordSets($oTokens)
|
||||
{
|
||||
$iNumWords = count($this->aWords);
|
||||
|
||||
if ($iNumWords == 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Caches the word set for the partial phrase up to word i.
|
||||
$aSetCache = array_fill(0, $iNumWords, array());
|
||||
|
||||
// Initialise first element of cache. There can only be the word.
|
||||
if ($oTokens->containsAny($this->aWords[0])) {
|
||||
$aSetCache[0][] = array($this->aWords[0]);
|
||||
}
|
||||
|
||||
// Now do the next elements using what we already have.
|
||||
for ($i = 1; $i < $iNumWords; $i++) {
|
||||
for ($j = $i; $j > 0; $j--) {
|
||||
$sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;
|
||||
if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
|
||||
$aPartial = array($sPartial);
|
||||
foreach ($aSetCache[$j - 1] as $aSet) {
|
||||
if (count($aSet) < SimpleWordList::MAX_WORDSET_LEN) {
|
||||
$aSetCache[$i][] = array_merge($aSet, $aPartial);
|
||||
}
|
||||
}
|
||||
if (count($aSetCache[$i]) > 2 * SimpleWordList::MAX_WORDSETS) {
|
||||
usort(
|
||||
$aSetCache[$i],
|
||||
array('\Nominatim\SimpleWordList', 'cmpByArraylen')
|
||||
);
|
||||
$aSetCache[$i] = array_slice(
|
||||
$aSetCache[$i],
|
||||
0,
|
||||
SimpleWordList::MAX_WORDSETS
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// finally the current full phrase
|
||||
$sPartial = $this->aWords[0].' '.$sPartial;
|
||||
if ($oTokens->containsAny($sPartial)) {
|
||||
$aSetCache[$i][] = array($sPartial);
|
||||
}
|
||||
}
|
||||
|
||||
$aWordSets = $aSetCache[$iNumWords - 1];
|
||||
usort($aWordSets, array('\Nominatim\SimpleWordList', 'cmpByArraylen'));
|
||||
return array_slice($aWordSets, 0, SimpleWordList::MAX_WORDSETS);
|
||||
}
|
||||
|
||||
public static function cmpByArraylen($aA, $aB)
|
||||
{
|
||||
$iALen = count($aA);
|
||||
$iBLen = count($aB);
|
||||
|
||||
if ($iALen == $iBLen) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return ($iALen < $iBLen) ? -1 : 1;
|
||||
}
|
||||
|
||||
public function debugInfo()
|
||||
{
|
||||
return $this->aWords;
|
||||
}
|
||||
}
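To illustrate what the new class produces, here is a minimal driver sketch. It is not part of the commit; the require path and the AcceptAllTokens stub (standing in for the real token collection, which only needs a containsAny() method here) are assumptions for illustration.

<?php
// Hypothetical usage sketch for SimpleWordList; path is an assumption,
// normally CONST_LibDir and the autoloading are set up by Nominatim.
require_once('lib-php/SimpleWordList.php');

// Stand-in for the valid-token collection; accepts every candidate word.
class AcceptAllTokens
{
    public function containsAny($sWord)
    {
        return true;
    }
}

$oList = new \Nominatim\SimpleWordList('new york city');

// getTokens() yields every contiguous sub-phrase:
// 'new', 'new york', 'new york city', 'york', 'york city', 'city'
var_dump($oList->getTokens());

// getWordSets() yields all splits into known words, shortest split first
// and capped at MAX_WORDSETS, e.g.
// [['new york city'], ['new york', 'city'], ['new', 'york city'], ...]
var_dump($oList->getWordSets(new AcceptAllTokens()));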

@@ -36,7 +36,9 @@ class Country
 */
public function isExtendable($oSearch, $oPosition)
{
return !$oSearch->hasCountry() && $oPosition->maybePhrase('country');
return !$oSearch->hasCountry()
    && $oPosition->maybePhrase('country')
    && $oSearch->getContext()->isCountryApplicable($this->sCountryCode);
}

/**

@@ -58,8 +58,8 @@ class HouseNumber
// up of numbers, add a penalty
$iSearchCost = 1;
if (preg_match('/\\d/', $this->sToken) === 0
    || preg_match_all('/[^0-9]/', $this->sToken, $aMatches) > 2) {
$iSearchCost++;
    || preg_match_all('/[^0-9 ]/', $this->sToken, $aMatches) > 3) {
$iSearchCost += strlen($this->sToken) - 1;
}
if (!$oSearch->hasOperator(\Nominatim\Operator::NONE)) {
$iSearchCost++;
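The effect of the changed penalty is easier to see in isolation. The following sketch restates the new rule outside the class; the wrapper function is hypothetical, not part of the Nominatim API.

<?php
// Paraphrase of the new house-number cost rule above (illustrative only).
function houseNumberSearchCost($sToken)
{
    $iSearchCost = 1;
    if (preg_match('/\d/', $sToken) === 0
        || preg_match_all('/[^0-9 ]/', $sToken, $aMatches) > 3) {
        // Tokens that do not look like house numbers are now penalised
        // in proportion to their length instead of a flat +1.
        $iSearchCost += strlen($sToken) - 1;
    }
    return $iSearchCost;
}

echo houseNumberSearchCost('25a');   // 1: digits with a single letter
echo houseNumberSearchCost('main');  // 4: no digit at all, length penalty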

@@ -90,6 +90,7 @@ class Partial
$oNewSearch->addPartialNameToken(
    $this->iId,
    $this->iSearchNameCount < CONST_Max_Word_Frequency,
    $this->iSearchNameCount > CONST_Search_NameOnlySearchFrequencyThreshold,
    $oPosition->getPhrase()
);

@@ -44,7 +44,10 @@ class SpecialTerm
 */
public function isExtendable($oSearch, $oPosition)
{
return !$oSearch->hasOperator() && $oPosition->isPhrase('');
return !$oSearch->hasOperator()
    && $oPosition->isPhrase('')
    && ($this->iOperator != \Nominatim\Operator::NONE
        || (!$oSearch->hasAddress() && !$oSearch->hasHousenumber() && !$oSearch->hasCountry()));
}

/**

@@ -66,8 +69,8 @@ class SpecialTerm
$iOp = \Nominatim\Operator::NAME;
} else {
    $iOp = \Nominatim\Operator::NEAR;
    $iSearchCost += 2;
}
$iSearchCost += 2;
} elseif (!$oPosition->isFirstToken() && !$oPosition->isLastToken()) {
$iSearchCost += 2;
}

@@ -1,34 +0,0 @@
<?php
@define('CONST_LibDir', dirname(dirname(__FILE__)));

require_once(CONST_LibDir.'/init-cmd.php');

ini_set('memory_limit', '800M');
ini_set('display_errors', 'stderr');

$aCMDOptions = array(
    'Import country language data from osm wiki',
    array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
    array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
    array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
    array('project-dir', '', 0, 1, 1, 1, 'realpath', 'Base directory of the Nominatim installation (default: .)'),
);
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);

loadSettings($aCMDResult['project-dir'] ?? getcwd());
setupHTTPProxy();

if (true) {
    $sURL = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Country_Codes';
    $sWikiPageXML = file_get_contents($sURL);
    if (preg_match_all('#\\| ([a-z]{2}) \\|\\| [^|]+\\|\\| ([a-z,]+)#', $sWikiPageXML, $aMatches, PREG_SET_ORDER)) {
        foreach ($aMatches as $aMatch) {
            $aLanguages = explode(',', $aMatch[2]);
            foreach ($aLanguages as $i => $s) {
                $aLanguages[$i] = '"'.pg_escape_string($s).'"';
            }
            echo "UPDATE country_name set country_default_language_codes = '{".join(',', $aLanguages)."}' where country_code = '".pg_escape_string($aMatch[1])."';\n";
        }
    }
}

@@ -86,8 +86,13 @@ if (!$aResult['reverse-only']) {
if ($bVerbose) {
    echo "\n";
}

$oTokenizer = new \Nominatim\Tokenizer($oDB);

$aWords = $oTokenizer->mostFrequentWords(1000);

$sSQL = 'SELECT word FROM word WHERE word is not null ORDER BY search_name_count DESC LIMIT 1000';
foreach ($oDB->getCol($sSQL) as $sWord) {
foreach ($aWords as $sWord) {
    if ($bVerbose) {
        echo "$sWord = ";
    }

@@ -1,21 +0,0 @@
<?php

$phpPhraseSettingsFile = $argv[1];
$jsonPhraseSettingsFile = dirname($phpPhraseSettingsFile).'/'.basename($phpPhraseSettingsFile, '.php').'.json';

if (file_exists($phpPhraseSettingsFile) && !file_exists($jsonPhraseSettingsFile)) {
    include $phpPhraseSettingsFile;

    $data = array();

    if (isset($aTagsBlacklist)) {
        $data['blackList'] = $aTagsBlacklist;
    }
    if (isset($aTagsWhitelist)) {
        $data['whiteList'] = $aTagsWhitelist;
    }

    $jsonFile = fopen($jsonPhraseSettingsFile, 'w');
    fwrite($jsonFile, json_encode($data));
    fclose($jsonFile);
}

@@ -2,13 +2,14 @@

namespace Nominatim;

require_once(CONST_LibDir.'/SimpleWordList.php');

class Tokenizer
{
private $oDB;

private $oNormalizer;
private $oTransliterator;
private $aCountryRestriction;

public function __construct(&$oDB)
{

@@ -19,7 +20,7 @@ class Tokenizer

public function checkStatus()
{
$sSQL = 'SELECT word_id FROM word limit 1';
$sSQL = 'SELECT word_id FROM word WHERE word_id is not null limit 1';
$iWordID = $this->oDB->getOne($sSQL);
if ($iWordID === false) {
    throw new \Exception('Query failed', 703);

@@ -30,12 +31,6 @@ class Tokenizer
}

public function setCountryRestriction($aCountries)
{
    $this->aCountryRestriction = $aCountries;
}

public function normalizeString($sTerm)
{
if ($this->oNormalizer === null) {

@@ -45,6 +40,15 @@ class Tokenizer
return $this->oNormalizer->transliterate($sTerm);
}

public function mostFrequentWords($iNum)
{
    $sSQL = "SELECT word FROM word WHERE type = 'W'";
    $sSQL .= " ORDER BY info->'count' DESC LIMIT ".$iNum;
    return $this->oDB->getCol($sSQL);
}

private function makeStandardWord($sTerm)
{
return trim($this->oTransliterator->transliterate(' '.$sTerm.' '));

@@ -88,13 +92,10 @@ class Tokenizer
$sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
$sPhrase = $this->makeStandardWord($oPhrase->getPhrase());
Debug::printVar('Phrase', $sPhrase);
if (strlen($sPhrase) > 0) {
    $aWords = explode(' ', $sPhrase);
    Tokenizer::addTokens($aTokens, $aWords);
    $aWordLists[] = $aWords;
} else {
    $aWordLists[] = array();
}

$oWordList = new SimpleWordList($sPhrase);
$aTokens = array_merge($aTokens, $oWordList->getTokens());
$aWordLists[] = $oWordList;
}

Debug::printVar('Tokens', $aTokens);

@@ -103,7 +104,7 @@ class Tokenizer
$oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);

foreach ($aPhrases as $iPhrase => $oPhrase) {
$oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
$oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens));
}

return $oValidTokens;

@@ -162,10 +163,7 @@ class Tokenizer

switch ($aWord['type']) {
    case 'C':  // country name tokens
        if ($aWord['word'] !== null
            && (!$this->aCountryRestriction
                || in_array($aWord['word'], $this->aCountryRestriction))
        ) {
        if ($aWord['word'] !== null) {
            $oValidTokens->addToken(
                $sTok,
                new Token\Country($iId, $aWord['word'])

@@ -220,27 +218,4 @@ class Tokenizer
}
}
}

/**
 * Add the tokens from this phrase to the given list of tokens.
 *
 * @param string[] $aTokens List of tokens to append.
 *
 * @return void
 */
private static function addTokens(&$aTokens, $aWords)
{
    $iNumWords = count($aWords);

    for ($i = 0; $i < $iNumWords; $i++) {
        $sPhrase = $aWords[$i];
        $aTokens[$sPhrase] = $sPhrase;

        for ($j = $i + 1; $j < $iNumWords; $j++) {
            $sPhrase .= ' '.$aWords[$j];
            $aTokens[$sPhrase] = $sPhrase;
        }
    }
}
}

@@ -2,12 +2,13 @@

namespace Nominatim;

require_once(CONST_LibDir.'/SimpleWordList.php');

class Tokenizer
{
private $oDB;

private $oNormalizer = null;
private $aCountryRestriction = null;

public function __construct(&$oDB)
{

@@ -37,12 +38,6 @@ class Tokenizer
}

public function setCountryRestriction($aCountries)
{
    $this->aCountryRestriction = $aCountries;
}

public function normalizeString($sTerm)
{
if ($this->oNormalizer === null) {

@@ -53,6 +48,14 @@ class Tokenizer
}

public function mostFrequentWords($iNum)
{
    $sSQL = 'SELECT word FROM word WHERE word is not null ';
    $sSQL .= 'ORDER BY search_name_count DESC LIMIT '.$iNum;
    return $this->oDB->getCol($sSQL);
}

public function tokensForSpecialTerm($sTerm)
{
$aResults = array();

@@ -92,6 +95,23 @@ class Tokenizer
$sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
$sSQL .= 'make_standard_name(:'.$iPhrase.') as p'.$iPhrase.',';
$aParams[':'.$iPhrase] = $oPhrase->getPhrase();

// Conflicts between US state abbreviations and various words
// for 'the' in different languages
switch (strtolower($oPhrase->getPhrase())) {
    case 'il':
        $aParams[':'.$iPhrase] = 'illinois';
        break;
    case 'al':
        $aParams[':'.$iPhrase] = 'alabama';
        break;
    case 'la':
        $aParams[':'.$iPhrase] = 'louisiana';
        break;
    default:
        $aParams[':'.$iPhrase] = $oPhrase->getPhrase();
        break;
}
}
$sSQL = substr($sSQL, 0, -1);

@@ -106,13 +126,14 @@ class Tokenizer
$aWordLists = array();
$aTokens = array();
foreach ($aNormPhrases as $sPhrase) {
if (strlen($sPhrase) > 0) {
    $aWords = explode(' ', $sPhrase);
    Tokenizer::addTokens($aTokens, $aWords);
    $aWordLists[] = $aWords;
} else {
    $aWordLists[] = array();
$oWordList = new SimpleWordList($sPhrase);

foreach ($oWordList->getTokens() as $sToken) {
    $aTokens[' '.$sToken] = ' '.$sToken;
    $aTokens[$sToken] = $sToken;
}

$aWordLists[] = $oWordList;
}

Debug::printVar('Tokens', $aTokens);

@@ -121,7 +142,7 @@ class Tokenizer
$oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);

foreach ($aPhrases as $iPhrase => $oPhrase) {
$oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
$oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens));
}

return $oValidTokens;

@@ -206,12 +227,7 @@ class Tokenizer
);
}
} elseif ($aWord['country_code']) {
    // Filter country tokens that do not match restricted countries.
    if (!$this->aCountryRestriction
        || in_array($aWord['country_code'], $this->aCountryRestriction)
    ) {
        $oToken = new Token\Country($iId, $aWord['country_code']);
    }
    $oToken = new Token\Country($iId, $aWord['country_code']);
} elseif ($aWord['word_token'][0] == ' ') {
    $oToken = new Token\Word(
        $iId,

@@ -238,29 +254,4 @@ class Tokenizer
}
}
}

/**
 * Add the tokens from this phrase to the given list of tokens.
 *
 * @param string[] $aTokens List of tokens to append.
 *
 * @return void
 */
private static function addTokens(&$aTokens, $aWords)
{
    $iNumWords = count($aWords);

    for ($i = 0; $i < $iNumWords; $i++) {
        $sPhrase = $aWords[$i];
        $aTokens[' '.$sPhrase] = ' '.$sPhrase;
        $aTokens[$sPhrase] = $sPhrase;

        for ($j = $i + 1; $j < $iNumWords; $j++) {
            $sPhrase .= ' '.$aWords[$j];
            $aTokens[' '.$sPhrase] = ' '.$sPhrase;
            $aTokens[$sPhrase] = $sPhrase;
        }
    }
}
}
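The doubled token keys in the legacy code above are easiest to see with a concrete phrase. In the following sketch, the interpretation that the leading-space variants correspond to full-word entries in the legacy word table (whose word_token values carry a space prefix) while the bare variants match partial words is an assumption based on the legacy tokenizer's conventions.

<?php
// Sketch of the token keys generated for the phrase 'rose street',
// mirroring the order produced by the legacy addTokens()/getTokens() path.
$aTokens = array();
foreach (array('rose', 'rose street', 'street') as $sPhrase) {
    $aTokens[' '.$sPhrase] = ' '.$sPhrase;  // full-word lookup key
    $aTokens[$sPhrase] = $sPhrase;          // partial-word lookup key
}
print_r(array_keys($aTokens));
// => ' rose', 'rose', ' rose street', 'rose street', ' street', 'street'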

@@ -223,11 +223,13 @@ BEGIN
OR placex.country_code = place.country_code)
ORDER BY rank_address desc,
         (place_addressline.place_id = in_place_id) desc,
         (fromarea and place.centroid is not null and not isaddress
          and (place.address is null or avals(name) && avals(place.address))
          and ST_Contains(geometry, place.centroid)) desc,
         isaddress desc, fromarea desc,
         distance asc, rank_search desc
         (CASE WHEN coalesce((avals(name) && avals(place.address)), False) THEN 2
               WHEN isaddress THEN 0
               WHEN fromarea
                    and place.centroid is not null
                    and ST_Contains(geometry, place.centroid) THEN 1
               ELSE -1 END) desc,
         fromarea desc, distance asc, rank_search desc
LOOP
-- RAISE WARNING '%',location;
location_isaddress := location.rank_address != current_rank_address;

@@ -43,7 +43,7 @@ LANGUAGE plpgsql STABLE;

-- find the parent road of the cut road parts
CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[],
CREATE OR REPLACE FUNCTION get_interpolation_parent(token_info JSONB,
                                                    partition SMALLINT,
                                                    centroid GEOMETRY, geom GEOMETRY)
RETURNS BIGINT

@@ -52,7 +52,7 @@ DECLARE
parent_place_id BIGINT;
location RECORD;
BEGIN
parent_place_id := find_parent_for_address(street, place, partition, centroid);
parent_place_id := find_parent_for_address(token_info, partition, centroid);

IF parent_place_id is null THEN
FOR location IN SELECT place_id FROM placex

@@ -155,9 +155,8 @@ BEGIN
NEW.interpolationtype = NEW.address->'interpolation';

place_centroid := ST_PointOnSurface(NEW.linegeo);
NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info),
                                               token_addr_place_match_tokens(NEW.token_info),
                                               NEW.partition, place_centroid, NEW.linegeo);
NEW.parent_place_id = get_interpolation_parent(NEW.token_info, NEW.partition,
                                               place_centroid, NEW.linegeo);

interpol_postcode := token_normalized_postcode(NEW.address->'postcode');
@@ -66,7 +66,7 @@ LANGUAGE plpgsql STABLE;

CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY,
                                             from_rank SMALLINT, to_rank SMALLINT,
                                             extent FLOAT, tokens INT[])
                                             extent FLOAT, token_info JSONB, key TEXT)
RETURNS nearfeaturecentr
AS $$
DECLARE

@@ -80,7 +80,7 @@ BEGIN
FROM location_area_large_{{ partition }}
WHERE geometry && ST_Expand(feature, extent)
      AND rank_address between from_rank and to_rank
      AND tokens && keywords
      AND token_matches_address(token_info, key, keywords)
GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1;
RETURN r;

@@ -148,18 +148,21 @@ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION getNearestNamedRoadPlaceId(in_partition INTEGER,
                                                      point GEOMETRY,
                                                      isin_token INTEGER[])
                                                      token_info JSONB)
RETURNS BIGINT
AS $$
DECLARE
parent BIGINT;
BEGIN
IF not token_has_addr_street(token_info) THEN
  RETURN NULL;
END IF;

{% for partition in db.partitions %}
IF in_partition = {{ partition }} THEN
  SELECT place_id FROM search_name_{{ partition }}
    INTO parent
    WHERE name_vector && isin_token
    WHERE token_matches_street(token_info, name_vector)
          AND centroid && ST_Expand(point, 0.015)
          AND address_rank between 26 and 27
    ORDER BY ST_Distance(centroid, point) ASC limit 1;

@@ -174,19 +177,22 @@ LANGUAGE plpgsql STABLE;

CREATE OR REPLACE FUNCTION getNearestNamedPlacePlaceId(in_partition INTEGER,
                                                       point GEOMETRY,
                                                       isin_token INTEGER[])
                                                       token_info JSONB)
RETURNS BIGINT
AS $$
DECLARE
parent BIGINT;
BEGIN
IF not token_has_addr_place(token_info) THEN
  RETURN NULL;
END IF;

{% for partition in db.partitions %}
IF in_partition = {{ partition }} THEN
  SELECT place_id
    INTO parent
    FROM search_name_{{ partition }}
    WHERE name_vector && isin_token
    WHERE token_matches_place(token_info, name_vector)
          AND centroid && ST_Expand(point, 0.04)
          AND address_rank between 16 and 25
    ORDER BY ST_Distance(centroid, point) ASC limit 1;
@@ -247,6 +247,7 @@ BEGIN
indexed_status = 2,
geometry = NEW.geometry
where place_id = existingplacex.place_id;

-- if a node (=> house), which is part of an interpolation line, changes (e.g. the street attribute) => mark this line for reparenting
-- (already here, because interpolation lines are reindexed before nodes, so in the second call it would be too late)
IF NEW.osm_type='N'

@@ -270,6 +271,26 @@ BEGIN
and x.class = p.class;
END IF;

IF coalesce(existing.name::text, '') != coalesce(NEW.name::text, '')
THEN
  IF existingplacex.rank_address between 26 and 27 THEN
    -- When streets change their name, this may have an effect on POI objects
    -- with addr:street tags.
    UPDATE placex SET indexed_status = 2
    WHERE indexed_status = 0 and address ? 'street'
          and parent_place_id = existingplacex.place_id;
    UPDATE placex SET indexed_status = 2
    WHERE indexed_status = 0 and rank_search = 30 and address ? 'street'
          and ST_DWithin(NEW.geometry, geometry, 0.002);
  ELSEIF existingplacex.rank_address between 16 and 25 THEN
    -- When places change their name, this may have an effect on POI objects
    -- with addr:place tags.
    UPDATE placex SET indexed_status = 2
    WHERE indexed_status = 0 and address ? 'place' and rank_search = 30
          and parent_place_id = existingplacex.place_id;
    -- No update of surrounding objects, potentially too expensive.
  END IF;
END IF;
END IF;

-- Abort the add (we modified the existing place instead)
@@ -1,27 +1,33 @@
-- Trigger functions for the placex table.

-- Information returned by update preparation.
DROP TYPE IF EXISTS prepare_update_info CASCADE;
CREATE TYPE prepare_update_info AS (
  name HSTORE,
  address HSTORE,
  rank_address SMALLINT,
  country_code TEXT,
  class TEXT,
  type TEXT,
  linked_place_id BIGINT
);

-- Retrieve the data needed by the indexer for updating the place.
--
-- Return parameters:
--  name             list of names
--  address          list of address tags, either from the object or a
--                   surrounding building
--  country_feature  If the place is a country feature, this contains the
--                   country code, otherwise it is null.
CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
                                                 OUT name HSTORE,
                                                 OUT address HSTORE,
                                                 OUT country_feature VARCHAR)
CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex)
RETURNS prepare_update_info
AS $$
DECLARE
location RECORD;
result prepare_update_info;
BEGIN
-- For POI nodes, check if the address should be derived from a surrounding
-- building.
IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN
  address := p.address;
  result.address := p.address;
ELSE
  -- The additional && condition works around the misguided query
  -- planner of postgis 3.0.
  SELECT placex.address || hstore('_inherited', '') INTO address
  SELECT placex.address || hstore('_inherited', '') INTO result.address
  FROM placex
  WHERE ST_Covers(geometry, p.centroid)
        and geometry && p.centroid

@@ -31,15 +37,26 @@ BEGIN
  LIMIT 1;
END IF;

address := address - '_unlisted_place'::TEXT;
name := p.name;
result.address := result.address - '_unlisted_place'::TEXT;
result.name := p.name;
result.class := p.class;
result.type := p.type;
result.country_code := p.country_code;
result.rank_address := p.rank_address;

country_feature := CASE WHEN p.admin_level = 2
                             and p.class = 'boundary' and p.type = 'administrative'
                             and p.osm_type = 'R'
                        THEN p.country_code
                        ELSE null
                   END;
-- Names of linked places need to be merged in, so search for a linkable
-- place already here.
SELECT * INTO location FROM find_linked_place(p);

IF location.place_id is not NULL THEN
  result.linked_place_id := location.place_id;

  IF NOT location.name IS NULL THEN
    result.name := location.name || result.name;
  END IF;
END IF;

RETURN result;
END;
$$
LANGUAGE plpgsql STABLE;
@@ -89,8 +106,7 @@ CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1),
                                               poi_osm_id BIGINT,
                                               poi_partition SMALLINT,
                                               bbox GEOMETRY,
                                               addr_street INTEGER[],
                                               addr_place INTEGER[],
                                               token_info JSONB,
                                               is_place_addr BOOLEAN)
RETURNS BIGINT
AS $$

@@ -104,8 +120,7 @@ BEGIN
parent_place_id := find_associated_street(poi_osm_type, poi_osm_id);

IF parent_place_id is null THEN
  parent_place_id := find_parent_for_address(addr_street, addr_place,
                                             poi_partition, bbox);
  parent_place_id := find_parent_for_address(token_info, poi_partition, bbox);
END IF;

IF parent_place_id is null and poi_osm_type = 'N' THEN

@@ -318,13 +333,14 @@ BEGIN
WHERE s.place_id = parent_place_id;

FOR addr_item IN
  SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
  FROM token_get_address_tokens(token_info)
  WHERE not search_tokens <@ parent_address_vector
  SELECT (get_addr_tag_rank(key, country)).*, key,
         token_get_address_search_tokens(token_info, key) as search_tokens
  FROM token_get_address_keys(token_info) as key
  WHERE not token_get_address_search_tokens(token_info, key) <@ parent_address_vector
LOOP
  addr_place := get_address_place(in_partition, geometry,
                                  addr_item.from_rank, addr_item.to_rank,
                                  addr_item.extent, addr_item.match_tokens);
                                  addr_item.extent, token_info, addr_item.key);

  IF addr_place is null THEN
    -- No place found in OSM that matches. Make it at least searchable.

@@ -432,14 +448,16 @@ BEGIN

FOR location IN
  SELECT (get_address_place(partition, geometry, from_rank, to_rank,
                            extent, match_tokens)).*, search_tokens
  FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
        FROM token_get_address_tokens(token_info)) x
                            extent, token_info, key)).*, key
  FROM (SELECT (get_addr_tag_rank(key, country)).*, key
        FROM token_get_address_keys(token_info) as key) x
  ORDER BY rank_address, distance, isguess desc
LOOP
  IF location.place_id is null THEN
    {% if not db.reverse_only %}
    nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens);
    nameaddress_vector := array_merge(nameaddress_vector,
                                      token_get_address_search_tokens(token_info,
                                                                      location.key));
    {% endif %}
  ELSE
    {% if not db.reverse_only %}

@@ -674,15 +692,14 @@ DECLARE
parent_address_level SMALLINT;
place_address_level SMALLINT;

addr_street INTEGER[];
addr_place INTEGER[];

max_rank SMALLINT;

name_vector INTEGER[];
nameaddress_vector INTEGER[];
addr_nameaddress_vector INTEGER[];

linked_place BIGINT;

linked_node_id BIGINT;
linked_importance FLOAT;
linked_wikipedia TEXT;

@@ -718,9 +735,14 @@ BEGIN

NEW.extratags := NEW.extratags - 'linked_place'::TEXT;

-- NEW.linked_place_id contains the precomputed linkee. Save this and restore
-- the previous link status.
linked_place := NEW.linked_place_id;
NEW.linked_place_id := OLD.linked_place_id;

IF NEW.linked_place_id is not null THEN
  NEW.token_info := null;
  {% if debug %}RAISE WARNING 'place already linked to %', NEW.linked_place_id;{% endif %}
  {% if debug %}RAISE WARNING 'place already linked to %', OLD.linked_place_id;{% endif %}
  RETURN NEW;
END IF;

@@ -838,8 +860,6 @@ BEGIN
END IF;

NEW.housenumber := token_normalized_housenumber(NEW.token_info);
addr_street := token_addr_street_match_tokens(NEW.token_info);
addr_place := token_addr_place_match_tokens(NEW.token_info);

NEW.postcode := null;

@@ -885,7 +905,7 @@ BEGIN
NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id,
                                           NEW.partition,
                                           ST_Envelope(NEW.geometry),
                                           addr_street, addr_place,
                                           NEW.token_info,
                                           is_place_address);

-- If we found the road take a shortcut here.

@@ -956,8 +976,9 @@ BEGIN
-- ---------------------------------------------------------------------------
-- Full indexing
{% if debug %}RAISE WARNING 'Using full index mode for % %', NEW.osm_type, NEW.osm_id;{% endif %}
SELECT * INTO location FROM find_linked_place(NEW);
IF location.place_id is not null THEN
IF linked_place is not null THEN
  SELECT * INTO location FROM placex WHERE place_id = linked_place;

  {% if debug %}RAISE WARNING 'Linked %', location;{% endif %}

-- Use the linked point as the centre point of the geometry,

@@ -974,11 +995,6 @@ BEGIN
NEW.rank_address := location.rank_address;
END IF;

-- merge in the label name
IF NOT location.name IS NULL THEN
  NEW.name := location.name || NEW.name;
END IF;

-- merge in extra tags
NEW.extratags := hstore('linked_' || location.class, location.type)
                 || coalesce(location.extratags, ''::hstore)
@@ -215,13 +215,12 @@ LANGUAGE plpgsql STABLE;

-- Find the parent of an address with addr:street/addr:place tag.
--
-- \param street      Value of addr:street or NULL if tag is missing.
-- \param place       Value of addr:place or NULL if tag is missing.
-- \param token_info  Naming info with the address information.
-- \param partition   Partition where to search the parent.
-- \param centroid    Location of the address.
--
-- \return Place ID of the parent if one was found, NULL otherwise.
CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[],
CREATE OR REPLACE FUNCTION find_parent_for_address(token_info JSONB,
                                                   partition SMALLINT,
                                                   centroid GEOMETRY)
RETURNS BIGINT

@@ -229,30 +228,22 @@ CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEG
DECLARE
parent_place_id BIGINT;
BEGIN
IF street is not null THEN
  -- Check for addr:street attributes
  -- Note that addr:street links can only be indexed, once the street itself is indexed
  parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street);
  IF parent_place_id is not null THEN
    {% if debug %}RAISE WARNING 'Get parent form addr:street: %', parent_place_id;{% endif %}
    RETURN parent_place_id;
  END IF;
-- Check for addr:street attributes
parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, token_info);
IF parent_place_id is not null THEN
  {% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %}
  RETURN parent_place_id;
END IF;

-- Check for addr:place attributes.
IF place is not null THEN
  parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place);
  IF parent_place_id is not null THEN
    {% if debug %}RAISE WARNING 'Get parent form addr:place: %', parent_place_id;{% endif %}
    RETURN parent_place_id;
  END IF;
END IF;

RETURN NULL;
parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, token_info);
{% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %}
RETURN parent_place_id;
END;
$$
LANGUAGE plpgsql STABLE;

CREATE OR REPLACE FUNCTION delete_location(OLD_place_id BIGINT)
RETURNS BOOLEAN
AS $$
@@ -155,11 +155,11 @@ CREATE INDEX idx_placex_linked_place_id ON placex USING BTREE (linked_place_id)
CREATE INDEX idx_placex_rank_search ON placex USING BTREE (rank_search, geometry_sector) {{db.tablespace.address_index}};
CREATE INDEX idx_placex_geometry ON placex USING GIST (geometry) {{db.tablespace.search_index}};
CREATE INDEX idx_placex_geometry_buildings ON placex
  USING GIST (geometry) {{db.tablespace.search_index}}
  USING {{postgres.spgist_geom}} (geometry) {{db.tablespace.search_index}}
  WHERE address is not null and rank_search = 30
        and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon');
CREATE INDEX idx_placex_geometry_placenode ON placex
  USING GIST (geometry) {{db.tablespace.search_index}}
  USING {{postgres.spgist_geom}} (geometry) {{db.tablespace.search_index}}
  WHERE osm_type = 'N' and rank_search < 26
        and class = 'place' and type != 'postcode' and linked_place_id is null;
CREATE INDEX idx_placex_wikidata on placex USING BTREE ((extratags -> 'wikidata')) {{db.tablespace.address_index}} WHERE extratags ? 'wikidata' and class = 'place' and osm_type = 'N' and rank_search < 26;
@@ -14,7 +14,6 @@ DECLARE
out_partition INTEGER;
out_parent_place_id BIGINT;
location RECORD;
address_street_word_ids INTEGER[];

BEGIN

@@ -54,13 +53,9 @@ BEGIN

place_centroid := ST_Centroid(linegeo);
out_partition := get_partition('us');
out_parent_place_id := null;

address_street_word_ids := token_addr_street_match_tokens(token_info);
IF address_street_word_ids IS NOT NULL THEN
  out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
                                                    address_street_word_ids);
END IF;
out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
                                                  token_info);

IF out_parent_place_id IS NULL THEN
  SELECT getNearestParallelRoadFeature(out_partition, linegeo)
@@ -34,40 +34,59 @@ AS $$
$$ LANGUAGE SQL IMMUTABLE STRICT;

CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
RETURNS INTEGER[]
CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
RETURNS BOOLEAN
AS $$
  SELECT (info->>'street')::INTEGER[]
  SELECT info->>'street' is not null;
$$ LANGUAGE SQL IMMUTABLE;

CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
RETURNS BOOLEAN
AS $$
  SELECT info->>'place' is not null;
$$ LANGUAGE SQL IMMUTABLE;

CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
RETURNS BOOLEAN
AS $$
  SELECT (info->>'street')::INTEGER[] <@ street_tokens
$$ LANGUAGE SQL IMMUTABLE STRICT;

CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
RETURNS INTEGER[]
CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
RETURNS BOOLEAN
AS $$
  SELECT (info->>'place_match')::INTEGER[]
  SELECT (info->>'place')::INTEGER[] <@ place_tokens
$$ LANGUAGE SQL IMMUTABLE STRICT;

CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
RETURNS INTEGER[]
AS $$
  SELECT (info->>'place_search')::INTEGER[]
  SELECT (info->>'place')::INTEGER[]
$$ LANGUAGE SQL IMMUTABLE STRICT;

DROP TYPE IF EXISTS token_addresstoken CASCADE;
CREATE TYPE token_addresstoken AS (
  key TEXT,
  match_tokens INT[],
  search_tokens INT[]
);

CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
RETURNS SETOF token_addresstoken
CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
RETURNS SETOF TEXT
AS $$
  SELECT key, (value->>1)::int[] as match_tokens,
         (value->>0)::int[] as search_tokens
  FROM jsonb_each(info->'addr');
  SELECT * FROM jsonb_object_keys(info->'addr');
$$ LANGUAGE SQL IMMUTABLE STRICT;

CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
RETURNS INTEGER[]
AS $$
  SELECT (info->'addr'->>key)::INTEGER[];
$$ LANGUAGE SQL IMMUTABLE STRICT;

CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
RETURNS BOOLEAN
AS $$
  SELECT (info->'addr'->>key)::INTEGER[] <@ tokens;
$$ LANGUAGE SQL IMMUTABLE STRICT;
@@ -127,15 +146,34 @@ BEGIN
VALUES (term_id, term, 'w', json_build_object('count', term_count));
END IF;

IF term_count < {{ max_word_freq }} THEN
  partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
END IF;
partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
END LOOP;
END;
$$
LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION getorcreate_partial_word(partial TEXT)
RETURNS INTEGER
AS $$
DECLARE
token INTEGER;
BEGIN
SELECT min(word_id) INTO token
  FROM word WHERE word_token = partial and type = 'w';

IF token IS NULL THEN
  token := nextval('seq_word');
  INSERT INTO word (word_id, word_token, type, info)
    VALUES (token, partial, 'w', json_build_object('count', 0));
END IF;

RETURN token;
END;
$$
LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
RETURNS INTEGER
AS $$
@@ -34,17 +34,31 @@ AS $$
$$ LANGUAGE SQL IMMUTABLE STRICT;

CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
RETURNS INTEGER[]
CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
RETURNS BOOLEAN
AS $$
  SELECT (info->>'street')::INTEGER[]
  SELECT info->>'street' is not null;
$$ LANGUAGE SQL IMMUTABLE;

CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
RETURNS BOOLEAN
AS $$
  SELECT info->>'place_match' is not null;
$$ LANGUAGE SQL IMMUTABLE;

CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
RETURNS BOOLEAN
AS $$
  SELECT (info->>'street')::INTEGER[] && street_tokens
$$ LANGUAGE SQL IMMUTABLE STRICT;

CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
RETURNS INTEGER[]
CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
RETURNS BOOLEAN
AS $$
  SELECT (info->>'place_match')::INTEGER[]
  SELECT (info->>'place_match')::INTEGER[] && place_tokens
$$ LANGUAGE SQL IMMUTABLE STRICT;

@@ -55,19 +69,24 @@ AS $$
$$ LANGUAGE SQL IMMUTABLE STRICT;

DROP TYPE IF EXISTS token_addresstoken CASCADE;
CREATE TYPE token_addresstoken AS (
  key TEXT,
  match_tokens INT[],
  search_tokens INT[]
);

CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
RETURNS SETOF token_addresstoken
CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
RETURNS SETOF TEXT
AS $$
  SELECT key, (value->>1)::int[] as match_tokens,
         (value->>0)::int[] as search_tokens
  FROM jsonb_each(info->'addr');
  SELECT * FROM jsonb_object_keys(info->'addr');
$$ LANGUAGE SQL IMMUTABLE STRICT;

CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
RETURNS INTEGER[]
AS $$
  SELECT (info->'addr'->key->>0)::INTEGER[];
$$ LANGUAGE SQL IMMUTABLE STRICT;

CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
RETURNS BOOLEAN
AS $$
  SELECT (info->'addr'->key->>1)::INTEGER[] && tokens;
$$ LANGUAGE SQL IMMUTABLE STRICT;
@@ -1,2 +1,3 @@
-- Required for details lookup.
CREATE INDEX IF NOT EXISTS idx_word_word_id
  ON word USING BTREE (word_id) {{db.tablespace.search_index}};

@@ -1,11 +0,0 @@
DROP TABLE IF EXISTS word_frequencies;
CREATE TABLE word_frequencies AS
  SELECT unnest(name_vector) as id, count(*) FROM search_name GROUP BY id;

CREATE INDEX idx_word_frequencies ON word_frequencies(id);

UPDATE word SET search_name_count = count
  FROM word_frequencies
  WHERE word_token like ' %' and word_id = id;

DROP TABLE word_frequencies;
@@ -1,6 +1,6 @@
# Creates and installs manual page

configure_file(${PROJECT_SOURCE_DIR}/manual/create-manpage.tmpl create_manpage.py)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/create-manpage.tmpl create_manpage.py)

find_program(ARGPARSEMANPAGE argparse-manpage)

@@ -8,8 +8,8 @@ ADD_CUSTOM_TARGET(manpage
COMMAND ${ARGPARSEMANPAGE} --pyfile ${CMAKE_CURRENT_BINARY_DIR}/create_manpage.py
        --function get_parser --project-name Nominatim
        --url https://nominatim.org > ${CMAKE_CURRENT_SOURCE_DIR}/nominatim.1

COMMAND sed -i '/.SH AUTHORS/I,+2 d' ${CMAKE_CURRENT_SOURCE_DIR}/nominatim.1
        --author 'the Nominatim developer community'
        --author-email info@nominatim.org
)

install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/nominatim.1 DESTINATION share/man/man1 )
@@ -6,7 +6,9 @@ nominatim
[-h] {import,freeze,replication,special-phrases,add-data,index,refresh,admin,export,serve,search,reverse,lookup,details,status} ...
.SH DESCRIPTION
Command\-line tools for importing, updating, administrating and
.br
querying the Nominatim database.
.br

.SH OPTIONS

@@ -45,7 +47,7 @@ nominatim
Start a simple web server for serving the API.
.TP
\fBnominatim\fR \fI\,search\/\fR
Execute API search query.
Execute a search query.
.TP
\fBnominatim\fR \fI\,reverse\/\fR
Execute API reverse query.

@@ -66,6 +68,15 @@ usage: nominatim import [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--index-noanalyse]

Create a new Nominatim database from an OSM file.
.br

.br
This sub\-command sets up a new Nominatim database from scratch starting
.br
with creating a new database in Postgresql. The user running this command
.br
needs superuser rights on the database.
.br
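A minimal invocation, using only the \-\-osm\-file option documented below, might look like this (the extract file name is a placeholder):

    nominatim import --osm-file monaco-latest.osm.pbf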

@@ -88,7 +99,7 @@ Number of parallel threads to use

.TP
\fB\-\-osm\-file\fR FILE
OSM file to be imported.
OSM file to be imported (repeat for importing multiple files)

.TP
\fB\-\-continue\fR {load\-data,indexing,db\-postprocess}

@@ -116,19 +127,27 @@ Continue import even when errors in SQL are present

.TP
\fB\-\-index\-noanalyse\fR
Do not perform analyse operations during index
Do not perform analyse operations during index (expert only)

.SH OPTIONS 'nominatim freeze'
usage: nominatim freeze [-h] [-q] [-v] [--project-dir DIR] [-j NUM]

Make database read\-only.
.br

.br
About half of data in the Nominatim database is kept only to be able to
.br
keep the data up\-to\-date with new changes made in OpenStreetMap. This
.br
command drops all this data and only keeps the part needed for geocoding
.br
itself.
.br

.br
This command has the same effect as the `\-\-no\-updates` option for imports.
.br

@@ -157,6 +176,33 @@ usage: nominatim replication [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--socket-timeout SOCKET_TIMEOUT]

Update the database using an online replication service.
.br

.br
An OSM replication service is an online service that provides regular
.br
updates (OSM diff files) for the planet or update they provide. The OSMF
.br
provides the primary replication service for the full planet at
.br
https://planet.osm.org/replication/ but there are other providers of
.br
extracts of OSM data who provide such a service as well.
.br

.br
This sub\-command allows to set up such a replication service and download
.br
and import updates at regular intervals. You need to call '\-\-init' once to
.br
set up the process or whenever you change the replication configuration
.br
parameters. Without any arguments, the sub\-command will go into a loop and
.br
continuously apply updates as they become available. Giving `\-\-once` just
.br
downloads and imports the next batch of updates.
.br
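Using only the flags described above, a typical session could be (illustrative example, not taken from the manual itself):

    nominatim replication --init
    nominatim replication --once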

@@ -195,7 +241,7 @@ Download and apply updates only once. When not set, updates are continuously applied

.TP
\fB\-\-no\-index\fR
Do not index the new data. Only applicable together with \-\-once
Do not index the new data. Only usable together with \-\-once

.TP
\fB\-\-osm2pgsql\-cache\fR SIZE

@@ -203,13 +249,47 @@ Size of cache to be used by osm2pgsql (in MB)

.TP
\fB\-\-socket\-timeout\fR \fI\,SOCKET_TIMEOUT\/\fR
Set timeout for file downloads.
Set timeout for file downloads

.SH OPTIONS 'nominatim special-phrases'
usage: nominatim special-phrases [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--import-from-wiki]
[--import-from-wiki] [--import-from-csv FILE]
[--no-replace]

Import special phrases.
.br

.br
Special phrases are search terms that narrow down the type of object
.br
that should be searched. For example, you might want to search for
.br
'Hotels in Barcelona'. The OSM wiki has a selection of special phrases
.br
in many languages, which can be imported with this command.
.br

.br
You can also provide your own phrases in a CSV file. The file needs to have
.br
the following five columns:
.br
* phrase \- the term expected for searching
.br
* class \- the OSM tag key of the object type
.br
* type \- the OSM tag value of the object type
.br
* operator \- the kind of search to be done (one of: in, near, name, \-)
.br
* plural \- whether the term is a plural or not (Y/N)
.br

.br
An example file can be found in the Nominatim sources at
.br
'test/testdb/full_en_phrases_test.csv'.
.br
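A hypothetical entry following the five-column layout described above could look like this (the header row and the values are assumptions for illustration, not taken from the shipped example file):

    phrase,class,type,operator,plural
    Hotel,tourism,hotel,-,N
    Hotels,tourism,hotel,in,Y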

@@ -232,17 +312,48 @@ Number of parallel threads to use

.TP
\fB\-\-import\-from\-wiki\fR
Import special phrases from the OSM wiki to the database.
Import special phrases from the OSM wiki to the database

.TP
\fB\-\-import\-from\-csv\fR FILE
Import special phrases from a CSV file

.TP
\fB\-\-no\-replace\fR
Keep the old phrases and only add the new ones

.SH OPTIONS 'nominatim add-data'
usage: nominatim add-data [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
(--file FILE | --diff FILE | --node ID | --way ID | --relation ID | --tiger-data DIR)
[--use-main-api]
[--use-main-api] [--osm2pgsql-cache SIZE]
[--socket-timeout SOCKET_TIMEOUT]

Add additional data from a file or an online source.
.br

Data is only imported, not indexed. You need to call `nominatim index`
to complete the process.
.br
This command allows to add or update the search data in the database.
.br
The data can come either from an OSM file or single OSM objects can
.br
directly be downloaded from the OSM API. This function only loads the
.br
data into the database. Afterwards it still needs to be integrated
.br
in the search index. Use the `nominatim index` command for that.
.br

.br
The command can also be used to add external non\-OSM data to the
.br
database. At the moment the only supported format is TIGER housenumber
.br
data. See the online documentation at
.br
https://nominatim.org/release\-docs/latest/admin/Import/#installing\-tiger\-housenumber\-data\-for\-the\-us
.br
for more information.
.br
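Combined with the indexing step mentioned above, a typical update could look like this (the file name is a placeholder):

    nominatim add-data --file new_data.osc.gz
    nominatim index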

@@ -265,11 +376,11 @@ Number of parallel threads to use

.TP
\fB\-\-file\fR FILE
Import data from an OSM file
Import data from an OSM file or diff file

.TP
\fB\-\-diff\fR FILE
Import data from an OSM diff file
Import data from an OSM diff file (deprecated: use \-\-file)

.TP
\fB\-\-node\fR ID

@@ -285,18 +396,37 @@ Import a single relation from the API

.TP
\fB\-\-tiger\-data\fR DIR
Add housenumbers from the US TIGER census database.
Add housenumbers from the US TIGER census database

.TP
\fB\-\-use\-main\-api\fR
Use OSM API instead of Overpass to download objects

.TP
\fB\-\-osm2pgsql\-cache\fR SIZE
Size of cache to be used by osm2pgsql (in MB)

.TP
\fB\-\-socket\-timeout\fR \fI\,SOCKET_TIMEOUT\/\fR
Set timeout for file downloads

.SH OPTIONS 'nominatim index'
usage: nominatim index [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--boundaries-only] [--no-boundaries] [--minrank RANK]
[--maxrank RANK]

Reindex all new and modified data.
.br

.br
Indexing is the process of computing the address and search terms for
.br
the places in the database. Every time data is added or changed, indexing
.br
needs to be run. Imports and replication updates automatically take care
.br
of indexing. For other cases, this function allows to run indexing manually.
.br

@@ -341,8 +471,23 @@ usage: nominatim refresh [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--enable-debug-statements]

Recompute auxiliary data used by the indexing process.
.br

These functions must not be run in parallel with other update commands.
.br
This sub\-commands updates various static data and functions in the database.
.br
It usually needs to be run after changing various aspects of the
.br
configuration. The configuration documentation will mention the exact
.br
command to use in such case.
.br

.br
Warning: the 'update' command must not be run in parallel with other update
.br
commands like 'replication' or 'add\-data'.
.br

@@ -381,7 +526,7 @@ Update the PL/pgSQL functions in the database

.TP
\fB\-\-wiki\-data\fR
Update Wikipedia/data importance numbers.
Update Wikipedia/data importance numbers

.TP
\fB\-\-importance\fR

@@ -406,6 +551,7 @@ usage: nominatim admin [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--osm-id OSM_ID | --place-id PLACE_ID]

Analyse and maintain the database.
.br

@@ -428,19 +574,19 @@ Number of parallel threads to use

.TP
\fB\-\-warm\fR
Warm database caches for search and reverse queries.
Warm database caches for search and reverse queries

.TP
\fB\-\-check\-database\fR
Check that the database is complete and operational.
Check that the database is complete and operational

.TP
\fB\-\-migrate\fR
Migrate the database to a new software version.
Migrate the database to a new software version

.TP
\fB\-\-analyse\-indexing\fR
Print performance analysis of the indexing process.
Print performance analysis of the indexing process

.TP
\fB\-\-search\-only\fR

@@ -468,6 +614,7 @@ usage: nominatim export [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--restrict-to-osm-relation ID]

Export addresses as CSV file from the database.
.br

@@ -525,12 +672,19 @@ usage: nominatim serve [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--server SERVER]

Start a simple web server for serving the API.
.br

.br
This command starts the built\-in PHP webserver to serve the website
.br
from the current project directory. This webserver is only suitable
for testing and develop. Do not use it in production setups!
.br
for testing and development. Do not use it in production setups!
.br

.br
By the default, the webserver can be accessed at: http://127.0.0.1:8088
.br

@@ -568,7 +722,18 @@ usage: nominatim search [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--exclude_place_ids ID,..] [--limit LIMIT]
[--viewbox X1,Y1,X2,Y2] [--bounded] [--no-dedupe]

Execute API search query.
Execute a search query.
.br

.br
This command works exactly the same as if calling the /search endpoint on
.br
the web API. See the online documentation for more details on the
.br
various parameters:
.br
https://nominatim.org/release\-docs/latest/api/Search/
.br
@@ -623,15 +788,15 @@ Format of result

.TP
\fB\-\-addressdetails\fR
Include a breakdown of the address into elements.
Include a breakdown of the address into elements

.TP
\fB\-\-extratags\fR
Include additional information if available (e.g. wikipedia link, opening hours).
Include additional information if available (e.g. wikipedia link, opening hours)

.TP
\fB\-\-namedetails\fR
Include a list of alternative names.
Include a list of alternative names

.TP
\fB\-\-lang\fR LANGS, \fB\-\-accept\-language\fR LANGS

@@ -639,7 +804,7 @@ Preferred language order for presenting search results

.TP
\fB\-\-polygon\-output\fR {geojson,kml,svg,text}
Output geometry of results as a GeoJSON, KML, SVG or WKT.
Output geometry of results as a GeoJSON, KML, SVG or WKT

.TP
\fB\-\-polygon\-threshold\fR TOLERANCE

@@ -647,7 +812,7 @@ Simplify output geometry.Parameter is difference tolerance in degrees.

.TP
\fB\-\-countrycodes\fR CC,..
Limit search results to one or more countries.
Limit search results to one or more countries

.TP
\fB\-\-exclude_place_ids\fR ID,..

@@ -679,6 +844,17 @@ usage: nominatim reverse [-h] [-q] [-v] [--project-dir DIR] [-j NUM] --lat LAT
[--polygon-threshold TOLERANCE]

Execute API reverse query.
.br

.br
This command works exactly the same as if calling the /reverse endpoint on
.br
the web API. See the online documentation for more details on the
.br
various parameters:
.br
https://nominatim.org/release\-docs/latest/api/Reverse/
.br

@@ -717,15 +893,15 @@ Format of result

.TP
\fB\-\-addressdetails\fR
Include a breakdown of the address into elements.
Include a breakdown of the address into elements

.TP
\fB\-\-extratags\fR
Include additional information if available (e.g. wikipedia link, opening hours).
Include additional information if available (e.g. wikipedia link, opening hours)

.TP
\fB\-\-namedetails\fR
Include a list of alternative names.
Include a list of alternative names

.TP
\fB\-\-lang\fR LANGS, \fB\-\-accept\-language\fR LANGS

@@ -733,7 +909,7 @@ Preferred language order for presenting search results

.TP
\fB\-\-polygon\-output\fR {geojson,kml,svg,text}
Output geometry of results as a GeoJSON, KML, SVG or WKT.
Output geometry of results as a GeoJSON, KML, SVG or WKT

.TP
\fB\-\-polygon\-threshold\fR TOLERANCE

@@ -748,6 +924,17 @@ usage: nominatim lookup [-h] [-q] [-v] [--project-dir DIR] [-j NUM] --id OSMID
[--polygon-threshold TOLERANCE]

Execute API lookup query.
.br

.br
This command works exactly the same as if calling the /lookup endpoint on
.br
the web API. See the online documentation for more details on the
.br
various parameters:
.br
https://nominatim.org/release\-docs/latest/api/Lookup/
.br

@@ -778,15 +965,15 @@ Format of result

.TP
\fB\-\-addressdetails\fR
Include a breakdown of the address into elements.
Include a breakdown of the address into elements

.TP
\fB\-\-extratags\fR
Include additional information if available (e.g. wikipedia link, opening hours).
Include additional information if available (e.g. wikipedia link, opening hours)

.TP
\fB\-\-namedetails\fR
Include a list of alternative names.
Include a list of alternative names

.TP
\fB\-\-lang\fR LANGS, \fB\-\-accept\-language\fR LANGS

@@ -794,7 +981,7 @@ Preferred language order for presenting search results

.TP
\fB\-\-polygon\-output\fR {geojson,kml,svg,text}
Output geometry of results as a GeoJSON, KML, SVG or WKT.
Output geometry of results as a GeoJSON, KML, SVG or WKT

.TP
\fB\-\-polygon\-threshold\fR TOLERANCE

@@ -809,6 +996,17 @@ usage: nominatim details [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--lang LANGS]

Execute API details query.
.br

.br
This command works exactly the same as if calling the /details endpoint on
.br
the web API. See the online documentation for more details on the
.br
various parameters:
.br
https://nominatim.org/release\-docs/latest/api/Details/
.br

@@ -843,7 +1041,7 @@ Look up the OSM relation with the given ID.

.TP
\fB\-\-place_id\fR \fI\,PLACE_ID\/\fR, \fB\-p\fR \fI\,PLACE_ID\/\fR
Database internal identifier of the OSM object to look up.
Database internal identifier of the OSM object to look up

.TP
\fB\-\-class\fR \fI\,OBJECT_CLASS\/\fR

@@ -851,27 +1049,27 @@ Class type to disambiguated multiple entries of the same object.

.TP
\fB\-\-addressdetails\fR
Include a breakdown of the address into elements.
Include a breakdown of the address into elements

.TP
\fB\-\-keywords\fR
Include a list of name keywords and address keywords.
Include a list of name keywords and address keywords

.TP
\fB\-\-linkedplaces\fR
Include a details of places that are linked with this one.
Include a details of places that are linked with this one

.TP
\fB\-\-hierarchy\fR
Include details of places lower in the address hierarchy.
Include details of places lower in the address hierarchy

.TP
\fB\-\-group_hierarchy\fR
Group the places by type.
Group the places by type

.TP
\fB\-\-polygon_geojson\fR
Include geometry of result.
Include geometry of result

.TP
\fB\-\-lang\fR LANGS, \fB\-\-accept\-language\fR LANGS

@@ -882,6 +1080,17 @@ usage: nominatim status [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
[--format {text,json}]

Execute API status query.
.br

.br
This command works exactly the same as if calling the /status endpoint on
.br
the web API. See the online documentation for more details on the
.br
various parameters:
|
||||
.br
|
||||
https://nominatim.org/release\-docs/latest/api/Status/
|
||||
.br
|
||||
|
||||
|
||||
|
||||
@@ -906,6 +1115,9 @@ Number of parallel threads to use
|
||||
\fB\-\-format\fR {text,json}
|
||||
Format of result
|
||||
|
||||
.SH AUTHORS
|
||||
.B Nominatim
|
||||
was written by the Nominatim developer community <info@nominatim.org>.
|
||||
.SH DISTRIBUTION
|
||||
The latest version of Nominatim may be downloaded from
|
||||
.UR https://nominatim.org
|
||||
@@ -176,7 +176,7 @@ class AdminServe:

    This command starts the built-in PHP webserver to serve the website
    from the current project directory. This webserver is only suitable
-   for testing and develop. Do not use it in production setups!
+   for testing and development. Do not use it in production setups!

    By the default, the webserver can be accessed at: http://127.0.0.1:8088
    """
@@ -3,6 +3,8 @@ Implementation of the 'add-data' subcommand.
"""
import logging

+import psutil
+
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid eventually unused imports.
@@ -14,8 +16,17 @@ class UpdateAddData:
    """\
    Add additional data from a file or an online source.

-    Data is only imported, not indexed. You need to call `nominatim index`
-    to complete the process.
+    This command allows to add or update the search data in the database.
+    The data can come either from an OSM file or single OSM objects can
+    directly be downloaded from the OSM API. This function only loads the
+    data into the database. Afterwards it still needs to be integrated
+    in the search index. Use the `nominatim index` command for that.
+
+    The command can also be used to add external non-OSM data to the
+    database. At the moment the only supported format is TIGER housenumber
+    data. See the online documentation at
+    https://nominatim.org/release-docs/latest/admin/Import/#installing-tiger-housenumber-data-for-the-us
+    for more information.
    """

    @staticmethod
@@ -33,14 +44,14 @@ class UpdateAddData:
        group.add_argument('--relation', metavar='ID', type=int,
                           help='Import a single relation from the API')
        group.add_argument('--tiger-data', metavar='DIR',
-                          help='Add housenumbers from the US TIGER census database.')
+                          help='Add housenumbers from the US TIGER census database')
        group = parser.add_argument_group('Extra arguments')
        group.add_argument('--use-main-api', action='store_true',
                           help='Use OSM API instead of Overpass to download objects')
        group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
                           help='Size of cache to be used by osm2pgsql (in MB)')
        group.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
-                          help='Set timeout for file downloads.')
+                          help='Set timeout for file downloads')

    @staticmethod
    def run(args):
@@ -50,7 +61,8 @@ class UpdateAddData:
        if args.tiger_data:
            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
            return tiger_data.add_tiger_data(args.tiger_data,
-                                             args.config, args.threads or 1,
+                                             args.config,
+                                             args.threads or psutil.cpu_count() or 1,
                                             tokenizer)

        osm2pgsql_params = args.osm2pgsql_options(default_cache=1000, default_threads=1)
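The new docstring spells out a two-step workflow: `add-data` only loads objects, and `nominatim index` integrates them afterwards. A minimal sketch of driving that flow from a script, assuming a `nominatim` binary on PATH and that `add-data` accepts a `--file` option as in the full command usage (both assumptions, not shown in this diff):

import subprocess

def add_and_index(osm_file: str, project_dir: str) -> None:
    # Step 1: load the data; nothing is searchable yet at this point.
    subprocess.run(['nominatim', 'add-data', '--file', osm_file,
                    '--project-dir', project_dir], check=True)
    # Step 2: integrate the newly loaded places into the search index.
    subprocess.run(['nominatim', 'index', '--project-dir', project_dir],
                   check=True)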
@@ -23,13 +23,13 @@ class AdminFuncs:
        group = parser.add_argument_group('Admin tasks')
        objs = group.add_mutually_exclusive_group(required=True)
        objs.add_argument('--warm', action='store_true',
-                         help='Warm database caches for search and reverse queries.')
+                         help='Warm database caches for search and reverse queries')
        objs.add_argument('--check-database', action='store_true',
-                         help='Check that the database is complete and operational.')
+                         help='Check that the database is complete and operational')
        objs.add_argument('--migrate', action='store_true',
-                         help='Migrate the database to a new software version.')
+                         help='Migrate the database to a new software version')
        objs.add_argument('--analyse-indexing', action='store_true',
-                         help='Print performance analysis of the indexing process.')
+                         help='Print performance analysis of the indexing process')
        group = parser.add_argument_group('Arguments for cache warming')
        group.add_argument('--search-only', action='store_const', dest='target',
                           const='search',
@@ -4,6 +4,7 @@ Subcommand definitions for API calls from the command line.
import logging

from nominatim.tools.exec_utils import run_api_script
+from nominatim.errors import UsageError

# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
@@ -20,19 +21,19 @@ STRUCTURED_QUERY = (
)

EXTRADATA_PARAMS = (
-    ('addressdetails', 'Include a breakdown of the address into elements.'),
+    ('addressdetails', 'Include a breakdown of the address into elements'),
    ('extratags', ("Include additional information if available "
-                   "(e.g. wikipedia link, opening hours).")),
-    ('namedetails', 'Include a list of alternative names.')
+                   "(e.g. wikipedia link, opening hours)")),
+    ('namedetails', 'Include a list of alternative names')
)

DETAILS_SWITCHES = (
-    ('addressdetails', 'Include a breakdown of the address into elements.'),
-    ('keywords', 'Include a list of name keywords and address keywords.'),
-    ('linkedplaces', 'Include a details of places that are linked with this one.'),
-    ('hierarchy', 'Include details of places lower in the address hierarchy.'),
-    ('group_hierarchy', 'Group the places by type.'),
-    ('polygon_geojson', 'Include geometry of result.')
+    ('addressdetails', 'Include a breakdown of the address into elements'),
+    ('keywords', 'Include a list of name keywords and address keywords'),
+    ('linkedplaces', 'Include a details of places that are linked with this one'),
+    ('hierarchy', 'Include details of places lower in the address hierarchy'),
+    ('group_hierarchy', 'Group the places by type'),
+    ('polygon_geojson', 'Include geometry of result')
)

def _add_api_output_arguments(parser):
@@ -47,15 +48,32 @@ def _add_api_output_arguments(parser):
                       help='Preferred language order for presenting search results')
    group.add_argument('--polygon-output',
                       choices=['geojson', 'kml', 'svg', 'text'],
-                      help='Output geometry of results as a GeoJSON, KML, SVG or WKT.')
+                      help='Output geometry of results as a GeoJSON, KML, SVG or WKT')
    group.add_argument('--polygon-threshold', type=float, metavar='TOLERANCE',
                       help=("Simplify output geometry."
                             "Parameter is difference tolerance in degrees."))


+def _run_api(endpoint, args, params):
+    script_file = args.project_dir / 'website' / (endpoint + '.php')
+
+    if not script_file.exists():
+        LOG.error("Cannot find API script file.\n\n"
+                  "Make sure to run 'nominatim' from the project directory \n"
+                  "or use the option --project-dir.")
+        raise UsageError("API script not found.")
+
+    return run_api_script(endpoint, args.project_dir,
+                          phpcgi_bin=args.phpcgi_path, params=params)

class APISearch:
    """\
-    Execute API search query.
+    Execute a search query.
+
+    This command works exactly the same as if calling the /search endpoint on
+    the web API. See the online documentation for more details on the
+    various parameters:
+    https://nominatim.org/release-docs/latest/api/Search/
    """

    @staticmethod
@@ -70,7 +88,7 @@ class APISearch:

        group = parser.add_argument_group('Result limitation')
        group.add_argument('--countrycodes', metavar='CC,..',
-                          help='Limit search results to one or more countries.')
+                          help='Limit search results to one or more countries')
        group.add_argument('--exclude_place_ids', metavar='ID,..',
                           help='List of search object to be excluded')
        group.add_argument('--limit', type=int,
@@ -109,12 +127,16 @@ class APISearch:
        if not args.dedupe:
            params['dedupe'] = '0'

-        return run_api_script('search', args.project_dir,
-                              phpcgi_bin=args.phpcgi_path, params=params)
+        return _run_api('search', args, params)

class APIReverse:
    """\
    Execute API reverse query.
+
+    This command works exactly the same as if calling the /reverse endpoint on
+    the web API. See the online documentation for more details on the
+    various parameters:
+    https://nominatim.org/release-docs/latest/api/Reverse/
    """

    @staticmethod
@@ -148,13 +170,17 @@ class APIReverse:
        if args.polygon_threshold:
            params['polygon_threshold'] = args.polygon_threshold

-        return run_api_script('reverse', args.project_dir,
-                              phpcgi_bin=args.phpcgi_path, params=params)
+        return _run_api('reverse', args, params)


class APILookup:
    """\
    Execute API lookup query.
+
+    This command works exactly the same as if calling the /lookup endpoint on
+    the web API. See the online documentation for more details on the
+    various parameters:
+    https://nominatim.org/release-docs/latest/api/Lookup/
    """

    @staticmethod
@@ -183,13 +209,17 @@ class APILookup:
        if args.polygon_threshold:
            params['polygon_threshold'] = args.polygon_threshold

-        return run_api_script('lookup', args.project_dir,
-                              phpcgi_bin=args.phpcgi_path, params=params)
+        return _run_api('lookup', args, params)


class APIDetails:
    """\
    Execute API details query.
+
+    This command works exactly the same as if calling the /details endpoint on
+    the web API. See the online documentation for more details on the
+    various parameters:
+    https://nominatim.org/release-docs/latest/api/Details/
    """

    @staticmethod
@@ -203,7 +233,7 @@ class APIDetails:
        objs.add_argument('--relation', '-r', type=int,
                          help="Look up the OSM relation with the given ID.")
        objs.add_argument('--place_id', '-p', type=int,
-                         help='Database internal identifier of the OSM object to look up.')
+                         help='Database internal identifier of the OSM object to look up')
        group.add_argument('--class', dest='object_class',
                           help=("Class type to disambiguated multiple entries "
                                 "of the same object."))
@@ -229,13 +259,17 @@ class APIDetails:
        for name, _ in DETAILS_SWITCHES:
            params[name] = '1' if getattr(args, name) else '0'

-        return run_api_script('details', args.project_dir,
-                              phpcgi_bin=args.phpcgi_path, params=params)
+        return _run_api('details', args, params)


class APIStatus:
    """\
    Execute API status query.
+
+    This command works exactly the same as if calling the /status endpoint on
+    the web API. See the online documentation for more details on the
+    various parameters:
+    https://nominatim.org/release-docs/latest/api/Status/
    """

    @staticmethod
@@ -246,6 +280,4 @@ class APIStatus:

    @staticmethod
    def run(args):
-        return run_api_script('status', args.project_dir,
-                              phpcgi_bin=args.phpcgi_path,
-                              params=dict(format=args.format))
+        return _run_api('status', args, dict(format=args.format))
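The refactor above replaces five copies of the `run_api_script` call with a single `_run_api` helper that also fails early when the website scripts are missing. A hypothetical illustration of what each `run()` method now boils down to (the parameter values are invented):

def run_search_example(args):
    # Invented parameters; the real run() assembles them from argparse results.
    params = {'q': 'Eiffel Tower', 'format': 'json', 'addressdetails': '1'}
    # Raises UsageError before launching PHP if website/search.php is absent.
    return _run_api('search', args, params)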
@@ -1,7 +1,12 @@
"""
Provides custom functions over command-line arguments.
"""
+import logging
+from pathlib import Path
+
+from nominatim.errors import UsageError
+
+LOG = logging.getLogger()

class NominatimArgs:
    """ Customized namespace class for the nominatim command line tool
@@ -18,10 +23,27 @@ class NominatimArgs:
                    osm2pgsql_style=self.config.get_import_style_file(),
                    threads=self.threads or default_threads,
                    dsn=self.config.get_libpq_dsn(),
-                   flatnode_file=self.config.FLATNODE_FILE,
+                   flatnode_file=str(self.config.get_path('FLATNODE_FILE')),
                    tablespaces=dict(slim_data=self.config.TABLESPACE_OSM_DATA,
                                     slim_index=self.config.TABLESPACE_OSM_INDEX,
                                     main_data=self.config.TABLESPACE_PLACE_DATA,
                                     main_index=self.config.TABLESPACE_PLACE_INDEX
                                     )
                    )


+    def get_osm_file_list(self):
+        """ Return the --osm-file argument as a list of Paths or None
+            if no argument was given. The function also checks if the files
+            exist and raises a UsageError if one cannot be found.
+        """
+        if not self.osm_file:
+            return None
+
+        files = [Path(f) for f in self.osm_file]
+        for fname in files:
+            if not fname.is_file():
+                LOG.fatal("OSM file '%s' does not exist.", fname)
+                raise UsageError('Cannot access file.')
+
+        return files
@@ -31,6 +31,6 @@ class SetupFreeze:

        with connect(args.config.get_libpq_dsn()) as conn:
            freeze.drop_update_tables(conn)
-        freeze.drop_flatnode_file(args.config.FLATNODE_FILE)
+        freeze.drop_flatnode_file(str(args.config.get_path('FLATNODE_FILE')))

        return 0
@@ -15,6 +15,11 @@ from nominatim.db.connection import connect
class UpdateIndex:
    """\
    Reindex all new and modified data.
+
+    Indexing is the process of computing the address and search terms for
+    the places in the database. Every time data is added or changed, indexing
+    needs to be run. Imports and replication updates automatically take care
+    of indexing. For other cases, this function allows to run indexing manually.
    """

    @staticmethod
@@ -17,7 +17,13 @@ class UpdateRefresh:
    """\
    Recompute auxiliary data used by the indexing process.

-    These functions must not be run in parallel with other update commands.
+    This sub-command updates various static data and functions in the database.
+    It usually needs to be run after changing various aspects of the
+    configuration. The configuration documentation will mention the exact
+    command to use in such a case.
+
+    Warning: the 'update' command must not be run in parallel with other update
+    commands like 'replication' or 'add-data'.
    """
    def __init__(self):
        self.tokenizer = None
@@ -34,7 +40,7 @@ class UpdateRefresh:
        group.add_argument('--functions', action='store_true',
                           help='Update the PL/pgSQL functions in the database')
        group.add_argument('--wiki-data', action='store_true',
-                          help='Update Wikipedia/data importance numbers.')
+                          help='Update Wikipedia/data importance numbers')
        group.add_argument('--importance', action='store_true',
                           help='Recompute place importances (expensive!)')
        group.add_argument('--website', action='store_true',
@@ -65,14 +71,13 @@ class UpdateRefresh:
                           "Postcode updates on a frozen database is not possible.")

        if args.word_counts:
-            LOG.warning('Recompute frequency of full-word search terms')
-            refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir)
+            LOG.warning('Recompute word statistics')
+            self._get_tokenizer(args.config).update_statistics()

        if args.address_levels:
-            cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
-            LOG.warning('Updating address levels from %s', cfg)
+            LOG.warning('Updating address levels')
            with connect(args.config.get_libpq_dsn()) as conn:
-                refresh.load_address_levels_from_file(conn, cfg)
+                refresh.load_address_levels_from_config(conn, args.config)

        if args.functions:
            LOG.warning('Create functions')
@@ -20,6 +20,19 @@ LOG = logging.getLogger()
class UpdateReplication:
    """\
    Update the database using an online replication service.

+    An OSM replication service is an online service that provides regular
+    updates (OSM diff files) for the planet or the extracts they provide. The OSMF
+    provides the primary replication service for the full planet at
+    https://planet.osm.org/replication/ but there are other providers of
+    extracts of OSM data who provide such a service as well.
+
+    This sub-command allows to set up such a replication service and download
+    and import updates at regular intervals. You need to call '--init' once to
+    set up the process or whenever you change the replication configuration
+    parameters. Without any arguments, the sub-command will go into a loop and
+    continuously apply updates as they become available. Giving `--once` just
+    downloads and imports the next batch of updates.
    """

    @staticmethod
@@ -29,22 +42,25 @@ class UpdateReplication:
                           help='Initialise the update process')
        group.add_argument('--no-update-functions', dest='update_functions',
                           action='store_false',
-                          help=("Do not update the trigger function to "
-                                "support differential updates."))
+                          help="Do not update the trigger function to "
+                               "support differential updates (EXPERT)")
        group = parser.add_argument_group('Arguments for updates')
        group.add_argument('--check-for-updates', action='store_true',
                           help='Check if new updates are available and exit')
        group.add_argument('--once', action='store_true',
-                          help=("Download and apply updates only once. When "
-                                "not set, updates are continuously applied"))
+                          help="Download and apply updates only once. When "
+                               "not set, updates are continuously applied")
+        group.add_argument('--catch-up', action='store_true',
+                          help="Download and apply updates until no new "
+                               "data is available on the server")
        group.add_argument('--no-index', action='store_false', dest='do_index',
-                          help=("Do not index the new data. Only applicable "
+                          help=("Do not index the new data. Only usable "
                                 "together with --once"))
        group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
                           help='Size of cache to be used by osm2pgsql (in MB)')
        group = parser.add_argument_group('Download parameters')
        group.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
-                          help='Set timeout for file downloads.')
+                          help='Set timeout for file downloads')

    @staticmethod
    def _init_replication(args):
@@ -79,28 +95,40 @@ class UpdateReplication:
                    round_time(end - start_import),
                    round_time(end - batchdate))


+    @staticmethod
+    def _compute_update_interval(args):
+        if args.catch_up:
+            return 0
+
+        update_interval = args.config.get_int('REPLICATION_UPDATE_INTERVAL')
+        # Sanity check to not overwhelm the Geofabrik servers.
+        if 'download.geofabrik.de' in args.config.REPLICATION_URL\
+           and update_interval < 86400:
+            LOG.fatal("Update interval too low for download.geofabrik.de.\n"
+                      "Please check install documentation "
+                      "(https://nominatim.org/release-docs/latest/admin/Import-and-Update#"
+                      "setting-up-the-update-process).")
+            raise UsageError("Invalid replication update interval setting.")
+
+        return update_interval


    @staticmethod
    def _update(args):
        from ..tools import replication
        from ..indexer.indexer import Indexer
        from ..tokenizer import factory as tokenizer_factory

+        update_interval = UpdateReplication._compute_update_interval(args)
+
        params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
        params.update(base_url=args.config.REPLICATION_URL,
-                      update_interval=args.config.get_int('REPLICATION_UPDATE_INTERVAL'),
+                      update_interval=update_interval,
                      import_file=args.project_dir / 'osmosischange.osc',
                      max_diff_size=args.config.get_int('REPLICATION_MAX_DIFF'),
                      indexed_only=not args.once)

-        # Sanity check to not overwhelm the Geofabrik servers.
-        if 'download.geofabrik.de' in params['base_url']\
-           and params['update_interval'] < 86400:
-            LOG.fatal("Update interval too low for download.geofabrik.de.\n"
-                      "Please check install documentation "
-                      "(https://nominatim.org/release-docs/latest/admin/Import-and-Update#"
-                      "setting-up-the-update-process).")
-            raise UsageError("Invalid replication update interval setting.")
-
        if not args.once:
            if not args.do_index:
                LOG.fatal("Indexing cannot be disabled when running updates continuously.")
@@ -108,6 +136,7 @@ class UpdateReplication:
            recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL')

        tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+        indexer = Indexer(args.config.get_libpq_dsn(), tokenizer, args.threads or 1)

        while True:
            with connect(args.config.get_libpq_dsn()) as conn:
@@ -120,10 +149,7 @@ class UpdateReplication:

                if state is not replication.UpdateState.NO_CHANGES and args.do_index:
                    index_start = dt.datetime.now(dt.timezone.utc)
-                    indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
-                                      args.threads or 1)
-                    indexer.index_boundaries(0, 30)
-                    indexer.index_by_rank(0, 30)
+                    indexer.index_full(analyse=False)

                    with connect(args.config.get_libpq_dsn()) as conn:
                        status.set_indexed(conn, True)
@@ -132,10 +158,15 @@ class UpdateReplication:
                else:
                    index_start = None

+                if state is replication.UpdateState.NO_CHANGES and \
+                   args.catch_up or update_interval > 40*60:
+                    while indexer.has_pending():
+                        indexer.index_full(analyse=False)
+
                if LOG.isEnabledFor(logging.WARNING):
                    UpdateReplication._report_update(batchdate, start, index_start)

-                if args.once:
+                if args.once or (args.catch_up and state is replication.UpdateState.NO_CHANGES):
                    break

                if state is replication.UpdateState.NO_CHANGES:
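One subtlety in the new catch-up branch: Python's `and` binds tighter than `or`, so `state is NO_CHANGES and args.catch_up or update_interval > 40*60` parses as `(state is NO_CHANGES and args.catch_up) or (update_interval > 40*60)`; pending indexing therefore also runs whenever the update interval is long, regardless of state. A self-contained check of that precedence:

# `A and B or C` always equals `(A and B) or C`, never `A and (B or C)`.
for a in (True, False):
    for b in (True, False):
        for c in (True, False):
            assert (a and b or c) == ((a and b) or c)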
@@ -9,7 +9,6 @@ import psutil
from nominatim.db.connection import connect
from nominatim.db import status, properties
from nominatim.version import NOMINATIM_VERSION
-from nominatim.errors import UsageError

# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
@@ -21,14 +20,19 @@ LOG = logging.getLogger()
class SetupAll:
    """\
    Create a new Nominatim database from an OSM file.

+    This sub-command sets up a new Nominatim database from scratch starting
+    with creating a new database in Postgresql. The user running this command
+    needs superuser rights on the database.
    """

    @staticmethod
    def add_args(parser):
        group_name = parser.add_argument_group('Required arguments')
        group = group_name.add_mutually_exclusive_group(required=True)
-        group.add_argument('--osm-file', metavar='FILE',
-                           help='OSM file to be imported.')
+        group.add_argument('--osm-file', metavar='FILE', action='append',
+                           help='OSM file to be imported'
+                                ' (repeat for importing multiple files)')
        group.add_argument('--continue', dest='continue_at',
                           choices=['load-data', 'indexing', 'db-postprocess'],
                           help='Continue an import that was interrupted')
@@ -47,46 +51,35 @@ class SetupAll:
        group.add_argument('--ignore-errors', action='store_true',
                           help='Continue import even when errors in SQL are present')
        group.add_argument('--index-noanalyse', action='store_true',
-                          help='Do not perform analyse operations during index')
+                          help='Do not perform analyse operations during index (expert only)')


    @staticmethod
-    def run(args): # pylint: disable=too-many-statements
-        from ..tools import database_import, refresh, postcodes, freeze
+    def run(args):
+        from ..tools import database_import, refresh, postcodes, freeze, country_info
        from ..indexer.indexer import Indexer
        from ..tokenizer import factory as tokenizer_factory

-        if args.osm_file and not Path(args.osm_file).is_file():
-            LOG.fatal("OSM file '%s' does not exist.", args.osm_file)
-            raise UsageError('Cannot access file.')
+        country_info.setup_country_config(args.config)

        if args.continue_at is None:
+            files = args.get_osm_file_list()
+
            LOG.warning('Creating database')
            database_import.setup_database_skeleton(args.config.get_libpq_dsn(),
                                                    args.data_dir,
                                                    args.no_partitions,
                                                    rouser=args.config.DATABASE_WEBUSER)

+            LOG.warning('Setting up country tables')
+            country_info.setup_country_tables(args.config.get_libpq_dsn(),
+                                              args.data_dir,
+                                              args.no_partitions)
+
            LOG.warning('Importing OSM data file')
-            database_import.import_osm_data(Path(args.osm_file),
+            database_import.import_osm_data(files,
                                            args.osm2pgsql_options(0, 1),
                                            drop=args.no_updates,
                                            ignore_errors=args.ignore_errors)

-            with connect(args.config.get_libpq_dsn()) as conn:
-                LOG.warning('Create functions (1st pass)')
-                refresh.create_functions(conn, args.config, False, False)
-                LOG.warning('Create tables')
-                database_import.create_tables(conn, args.config,
-                                              reverse_only=args.reverse_only)
-                refresh.load_address_levels_from_file(conn, Path(args.config.ADDRESS_LEVEL_CONFIG))
-                LOG.warning('Create functions (2nd pass)')
-                refresh.create_functions(conn, args.config, False, False)
-                LOG.warning('Create table triggers')
-                database_import.create_table_triggers(conn, args.config)
-                LOG.warning('Create partition tables')
-                database_import.create_partition_tables(conn, args.config)
-                LOG.warning('Create functions (3rd pass)')
-                refresh.create_functions(conn, args.config, False, False)
+            SetupAll._setup_tables(args.config, args.reverse_only)

            LOG.warning('Importing wikipedia importance data')
            data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir)
@@ -105,12 +98,7 @@ class SetupAll:
                                  args.threads or psutil.cpu_count() or 1)

        LOG.warning("Setting up tokenizer")
-        if args.continue_at is None or args.continue_at == 'load-data':
-            # (re)initialise the tokenizer data
-            tokenizer = tokenizer_factory.create_tokenizer(args.config)
-        else:
-            # just load the tokenizer
-            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+        tokenizer = SetupAll._get_tokenizer(args.continue_at, args.config)

        if args.continue_at is None or args.continue_at == 'load-data':
            LOG.warning('Calculate postcodes')
@@ -131,33 +119,60 @@ class SetupAll:
            database_import.create_search_indices(conn, args.config,
                                                  drop=args.no_updates)
            LOG.warning('Create search index for default country names.')
-            database_import.create_country_names(conn, tokenizer,
-                                                 args.config.LANGUAGES)
+            country_info.create_country_names(conn, tokenizer,
+                                              args.config.LANGUAGES)
            conn.commit()
+        if args.no_updates:
+            freeze.drop_update_tables(conn)
        tokenizer.finalize_import(args.config)

+        LOG.warning('Recompute word counts')
+        tokenizer.update_statistics()
+
        webdir = args.project_dir / 'website'
        LOG.warning('Setup website at %s', webdir)
        with connect(args.config.get_libpq_dsn()) as conn:
            refresh.setup_website(webdir, args.config, conn)

-        with connect(args.config.get_libpq_dsn()) as conn:
-            try:
-                dbdate = status.compute_database_date(conn)
-                status.set_status(conn, dbdate)
-                LOG.info('Database is at %s.', dbdate)
-            except Exception as exc: # pylint: disable=broad-except
-                LOG.error('Cannot determine date of database: %s', exc)
-
-            properties.set_property(conn, 'database_version',
-                                    '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))
+        SetupAll._set_database_date(args.config.get_libpq_dsn())

        return 0


+    @staticmethod
+    def _setup_tables(config, reverse_only):
+        """ Set up the basic database layout: tables, indexes and functions.
+        """
+        from ..tools import database_import, refresh
+
+        with connect(config.get_libpq_dsn()) as conn:
+            LOG.warning('Create functions (1st pass)')
+            refresh.create_functions(conn, config, False, False)
+            LOG.warning('Create tables')
+            database_import.create_tables(conn, config, reverse_only=reverse_only)
+            refresh.load_address_levels_from_config(conn, config)
+            LOG.warning('Create functions (2nd pass)')
+            refresh.create_functions(conn, config, False, False)
+            LOG.warning('Create table triggers')
+            database_import.create_table_triggers(conn, config)
+            LOG.warning('Create partition tables')
+            database_import.create_partition_tables(conn, config)
+            LOG.warning('Create functions (3rd pass)')
+            refresh.create_functions(conn, config, False, False)


+    @staticmethod
+    def _get_tokenizer(continue_at, config):
+        """ Set up a new tokenizer or load an already initialised one.
+        """
+        from ..tokenizer import factory as tokenizer_factory
+
+        if continue_at is None or continue_at == 'load-data':
+            # (re)initialise the tokenizer data
+            return tokenizer_factory.create_tokenizer(config)
+
+        # just load the tokenizer
+        return tokenizer_factory.get_tokenizer_for_db(config)

    @staticmethod
    def _create_pending_index(conn, tablespace):
        """ Add a supporting index for finding places still to be indexed.
@@ -178,3 +193,19 @@ class SetupAll:
                      {} WHERE indexed_status > 0
                  """.format(tablespace))
        conn.commit()


+    @staticmethod
+    def _set_database_date(dsn):
+        """ Determine the database date and set the status accordingly.
+        """
+        with connect(dsn) as conn:
+            try:
+                dbdate = status.compute_database_date(conn)
+                status.set_status(conn, dbdate)
+                LOG.info('Database is at %s.', dbdate)
+            except Exception as exc: # pylint: disable=broad-except
+                LOG.error('Cannot determine date of database: %s', exc)
+
+            properties.set_property(conn, 'database_version',
+                                    '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))
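The extracted `_set_database_date` helper writes the version with an indexed format string. A worked example of that pattern with an invented four-part version tuple (the real value comes from `nominatim/version.py`):

version = (4, 0, 2, 0)  # invented stand-in for NOMINATIM_VERSION
assert '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(version) == '4.0.2-0'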
@@ -19,16 +19,42 @@ LOG = logging.getLogger()
class ImportSpecialPhrases:
    """\
    Import special phrases.

+    Special phrases are search terms that narrow down the type of object
+    that should be searched. For example, you might want to search for
+    'Hotels in Barcelona'. The OSM wiki has a selection of special phrases
+    in many languages, which can be imported with this command.
+
+    You can also provide your own phrases in a CSV file. The file needs to have
+    the following five columns:
+    * phrase - the term expected for searching
+    * class - the OSM tag key of the object type
+    * type - the OSM tag value of the object type
+    * operator - the kind of search to be done (one of: in, near, name, -)
+    * plural - whether the term is a plural or not (Y/N)
+
+    An example file can be found in the Nominatim sources at
+    'test/testdb/full_en_phrases_test.csv'.
+
+    The import can be further configured to ignore specific key/value pairs.
+    This is particularly useful when importing phrases from the wiki. The
+    default configuration excludes some very common tags like building=yes.
+    The configuration can be customized by putting a file `phrase-settings.json`
+    with custom rules into the project directory or by using the `--config`
+    option to point to another configuration file.
    """
    @staticmethod
    def add_args(parser):
        group = parser.add_argument_group('Input arguments')
        group.add_argument('--import-from-wiki', action='store_true',
-                          help='Import special phrases from the OSM wiki to the database.')
+                          help='Import special phrases from the OSM wiki to the database')
        group.add_argument('--import-from-csv', metavar='FILE',
-                          help='Import special phrases from a CSV file.')
+                          help='Import special phrases from a CSV file')
        group.add_argument('--no-replace', action='store_true',
-                          help='Keep the old phrases and only add the new ones.')
+                          help='Keep the old phrases and only add the new ones')
        group.add_argument('--config', action='store',
                           help='Configuration file for black/white listing '
                                '(default: phrase-settings.json)')

    @staticmethod
    def run(args):
@@ -56,5 +82,5 @@ class ImportSpecialPhrases:
        should_replace = not args.no_replace
        with connect(args.config.get_libpq_dsn()) as db_connection:
            SPImporter(
-                args.config, args.phplib_dir, db_connection, loader
+                args.config, db_connection, loader
            ).import_phrases(tokenizer, should_replace)
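The new docstring defines five CSV columns for custom phrases. A sketch of writing such a file with the standard library; the rows are invented samples, and the header row is an assumption, since the diff does not say whether one is required:

import csv

rows = [  # invented samples: phrase, class, type, operator, plural
    ('restaurant', 'amenity', 'restaurant', 'in', 'N'),
    ('restaurants', 'amenity', 'restaurant', 'in', 'Y'),
    ('hotel', 'tourism', 'hotel', 'near', 'N'),
]

with open('my_phrases.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['phrase', 'class', 'type', 'operator', 'plural'])  # assumed header
    writer.writerows(rows)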
@@ -4,6 +4,8 @@ Nominatim configuration accessor.
import logging
import os
from pathlib import Path
+import json
+import yaml

from dotenv import dotenv_values

@@ -11,6 +13,27 @@ from nominatim.errors import UsageError

LOG = logging.getLogger()


+def flatten_config_list(content, section=''):
+    """ Flatten YAML configuration lists that contain include sections
+        which are lists themselves.
+    """
+    if not content:
+        return []
+
+    if not isinstance(content, list):
+        raise UsageError(f"List expected in section '{section}'.")
+
+    output = []
+    for ele in content:
+        if isinstance(ele, list):
+            output.extend(flatten_config_list(ele, section))
+        else:
+            output.append(ele)
+
+    return output
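A short usage example for the `flatten_config_list` helper added above, with invented input:

nested = ['highway', ['residential', ['service', 'track']], 'building']
assert flatten_config_list(nested) == \
       ['highway', 'residential', 'service', 'track', 'building']
assert flatten_config_list(None) == []   # empty input yields an empty list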
class Configuration:
    """ Load and manage the project configuration.

@@ -33,12 +56,6 @@ class Configuration:
        if project_dir is not None and (project_dir / '.env').is_file():
            self._config.update(dotenv_values(str((project_dir / '.env').resolve())))

-        # Add defaults for variables that are left empty to set the default.
-        # They may still be overwritten by environment variables.
-        if not self._config['NOMINATIM_ADDRESS_LEVEL_CONFIG']:
-            self._config['NOMINATIM_ADDRESS_LEVEL_CONFIG'] = \
-                str(config_dir / 'address-levels.json')
-
        class _LibDirs:
            pass

@@ -53,7 +70,10 @@ class Configuration:
    def __getattr__(self, name):
        name = 'NOMINATIM_' + name

-        return self.environ.get(name) or self._config[name]
+        if name in self.environ:
+            return self.environ[name]
+
+        return self._config[name]

    def get_bool(self, name):
        """ Return the given configuration parameter as a boolean.
@@ -73,6 +93,23 @@ class Configuration:
            raise UsageError("Configuration error.") from exp


+    def get_path(self, name):
+        """ Return the given configuration parameter as a Path.
+            If a relative path is configured, then the function converts this
+            into an absolute path with the project directory as root path.
+            If the configuration is unset, a falsy value is returned.
+        """
+        value = self.__getattr__(name)
+        if value:
+            value = Path(value)
+
+            if not value.is_absolute():
+                value = self.project_dir / value
+
+            value = value.resolve()
+
+        return value

    def get_libpq_dsn(self):
        """ Get configured database DSN converted into the key/value format
            understood by libpq and psycopg.
@@ -103,7 +140,7 @@ class Configuration:
        if style in ('admin', 'street', 'address', 'full', 'extratags'):
            return self.config_dir / 'import-{}.style'.format(style)

-        return Path(style)
+        return self.find_config_file('', 'IMPORT_STYLE')

    def get_os_env(self):
@@ -114,3 +151,98 @@ class Configuration:
        env.update(self.environ)

        return env


+    def load_sub_configuration(self, filename, config=None):
+        """ Load additional configuration from a file. `filename` is the name
+            of the configuration file. The file is first searched in the
+            project directory and then in the global settings directory.
+
+            If `config` is set, then the name of the configuration file can
+            be additionally given through a .env configuration option. When
+            the option is set, then the file will be exclusively loaded as set:
+            if the name is an absolute path, the file name is taken as is,
+            if the name is relative, it is taken to be relative to the
+            project directory.
+
+            The format of the file is determined from the filename suffix.
+            Currently only files with extension '.yaml' are supported.
+
+            YAML files support a special '!include' construct. When the
+            directive is given, the value is taken to be a filename, the file
+            is loaded using this function and added at the position in the
+            configuration tree.
+        """
+        configfile = self.find_config_file(filename, config)
+
+        if configfile.suffix in ('.yaml', '.yml'):
+            return self._load_from_yaml(configfile)
+
+        if configfile.suffix == '.json':
+            with configfile.open('r') as cfg:
+                return json.load(cfg)
+
+        raise UsageError(f"Config file '{configfile}' has unknown format.")
+
+
+    def find_config_file(self, filename, config=None):
+        """ Resolve the location of a configuration file given a filename and
+            an optional configuration option with the file name.
+            Raises a UsageError when the file cannot be found or is not
+            a regular file.
+        """
+        if config is not None:
+            cfg_filename = self.__getattr__(config)
+            if cfg_filename:
+                cfg_filename = Path(cfg_filename)
+
+                if cfg_filename.is_absolute():
+                    cfg_filename = cfg_filename.resolve()
+
+                    if not cfg_filename.is_file():
+                        LOG.fatal("Cannot find config file '%s'.", cfg_filename)
+                        raise UsageError("Config file not found.")
+
+                    return cfg_filename
+
+                filename = cfg_filename
+
+        search_paths = [self.project_dir, self.config_dir]
+        for path in search_paths:
+            if path is not None and (path / filename).is_file():
+                return path / filename
+
+        LOG.fatal("Configuration file '%s' not found.\nDirectories searched: %s",
+                  filename, search_paths)
+        raise UsageError("Config file not found.")
+
+
+    def _load_from_yaml(self, cfgfile):
+        """ Load a YAML configuration file. This installs a special handler that
+            allows to include other YAML files using the '!include' operator.
+        """
+        yaml.add_constructor('!include', self._yaml_include_representer,
+                             Loader=yaml.SafeLoader)
+        return yaml.safe_load(cfgfile.read_text(encoding='utf-8'))
+
+
+    def _yaml_include_representer(self, loader, node):
+        """ Handler for the '!include' operator in YAML files.
+
+            When the filename is relative, then the file is first searched in the
+            project directory and then in the global settings directory.
+        """
+        fname = loader.construct_scalar(node)
+
+        if Path(fname).is_absolute():
+            configfile = Path(fname)
+        else:
+            configfile = self.find_config_file(loader.construct_scalar(node))
+
+        if configfile.suffix != '.yaml':
+            LOG.fatal("Format error while reading '%s': only YAML format supported.",
+                      configfile)
+            raise UsageError("Cannot handle config file format.")
+
+        return yaml.safe_load(configfile.read_text(encoding='utf-8'))
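A sketch of the new `get_path` accessor in use, assuming an already constructed `Configuration` named `config` with `project_dir` set to `/srv/nominatim` and `NOMINATIM_FLATNODE_FILE=flatnodes/nodes.bin` in the environment (all values invented for illustration):

flatnode = config.get_path('FLATNODE_FILE')
# The relative setting is resolved against the project directory:
# PosixPath('/srv/nominatim/flatnodes/nodes.bin')

if not config.get_path('FLATNODE_FILE'):
    print('no flatnode file configured')  # unset options stay falsy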
@@ -36,7 +36,7 @@ def _setup_tablespace_sql(config):
            tspace = getattr(config, 'TABLESPACE_{}_{}'.format(subset, kind))
            if tspace:
                tspace = 'TABLESPACE "{}"'.format(tspace)
-            out['{}_{}'.format(subset.lower, kind.lower())] = tspace
+            out['{}_{}'.format(subset.lower(), kind.lower())] = tspace

    return out

@@ -46,8 +46,10 @@ def _setup_postgresql_features(conn):
        depend on the database version.
    """
    pg_version = conn.server_version_tuple()
+    postgis_version = conn.postgis_version_tuple()
    return {
-        'has_index_non_key_column': pg_version >= (11, 0, 0)
+        'has_index_non_key_column': pg_version >= (11, 0, 0),
+        'spgist_geom' : 'SPGIST' if postgis_version >= (3, 0) else 'GIST'
    }

class SQLPreprocessor:
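The one-character fix above (`subset.lower` vs `subset.lower()`) is worth a second look: without the parentheses, the method object itself is formatted into the dictionary key instead of the lower-cased string. A minimal reproduction:

subset, kind = 'OSM', 'DATA'
broken = '{}_{}'.format(subset.lower, kind.lower())
fixed = '{}_{}'.format(subset.lower(), kind.lower())
print(broken)  # e.g. "<built-in method lower of str object at 0x...>_data"
print(fixed)   # "osm_data"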
@@ -91,6 +91,17 @@ class Indexer:
        self.num_threads = num_threads


+    def has_pending(self):
+        """ Check if any data still needs indexing.
+            This function must only be used after the import has finished.
+            Otherwise it will be very expensive.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.execute("SELECT 'a' FROM placex WHERE indexed_status > 0 LIMIT 1")
+                return cur.rowcount > 0


    def index_full(self, analyse=True):
        """ Index the complete database. This will first index boundaries
            followed by all other objects. When `analyse` is True, then the
nominatim/indexer/place_info.py (new file, 68 lines)
@@ -0,0 +1,68 @@
"""
Wrapper around place information the indexer gets from the database and hands to
the tokenizer.
"""

import psycopg2.extras

class PlaceInfo:
    """ Data class containing all information the tokenizer gets about a
        place it should process the names for.
    """

    def __init__(self, info):
        self._info = info


    def analyze(self, analyzer):
        """ Process this place with the given tokenizer and return the
            result in psycopg2-compatible Json.
        """
        return psycopg2.extras.Json(analyzer.process_place(self))


    @property
    def name(self):
        """ A dictionary with the names of the place or None if the place
            has no names.
        """
        return self._info.get('name')


    @property
    def address(self):
        """ A dictionary with the address elements of the place
            or None if no address information is available.
        """
        return self._info.get('address')


    @property
    def country_code(self):
        """ The country code of the country the place is in. Guaranteed
            to be a two-letter lower-case string or None, if no country
            could be found.
        """
        return self._info.get('country_code')


    @property
    def rank_address(self):
        """ The computed rank address before rank correction.
        """
        return self._info.get('rank_address')


    def is_a(self, key, value):
        """ Check if the place's primary tag corresponds to the given
            key and value.
        """
        return self._info.get('class') == key and self._info.get('type') == value


    def is_country(self):
        """ Check if the place is a valid country boundary.
        """
        return self.rank_address == 4 \
               and self.is_a('boundary', 'administrative') \
               and self.country_code is not None
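A short usage sketch for the new `PlaceInfo` wrapper, fed with an invented `placex` row in dictionary form:

from nominatim.indexer.place_info import PlaceInfo

row = {  # invented row as the indexer would fetch it
    'name': {'name': 'France'}, 'address': None, 'country_code': 'fr',
    'rank_address': 4, 'class': 'boundary', 'type': 'administrative',
}

place = PlaceInfo(row)
assert place.is_a('boundary', 'administrative')
assert place.is_country()  # rank 4 + admin boundary + country code present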
@@ -4,18 +4,21 @@ tasks.
"""
import functools

import psycopg2.extras
+from psycopg2 import sql as pysql

+from nominatim.indexer.place_info import PlaceInfo

# pylint: disable=C0111

+def _mk_valuelist(template, num):
+    return pysql.SQL(',').join([pysql.SQL(template)] * num)


class AbstractPlacexRunner:
    """ Returns SQL commands for indexing of the placex table.
    """
+    SELECT_SQL = pysql.SQL('SELECT place_id FROM placex ')
+    UPDATE_LINE = "(%s, %s::hstore, %s::hstore, %s::int, %s::jsonb)"

    def __init__(self, rank, analyzer):
        self.rank = rank
@@ -27,15 +30,16 @@ class AbstractPlacexRunner:
    def _index_sql(num_places):
        return pysql.SQL(
            """ UPDATE placex
-                SET indexed_status = 0, address = v.addr, token_info = v.ti
-                FROM (VALUES {}) as v(id, addr, ti)
+                SET indexed_status = 0, address = v.addr, token_info = v.ti,
+                    name = v.name, linked_place_id = v.linked_place_id
+                FROM (VALUES {}) as v(id, name, addr, linked_place_id, ti)
                WHERE place_id = v.id
-            """).format(_mk_valuelist("(%s, %s::hstore, %s::jsonb)", num_places))
+            """).format(_mk_valuelist(AbstractPlacexRunner.UPDATE_LINE, num_places))


    @staticmethod
    def get_place_details(worker, ids):
-        worker.perform("""SELECT place_id, (placex_prepare_update(placex)).*
+        worker.perform("""SELECT place_id, (placex_indexing_prepare(placex)).*
                          FROM placex WHERE place_id IN %s""",
                       (tuple((p[0] for p in ids)), ))

@@ -43,8 +47,9 @@ class AbstractPlacexRunner:
    def index_places(self, worker, places):
        values = []
        for place in places:
-            values.extend((place[x] for x in ('place_id', 'address')))
-            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+            for field in ('place_id', 'name', 'address', 'linked_place_id'):
+                values.append(place[field])
+            values.append(PlaceInfo(place).analyze(self.analyzer))

        worker.perform(self._index_sql(len(places)), values)

@@ -138,7 +143,7 @@ class InterpolationRunner:
        values = []
        for place in places:
            values.extend((place[x] for x in ('place_id', 'address')))
-            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+            values.append(PlaceInfo(place).analyze(self.analyzer))

        worker.perform(self._index_sql(len(places)), values)
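`_mk_valuelist` only repeats a placeholder template with comma separators. For two places and the new `UPDATE_LINE`, the composed SQL fragment expands as in the comment below (`as_string` needs a live connection, hence the comment):

from psycopg2 import sql as pysql

UPDATE_LINE = "(%s, %s::hstore, %s::hstore, %s::int, %s::jsonb)"

def _mk_valuelist(template, num):
    return pysql.SQL(',').join([pysql.SQL(template)] * num)

fragment = _mk_valuelist(UPDATE_LINE, 2)
# fragment.as_string(conn) ->
# "(%s, %s::hstore, %s::hstore, %s::int, %s::jsonb),(%s, %s::hstore, %s::hstore, %s::int, %s::jsonb)"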
232
nominatim/tokenizer/base.py
Normal file
232
nominatim/tokenizer/base.py
Normal file
@@ -0,0 +1,232 @@
|
||||
"""
|
||||
Abstract class defintions for tokenizers. These base classes are here
|
||||
mainly for documentation purposes.
|
||||
"""
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Tuple, Dict, Any
|
||||
|
||||
from nominatim.config import Configuration
|
||||
from nominatim.indexer.place_info import PlaceInfo
|
||||
|
||||
# pylint: disable=unnecessary-pass
|
||||
|
||||
class AbstractAnalyzer(ABC):
|
||||
""" The analyzer provides the functions for analysing names and building
|
||||
the token database.
|
||||
|
||||
Analyzers are instantiated on a per-thread base. Access to global data
|
||||
structures must be synchronised accordingly.
|
||||
"""
|
||||
|
||||
def __enter__(self) -> 'AbstractAnalyzer':
|
||||
return self
|
||||
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback) -> None:
|
||||
self.close()
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def close(self) -> None:
|
||||
""" Free all resources used by the analyzer.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
|
||||
""" Return token information for the given list of words.
|
||||
|
||||
The function is used for testing and debugging only
|
||||
and does not need to be particularly efficient.
|
||||
|
||||
Arguments:
|
||||
words: A list of words to look up the tokens for.
|
||||
If a word starts with # it is assumed to be a full name
|
||||
otherwise is a partial term.
|
||||
|
||||
Returns:
|
||||
The function returns the list of all tuples that could be
|
||||
found for the given words. Each list entry is a tuple of
|
||||
(original word, word token, word id).
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def normalize_postcode(self, postcode: str) -> str:
|
||||
""" Convert the postcode to its standardized form.
|
||||
|
||||
This function must yield exactly the same result as the SQL function
|
||||
`token_normalized_postcode()`.
|
||||
|
||||
Arguments:
|
||||
postcode: The postcode to be normalized.
|
||||
|
||||
Returns:
|
||||
The given postcode after normalization.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def update_postcodes_from_db(self) -> None:
|
||||
""" Update the tokenizer's postcode tokens from the current content
|
||||
of the `location_postcode` table.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def update_special_phrases(self, phrases: List[Tuple[str, str, str, str]],
|
||||
should_replace: bool) -> None:
|
||||
""" Update the tokenizer's special phrase tokens from the given
|
||||
list of special phrases.
|
||||
|
||||
Arguments:
|
||||
                phrases: The new list of special phrases. Each entry is
                    a tuple of (phrase, class, type, operator).
                should_replace: If true, replace the current list of phrases.
                    When false, just add the given phrases to the
                    ones that already exist.
        """
        pass


    @abstractmethod
    def add_country_names(self, country_code: str, names: Dict[str, str]):
        """ Add the given names to the tokenizer's list of country tokens.

            Arguments:
                country_code: two-letter country code for the country the names
                    refer to.
                names: Dictionary of name type to name.
        """
        pass


    @abstractmethod
    def process_place(self, place: PlaceInfo) -> Any:
        """ Extract tokens for the given place and compute the
            information to be handed to the PL/pgSQL processor for building
            the search index.

            Arguments:
                place: Place information retrieved from the database.

            Returns:
                A JSON-serialisable structure that will be handed into
                the database via the `token_info` field.
        """



class AbstractTokenizer(ABC):
    """ The tokenizer instance is the central instance of the tokenizer in
        the system. There will only be a single instance of the tokenizer
        active at any time.
    """

    @abstractmethod
    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
        """ Set up a new tokenizer for the database.

            The function should copy all necessary data into the project
            directory or save it in the property table to make sure that
            the tokenizer remains stable over updates.

            Arguments:
                config: Read-only object with configuration options.

                init_db: When set to False, then initialisation of database
                    tables should be skipped. This option is only required for
                    migration purposes and can be safely ignored by custom
                    tokenizers.

            TODO: can we move the init_db parameter somewhere else?
        """
        pass


    @abstractmethod
    def init_from_project(self, config: Configuration) -> None:
        """ Initialise the tokenizer from an existing database setup.

            The function should load all previously saved configuration from
            the project directory and/or the property table.

            Arguments:
                config: Read-only object with configuration options.
        """
        pass


    @abstractmethod
    def finalize_import(self, config: Configuration) -> None:
        """ This function is called at the very end of an import when all
            data has been imported and indexed. The tokenizer may create
            at this point any additional indexes and data structures needed
            during query time.

            Arguments:
                config: Read-only object with configuration options.
        """
        pass


    @abstractmethod
    def update_sql_functions(self, config: Configuration) -> None:
        """ Update the SQL part of the tokenizer. This function is called
            automatically on migrations or may be called explicitly by the
            user through the `nominatim refresh --functions` command.

            The tokenizer must only update the code of the tokenizer. The
            data structures or data itself must not be changed by this function.

            Arguments:
                config: Read-only object with configuration options.
        """
        pass


    @abstractmethod
    def check_database(self, config: Configuration) -> str:
        """ Check that the database is set up correctly and ready for being
            queried.

            Arguments:
                config: Read-only object with configuration options.

            Returns:
                If an issue was found, return an error message with the
                description of the issue as well as hints for the user on
                how to resolve the issue. If everything is okay, return `None`.
        """
        pass


    @abstractmethod
    def update_statistics(self) -> None:
        """ Recompute any tokenizer statistics necessary for efficient lookup.
            This function is meant to be called from time to time by the user
            to improve performance. However, the tokenizer must not depend on
            it being called in order to work.
        """
        pass


    @abstractmethod
    def name_analyzer(self) -> AbstractAnalyzer:
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.
        """
        pass
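Taken together, these abstract methods are the complete contract a custom tokenizer has to fulfil. As a purely illustrative sketch (not part of the code base), a minimal conforming subclass could stub the interface like this:

```
class NoOpTokenizer(AbstractTokenizer):
    """ Illustrative tokenizer that satisfies the interface but does nothing. """

    def init_new_db(self, config, init_db=True):
        pass  # a real tokenizer persists its configuration here

    def init_from_project(self, config):
        pass  # ... and restores that configuration here

    def finalize_import(self, config):
        pass  # optional: build query-time indexes

    def update_sql_functions(self, config):
        pass  # reinstall PL/pgSQL functions; never touch the data

    def check_database(self, config):
        return None  # None signals "everything okay"

    def update_statistics(self):
        pass  # purely an optimisation; must not be required for correctness

    def name_analyzer(self):
        raise NotImplementedError("a real tokenizer returns an AbstractAnalyzer")
```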
@@ -85,6 +85,6 @@ def get_tokenizer_for_db(config):
     tokenizer_module = _import_tokenizer(name)
 
     tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
-    tokenizer.init_from_project()
+    tokenizer.init_from_project(config)
 
     return tokenizer
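In practice, the factory and the analyzer's context-manager protocol combine as in this sketch; `config` is assumed to be an initialised Nominatim Configuration for an already imported database, and the factory module path is assumed from its location in the source tree:

```
from nominatim.tokenizer.factory import get_tokenizer_for_db

tokenizer = get_tokenizer_for_db(config)   # `config` assumed to exist

# Analyzers are context managers; close() runs automatically on exit.
with tokenizer.name_analyzer() as analyzer:
    info = analyzer.get_word_token_info(['#Main Street', 'main'])
```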
@@ -1,146 +0,0 @@
"""
Processor for names that are imported into the database based on the
ICU library.
"""
from collections import defaultdict
import itertools

from icu import Transliterator
import datrie

from nominatim.db.properties import set_property, get_property
from nominatim.tokenizer import icu_variants as variants

DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"


class ICUNameProcessorRules:
    """ Data object that saves the rules needed for the name processor.

        The rules can either be initialised through an ICURuleLoader or
        be loaded from a database when a connection is given.
    """
    def __init__(self, loader=None, conn=None):
        if loader is not None:
            self.norm_rules = loader.get_normalization_rules()
            self.trans_rules = loader.get_transliteration_rules()
            self.replacements = loader.get_replacement_pairs()
            self.search_rules = loader.get_search_rules()
        elif conn is not None:
            self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
            self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
            self.replacements = \
                variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
            self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
        else:
            assert False, "Parameter loader or conn required."


    def save_rules(self, conn):
        """ Save the rules in the property table of the given database.
            The rules can be loaded again by handing a connection to
            the constructor of the class.
        """
        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
        set_property(conn, DBCFG_IMPORT_REPLACEMENTS,
                     variants.pickle_variant_set(self.replacements))
        set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)


class ICUNameProcessor:
    """ Collects the different transformation rules for normalisation of names
        and provides the functions to apply the transformations.
    """

    def __init__(self, rules):
        self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                         rules.norm_rules)
        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
                                                       rules.trans_rules +
                                                       ";[:Space:]+ > ' '")
        self.search = Transliterator.createFromRules("icu_search",
                                                     rules.search_rules)

        # Intermediate reorder by source. Also compute required character set.
        immediate = defaultdict(list)
        chars = set()
        for variant in rules.replacements:
            if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
                replstr = variant.replacement[:-1]
            else:
                replstr = variant.replacement
            immediate[variant.source].append(replstr)
            chars.update(variant.source)
        # Then copy to datrie
        self.replacements = datrie.Trie(''.join(chars))
        for src, repllist in immediate.items():
            self.replacements[src] = repllist


    def get_normalized(self, name):
        """ Normalize the given name, i.e. remove all elements not relevant
            for search.
        """
        return self.normalizer.transliterate(name).strip()

    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized name
            and transliterate the result.
        """
        baseform = '^ ' + norm_name + ' ^'
        partials = ['']

        startpos = 0
        pos = 0
        force_space = False
        while pos < len(baseform):
            full, repl = self.replacements.longest_prefix_item(baseform[pos:],
                                                               (None, None))
            if full is not None:
                done = baseform[startpos:pos]
                partials = [v + done + r
                            for v, r in itertools.product(partials, repl)
                            if not force_space or r.startswith(' ')]
                if len(partials) > 128:
                    # If too many variants are produced, they are unlikely
                    # to be helpful. Only use the original term.
                    startpos = 0
                    break
                startpos = pos + len(full)
                if full[-1] == ' ':
                    startpos -= 1
                    force_space = True
                pos = startpos
            else:
                pos += 1
                force_space = False

        # No variants detected? Fast return.
        if startpos == 0:
            trans_name = self.to_ascii.transliterate(norm_name).strip()
            return [trans_name] if trans_name else []

        return self._compute_result_set(partials, baseform[startpos:])


    def _compute_result_set(self, partials, prefix):
        results = set()

        for variant in partials:
            vname = variant + prefix
            trans_name = self.to_ascii.transliterate(vname[1:-1]).strip()
            if trans_name:
                results.add(trans_name)

        return list(results)


    def get_search_normalized(self, name):
        """ Return the normalized version of the name (including transliteration)
            to be applied at search time.
        """
        return self.search.transliterate(' ' + name + ' ').strip()
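Before its removal, this processor performed greedy variant expansion over a character trie. A sketch of the observable behaviour, assuming a `rules` object whose replacement list contains the suffix variant 'straße ' expanding to both 'straße' and 'str':

```
# Illustrative only - `rules` is an assumed ICUNameProcessorRules object.
proc = ICUNameProcessor(rules)
norm = proc.get_normalized('Bahnhofstraße')    # e.g. 'bahnhofstraße'
proc.get_variants_ascii(norm)
# -> ['bahnhofstrasse', 'bahnhofstr']  (order undefined: a set is built internally)
```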
@@ -1,61 +1,86 @@
"""
Helper class to create ICU rules from a configuration file.
"""
import importlib
import io
import json
import logging
import itertools
from pathlib import Path
import re

import yaml
from icu import Transliterator

from nominatim.config import flatten_config_list
from nominatim.db.properties import set_property, get_property
from nominatim.errors import UsageError
import nominatim.tokenizer.icu_variants as variants
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
import nominatim.tools.country_info

LOG = logging.getLogger()

def _flatten_yaml_list(content):
    if not content:
        return []

    if not isinstance(content, list):
        raise UsageError("List expected in ICU yaml configuration.")

    output = []
    for ele in content:
        if isinstance(ele, list):
            output.extend(_flatten_yaml_list(ele))
        else:
            output.append(ele)

    return output

DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"


class VariantRule:
    """ Saves a single variant expansion.

        An expansion consists of the normalized replacement term and
        a dictionary of properties that describe when the expansion applies.
def _get_section(rules, section):
    """ Get the section named 'section' from the rules. If the section does
        not exist, raise a usage error with a meaningful message.
    """
    if section not in rules:
        LOG.fatal("Section '%s' not found in tokenizer config.", section)
        raise UsageError("Syntax error in tokenizer configuration file.")

    def __init__(self, replacement, properties):
        self.replacement = replacement
        self.properties = properties or {}
    return rules[section]


class ICURuleLoader:
    """ Compiler for ICU rules from a tokenizer configuration file.
    """

    def __init__(self, configfile):
        self.configfile = configfile
        self.variants = set()
    def __init__(self, config):
        rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                              config='TOKENIZER_CONFIG')

        if configfile.suffix == '.yaml':
            self._load_from_yaml()
        else:
            raise UsageError("Unknown format of tokenizer configuration.")
        # Make sure country information is available to analyzers and sanitizers.
        nominatim.tools.country_info.setup_country_config(config)

        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
        self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
        self.analysis_rules = _get_section(rules, 'token-analysis')
        self._setup_analysis()

        # Load optional sanitizer rule set.
        self.sanitizer_rules = rules.get('sanitizers', [])


    def load_config_from_db(self, conn):
        """ Get previously saved parts of the configuration from the
            database.
        """
        self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
        self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
        self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
        self._setup_analysis()


    def save_config_to_db(self, conn):
        """ Save the part of the configuration that cannot be changed into
            the database.
        """
        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
        set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))


    def make_sanitizer(self):
        """ Create a place sanitizer from the configured rules.
        """
        return PlaceSanitizer(self.sanitizer_rules)


    def make_token_analysis(self):
        """ Create a token analyser from the previously loaded rules.
        """
        return ICUTokenAnalysis(self.normalization_rules,
                                self.transliteration_rules, self.analysis)

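A short usage sketch of the reworked loader; `config` and `conn` are assumed to be an initialised Configuration and an open database connection:

```
loader = ICURuleLoader(config)           # reads icu_tokenizer.yaml
loader.save_config_to_db(conn)           # freeze the rules for this database

sanitizer = loader.make_sanitizer()      # cleans raw name/address tags
analysis = loader.make_token_analysis()  # normalises and expands names
```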
    def get_search_rules(self):
@@ -70,177 +95,66 @@ class ICURuleLoader:
        rules.write(self.transliteration_rules)
        return rules.getvalue()


    def get_normalization_rules(self):
        """ Return rules for normalisation of a term.
        """
        return self.normalization_rules


    def get_transliteration_rules(self):
        """ Return the rules for converting a string into its ascii representation.
        """
        return self.transliteration_rules

    def get_replacement_pairs(self):
        """ Return the list of possible compound decompositions with
            application of abbreviations included.
            The result is a list of pairs: the first item is the sequence to
            replace, the second is a list of replacements.

    def _setup_analysis(self):
        """ Process the rules used for creating the various token analyzers.
        """
        return self.variants
        self.analysis = {}

    def _yaml_include_representer(self, loader, node):
        value = loader.construct_scalar(node)
        if not isinstance(self.analysis_rules, list):
            raise UsageError("Configuration section 'token-analysis' must be a list.")

        if Path(value).is_absolute():
            content = Path(value).read_text()
        else:
            content = (self.configfile.parent / value).read_text()

        return yaml.safe_load(content)
        for section in self.analysis_rules:
            name = section.get('id', None)
            if name in self.analysis:
                if name is None:
                    LOG.fatal("ICU tokenizer configuration has two default token analyzers.")
                else:
                    LOG.fatal("ICU tokenizer configuration has two token "
                              "analyzers with id '%s'.", name)
                raise UsageError("Syntax error in ICU tokenizer config.")
            self.analysis[name] = TokenAnalyzerRule(section, self.normalization_rules)


    def _load_from_yaml(self):
        yaml.add_constructor('!include', self._yaml_include_representer,
                             Loader=yaml.SafeLoader)
        rules = yaml.safe_load(self.configfile.read_text())

        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
        self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
        self._parse_variant_list(self._get_section(rules, 'variants'))


    def _get_section(self, rules, section):
        """ Get the section named 'section' from the rules. If the section does
            not exist, raise a usage error with a meaningful message.
        """
        if section not in rules:
            LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
                      section, str(self.configfile))
            raise UsageError("Syntax error in tokenizer configuration file.")

        return rules[section]


    def _cfg_to_icu_rules(self, rules, section):
    @staticmethod
    def _cfg_to_icu_rules(rules, section):
        """ Load an ICU ruleset from the given section. If the section is a
            simple string, it is interpreted as a file name and the rules are
            loaded verbatim from the given file. The filename is expected to be
            relative to the tokenizer rule file. If the section is a list then
            each line is assumed to be a rule. All rules are concatenated and returned.
        """
        content = self._get_section(rules, section)
        content = _get_section(rules, section)

        if content is None:
            return ''

        return ';'.join(_flatten_yaml_list(content)) + ';'
        return ';'.join(flatten_config_list(content, section)) + ';'


    def _parse_variant_list(self, rules):
        self.variants.clear()

        if not rules:
            return

        rules = _flatten_yaml_list(rules)

        vmaker = _VariantMaker(self.normalization_rules)

        properties = []
        for section in rules:
            # Create the property field and deduplicate against existing
            # instances.
            props = variants.ICUVariantProperties.from_rules(section)
            for existing in properties:
                if existing == props:
                    props = existing
                    break
            else:
                properties.append(props)

            for rule in (section.get('words') or []):
                self.variants.update(vmaker.compute(rule, props))


class _VariantMaker:
    """ Generator for all necessary ICUVariants from a single variant rule.

        All text in rules is normalized to make sure the variants match later.
class TokenAnalyzerRule:
    """ Factory for a single analysis module. The class saves the configuration
        and creates a new token analyzer on request.
    """

    def __init__(self, norm_rules):
        self.norm = Transliterator.createFromRules("rule_loader_normalization",
                                                   norm_rules)
    def __init__(self, rules, normalization_rules):
        # Find the analysis module
        module_name = 'nominatim.tokenizer.token_analysis.' \
                      + _get_section(rules, 'analyzer').replace('-', '_')
        analysis_mod = importlib.import_module(module_name)
        self.create = analysis_mod.create


    def compute(self, rule, props):
        """ Generator for all ICUVariant tuples from a single variant rule.
        """
        parts = re.split(r'(\|)?([=-])>', rule)
        if len(parts) != 4:
            raise UsageError("Syntax error in variant rule: " + rule)

        decompose = parts[1] is None
        src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
        repl_terms = (self.norm.transliterate(t.strip()) for t in parts[3].split(','))

        # If the source should be kept, add a 1:1 replacement
        if parts[2] == '-':
            for src in src_terms:
                if src:
                    for froms, tos in _create_variants(*src, src[0], decompose):
                        yield variants.ICUVariant(froms, tos, props)

        for src, repl in itertools.product(src_terms, repl_terms):
            if src and repl:
                for froms, tos in _create_variants(*src, repl, decompose):
                    yield variants.ICUVariant(froms, tos, props)


    def _parse_variant_word(self, name):
        name = name.strip()
        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
            raise UsageError("Invalid variant word descriptor '{}'".format(name))
        norm_name = self.norm.transliterate(match.group(2))
        if not norm_name:
            return None

        return norm_name, match.group(1), match.group(3)


_FLAG_MATCH = {'^': '^ ',
               '$': ' ^',
               '': ' '}


def _create_variants(src, preflag, postflag, repl, decompose):
    if preflag == '~':
        postfix = _FLAG_MATCH[postflag]
        # suffix decomposition
        src = src + postfix
        repl = repl + postfix

        yield src, repl
        yield ' ' + src, ' ' + repl

        if decompose:
            yield src, ' ' + repl
            yield ' ' + src, repl
    elif postflag == '~':
        # prefix decomposition
        prefix = _FLAG_MATCH[preflag]
        src = prefix + src
        repl = prefix + repl

        yield src, repl
        yield src + ' ', repl + ' '

        if decompose:
            yield src, repl + ' '
            yield src + ' ', repl
    else:
        prefix = _FLAG_MATCH[preflag]
        postfix = _FLAG_MATCH[postflag]

        yield prefix + src + postfix, prefix + repl + postfix
        # Load the configuration.
        self.config = analysis_mod.configure(rules, normalization_rules)
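For context on the removed variant machinery: a rule such as `~straße => str` marks a decomposable suffix with `~` and replaces (rather than keeps) the source because `=>` is used without a preceding `|`. The expansion then proceeds as in this illustrative snippet:

```
# Illustrative expansion of the suffix rule '~straße => str'.
list(_create_variants('straße', '~', '', 'str', True))
# -> [('straße ', 'str '), (' straße ', ' str '),
#     ('straße ', ' str '), (' straße ', 'str ')]
# The variants that insert a space let a compound such as 'bahnhofstraße'
# also match its decomposed form 'bahnhof str'.
```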
nominatim/tokenizer/icu_token_analysis.py (new file, 23 lines)
@@ -0,0 +1,23 @@
"""
Container class collecting all components required to transform an OSM name
into a Nominatim token.
"""

from icu import Transliterator

class ICUTokenAnalysis:
    """ Container class collecting the transliterators and token analysis
        modules for a single NameAnalyser instance.
    """

    def __init__(self, norm_rules, trans_rules, analysis_rules):
        self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                         norm_rules)
        trans_rules += ";[:Space:]+ > ' '"
        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
                                                       trans_rules)
        self.search = Transliterator.createFromRules("icu_search",
                                                     norm_rules + trans_rules)

        self.analysis = {name: arules.create(self.to_ascii, arules.config)
                         for name, arules in analysis_rules.items()}
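A minimal sketch of the container in isolation, with hand-written ICU rule strings (assumed and heavily simplified; production rules are generated by the loader and are far larger):

```
# Assumed, simplified rules - illustrative only.
norm_rules = ":: Lower; :: NFC;"
trans_rules = ":: Any-Latin; :: Latin-ASCII;"

analysis = ICUTokenAnalysis(norm_rules, trans_rules, {})
analysis.normalizer.transliterate("Bahnhofstraße")   # 'bahnhofstraße'
analysis.to_ascii.transliterate("Bahnhofstraße")     # 'Bahnhofstrasse'
```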
@@ -2,22 +2,19 @@
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import json
import logging
import re
from textwrap import dedent
from pathlib import Path

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.indexer.place_info import PlaceInfo
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

LOG = logging.getLogger()
@@ -28,7 +25,7 @@ def create(dsn, data_dir):
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
@@ -37,9 +34,7 @@ class LegacyICUTokenizer:
    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None
        self.loader = None


    def init_new_db(self, config, init_db=True):
@@ -48,58 +43,67 @@ class LegacyICUTokenizer:
        This copies all necessary data in the project directory to make
        sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php)
        self._save_config(config)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
            self.loader.load_config_from_db(conn)


    def finalize_import(self, _):
    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self):
    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()
        # Will throw an error if there is an issue.
        self.init_from_project(config)

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' are missing."

        return None
    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()
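The jsonb expression in update_statistics() merges the freshly computed count into each word's existing metadata. In Python terms, the merge is equivalent to:

```
# Python equivalent of  info || jsonb_build_object('count', 42):
info = {'class': 'place'}        # existing word.info
info = {**info, 'count': 42}     # -> {'class': 'place', 'count': 42}
# Existing keys are preserved; 'count' is added or overwritten.
```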
    def name_analyzer(self):
@@ -117,7 +121,8 @@ class LegacyICUTokenizer:

        Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())


    def _install_php(self, phpdir):
@@ -126,21 +131,18 @@ class LegacyICUTokenizer:
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
            require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))


    def _save_config(self, config):
    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
            self.loader.save_config_to_db(conn)


    def _init_db_tables(self, config):
@@ -152,69 +154,23 @@ class LegacyICUTokenizer:
        sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
        conn.commit()

        LOG.warning("Precomputing word tokens")

        # get partial words and their frequencies
        words = self._count_partial_terms(conn)

        # copy them back into the word table
        with CopyBuffer() as copystr:
            for term, cnt in words.items():
                copystr.add('w', term, json.dumps({'count': cnt}))

            with conn.cursor() as cur:
                copystr.copy_out(cur, 'word',
                                 columns=['type', 'word_token', 'info'])
                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                               WHERE word_id is null and type = 'w'""")

        conn.commit()

    def _count_partial_terms(self, conn):
        """ Count the partial terms from the names in the place table.
        """
        words = Counter()
        name_proc = ICUNameProcessor(self.naming_rules)

        with conn.cursor(name="words") as cur:
            cur.execute(""" SELECT v, count(*) FROM
                              (SELECT svals(name) as v FROM place)x
                            WHERE length(v) < 75 GROUP BY v""")

            for name, cnt in cur:
                terms = set()
                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                    if ' ' in word:
                        terms.update(word.split())
                for term in terms:
                    words[term] += cnt

        return words


class LegacyICUNameAnalyzer:
class LegacyICUNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.name_processor = name_proc
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
@@ -223,6 +179,19 @@ class LegacyICUNameAnalyzer:
        self.conn = None


    def _search_normalized(self, name):
        """ Return the search token transliteration of the given name.
        """
        return self.token_analysis.search.transliterate(name).strip()


    def _normalized(self, name):
        """ Return the normalized version of the given name with all
            non-relevant information removed.
        """
        return self.token_analysis.normalizer.transliterate(name).strip()


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name
@@ -238,9 +207,9 @@ class LegacyICUNameAnalyzer:
        partial_tokens = {}
        for word in words:
            if word.startswith('#'):
                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
                full_tokens[word] = self._search_normalized(word[1:])
            else:
                partial_tokens[word] = self.name_processor.get_search_normalized(word)
                partial_tokens[word] = self._search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
@@ -271,7 +240,7 @@ class LegacyICUNameAnalyzer:

        This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)
        return self._search_normalized(hnr)

    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
@@ -294,7 +263,7 @@ class LegacyICUNameAnalyzer:
            if postcode is None:
                to_delete.append(word)
            else:
                copystr.add(self.name_processor.get_search_normalized(postcode),
                copystr.add(self._search_normalized(postcode),
                            'P', postcode)

        if to_delete:
@@ -312,7 +281,7 @@ class LegacyICUNameAnalyzer:
        completely replaced. Otherwise the phrases are added to the
        already existing ones.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
@@ -342,7 +311,7 @@ class LegacyICUNameAnalyzer:
        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                term = self._search_normalized(word)
                if term:
                    copystr.add(term, 'S', word,
                                json.dumps({'class': cls, 'type': typ,
@@ -376,9 +345,21 @@ class LegacyICUNameAnalyzer:
    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        # Make sure any name preprocessing for country names applies.
        info = PlaceInfo({'name': names, 'country_code': country_code,
                          'rank_address': 4, 'class': 'boundary',
                          'type': 'administrative'})
        self._add_country_full_names(country_code,
                                     self.sanitizer.process_names(info)[0])


    def _add_country_full_names(self, country_code, names):
        """ Add names for the given country from an already sanitized
            name list.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            norm_name = self.name_processor.get_search_normalized(name)
        for name in names:
            norm_name = self._search_normalized(name.name)
            if norm_name:
                word_tokens.add(norm_name)

@@ -404,23 +385,21 @@ class LegacyICUNameAnalyzer:
    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            Returns a JSON-serializable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')
        names, address = self.sanitizer.process_names(place)

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)
            if place.is_country():
                self._add_country_full_names(place.country_code, names)

        address = place.get('address')
        if address:
            self._process_place_address(token_info, address)

@@ -430,18 +409,18 @@ class LegacyICUNameAnalyzer:
    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for key, value in address.items():
            if key == 'postcode':
                self._add_postcode(value)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(*self._compute_name_tokens({'name': value}))
            elif key == 'place':
                token_info.add_place(*self._compute_name_tokens({'name': value}))
            elif not key.startswith('_') and \
                 key not in ('country', 'full'):
                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
        for item in address:
            if item.kind == 'postcode':
                self._add_postcode(item.name)
            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(item.name)
            elif item.kind == 'street':
                token_info.add_street(self._compute_partial_tokens(item.name))
            elif item.kind == 'place':
                token_info.add_place(self._compute_partial_tokens(item.name))
            elif not item.kind.startswith('_') and \
                 item.kind not in ('country', 'full'):
                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
@@ -451,28 +430,61 @@ class LegacyICUNameAnalyzer:
            token_info.add_address_terms(addr_terms)


    def _compute_partial_tokens(self, name):
        """ Normalize the given term, split it into partial words and return
            the token list for them.
        """
        norm_name = self._search_normalized(name)

        tokens = []
        need_lookup = []
        for partial in norm_name.split():
            token = self._cache.partials.get(partial)
            if token:
                tokens.append(token)
            else:
                need_lookup.append(partial)

        if need_lookup:
            with self.conn.cursor() as cur:
                cur.execute("""SELECT word, getorcreate_partial_word(word)
                               FROM unnest(%s) word""",
                            (need_lookup, ))

                for partial, token in cur:
                    tokens.append(token)
                    self._cache.partials[partial] = token

        return tokens
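The per-analyzer cache makes repeated lookups cheap: only the first call for a given partial word issues SQL. An illustrative sketch, with `analyzer` assumed to be a connected LegacyICUNameAnalyzer:

```
tokens = analyzer._compute_partial_tokens('Market Square')  # SQL via getorcreate_partial_word()
tokens = analyzer._compute_partial_tokens('Market Square')  # served from self._cache.partials
```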
    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
        for name in names:
            analyzer_id = name.get_attr('analyzer')
            norm_name = self._normalized(name.name)
            if analyzer_id is None:
                token_id = norm_name
            else:
                token_id = f'{norm_name}@{analyzer_id}'

            full, part = self._cache.names.get(token_id, (None, None))
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)
                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                if not variants:
                    continue

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                                (token_id, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)
                self._cache.names[token_id] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)
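The new cache key keeps names processed by a named analyzer separate from the default analysis of the same spelling:

```
# Illustration of the token_id computed above (values assumed).
norm_name = 'rue de la paix'
token_id = norm_name            # when name.get_attr('analyzer') is None
token_id = f'{norm_name}@fr'    # when the name carries analyzer id 'fr'
```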
@@ -480,23 +492,6 @@ class LegacyICUNameAnalyzer:
        return full_tokens, partial_tokens


    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full name word ids to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)

                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())

        return full_names


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
@@ -504,7 +499,7 @@ class LegacyICUNameAnalyzer:
        postcode = self.normalize_postcode(postcode)

        if postcode not in self._cache.postcodes:
            term = self.name_processor.get_search_normalized(postcode)
            term = self._search_normalized(postcode)
            if not term:
                return

@@ -563,30 +558,25 @@ class _TokenInfo:
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, fulls, _):
    def add_street(self, tokens):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)
        if tokens:
            self.data['street'] = self._mk_array(tokens)


    def add_place(self, fulls, partials):
    def add_place(self, tokens):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)
        if tokens:
            self.data['place'] = self._mk_array(tokens)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]
        tokens = {key: self._mk_array(partials)
                  for key, partials in terms if partials}

        if tokens:
            self.data['addr'] = tokens

@@ -600,6 +590,7 @@ class _TokenCache:
    """
    def __init__(self):
        self.names = {}
        self.partials = {}
        self.postcodes = set()
        self.housenumbers = {}
@@ -1,57 +0,0 @@
"""
Data structures for saving variant expansions for ICU tokenizer.
"""
from collections import namedtuple
import json

_ICU_VARIANT_PORPERTY_FIELDS = ['lang']


class ICUVariantProperties(namedtuple('_ICUVariantProperties', _ICU_VARIANT_PORPERTY_FIELDS)):
    """ Data container for saving properties that describe when a variant
        should be applied.

        Property instances are hashable.
    """
    @classmethod
    def from_rules(cls, _):
        """ Create a new property type from a generic dictionary.

            The function only takes into account the properties that are
            understood presently and ignores all others.
        """
        return cls(lang=None)


ICUVariant = namedtuple('ICUVariant', ['source', 'replacement', 'properties'])


def pickle_variant_set(variants):
    """ Serializes an iterable of variant rules to a string.
    """
    # Create a list of property sets, so they don't need to be duplicated.
    properties = {}
    pid = 1
    for variant in variants:
        if variant.properties not in properties:
            properties[variant.properties] = pid
            pid += 1

    # Convert the variants into a simple list.
    variants = [(v.source, v.replacement, properties[v.properties]) for v in variants]

    # Convert everything to JSON.
    return json.dumps({'properties': {v: k._asdict() for k, v in properties.items()},
                       'variants': variants})


def unpickle_variant_set(variant_string):
    """ Deserializes a variant string that was previously created with
        pickle_variant_set() into a set of ICUVariants.
    """
    data = json.loads(variant_string)

    properties = {int(k): ICUVariantProperties.from_rules(v)
                  for k, v in data['properties'].items()}

    return set((ICUVariant(src, repl, properties[pid]) for src, repl, pid in data['variants']))
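For reference, the removed serialisation produced a compact JSON document in which property sets are deduplicated and referenced by id, along these lines:

```
# Illustrative round-trip of the removed format.
variants = {ICUVariant('street ', 'st ', ICUVariantProperties(lang=None))}
pickle_variant_set(variants)
# -> '{"properties": {"1": {"lang": null}}, "variants": [["street ", "st ", 1]]}'
```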
@@ -16,6 +16,7 @@ from nominatim.db import properties
from nominatim.db import utils as db_utils
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.errors import UsageError
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
@@ -76,7 +77,7 @@ def _check_module(module_dir, conn):
        raise UsageError("Database module cannot be accessed.") from err


class LegacyTokenizer:
class LegacyTokenizer(AbstractTokenizer):
    """ The legacy tokenizer uses a special PostgreSQL module to normalize
        names and queries. The tokenizer thus implements normalization through
        calls to the database.
@@ -112,7 +113,7 @@ class LegacyTokenizer:
        self._init_db_tables(config)


    def init_from_project(self):
    def init_from_project(self, _):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
@@ -141,7 +142,7 @@ class LegacyTokenizer:
                              modulepath=modulepath)


    def check_database(self):
    def check_database(self, _):
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
@@ -185,6 +186,25 @@ class LegacyTokenizer:
            self._save_config(conn, config)


    def update_statistics(self):
        """ Recompute the frequency of full words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word SET search_name_count = count
                                   FROM word_frequencies
                                   WHERE word_token like ' %' and word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
@@ -238,7 +258,7 @@ class LegacyTokenizer:
        properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


class LegacyNameAnalyzer:
class LegacyNameAnalyzer(AbstractAnalyzer):
    """ The legacy analyzer uses the special Postgresql module for
        splitting names.

@@ -255,14 +275,6 @@ class LegacyNameAnalyzer:
        self._cache = _TokenCache(self.conn)


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
@@ -412,16 +424,15 @@ class LegacyNameAnalyzer:
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')
        names = place.name

        if names:
            token_info.add_names(self.conn, names)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)
            if place.is_country():
                self.add_country_names(place.country_code, names)

        address = place.get('address')
        address = place.address
        if address:
            self._process_place_address(token_info, address)
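The switch from raw dicts to PlaceInfo is what enables the simpler calls above: `place.get('name')` becomes `place.name`, and the regex check on `country_feature` collapses into `place.is_country()`. An illustrative construction, using the same dict layout as add_country_names() earlier in this diff:

```
place = PlaceInfo({'name': {'name': 'Deutschland'},
                   'country_code': 'de',
                   'rank_address': 4,
                   'class': 'boundary', 'type': 'administrative'})
assert place.is_country()
```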
nominatim/tokenizer/place_sanitizer.py (new file, 127 lines)
@@ -0,0 +1,127 @@
"""
Handler for cleaning name and address tags in place information before it
is handed to the token analysis.
"""
import importlib

from nominatim.errors import UsageError

class PlaceName:
    """ A searchable name for a place together with properties.
        Every name object saves the name proper and two basic properties:
        * 'kind' describes the name of the OSM key used without any suffixes
          (i.e. the part after the colon removed)
        * 'suffix' contains the suffix of the OSM tag, if any. The suffix
          is the part of the key after the first colon.
        In addition to that, the name may have arbitrary additional attributes.
        Which attributes are used depends on the token analyser.
    """

    def __init__(self, name, kind, suffix):
        self.name = name
        self.kind = kind
        self.suffix = suffix
        self.attr = {}


    def __repr__(self):
        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"


    def clone(self, name=None, kind=None, suffix=None, attr=None):
        """ Create a deep copy of the place name, optionally with the
            given parameters replaced. In the attribute list only the given
            keys are updated. The list is not replaced completely.
            In particular, the function cannot be used to remove an
            attribute from a place name.
        """
        newobj = PlaceName(name or self.name,
                           kind or self.kind,
                           suffix or self.suffix)

        newobj.attr.update(self.attr)
        if attr:
            newobj.attr.update(attr)

        return newobj


    def set_attr(self, key, value):
        """ Add the given property to the name. If the property was already
            set, then the value is overwritten.
        """
        self.attr[key] = value


    def get_attr(self, key, default=None):
        """ Return the given property or the value of 'default' if it
            is not set.
        """
        return self.attr.get(key, default)


    def has_attr(self, key):
        """ Check if the given attribute is set.
        """
        return key in self.attr
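Example of the PlaceName API in action; note that clone() copies the attribute dictionary rather than aliasing it:

```
name = PlaceName('München', 'name', 'de')    # e.g. from an OSM tag 'name:de'
copy = name.clone(attr={'analyzer': 'de'})
copy.get_attr('analyzer')    # 'de'
name.has_attr('analyzer')    # False - clone() copied, it did not alias
```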
class _ProcessInfo:
    """ Container class for information handed in to the handler functions.
        The 'names' and 'address' members are mutable. A handler must change
        them by either modifying the lists in place or replacing the old content
        with a new list.
    """

    def __init__(self, place):
        self.place = place
        self.names = self._convert_name_dict(place.name)
        self.address = self._convert_name_dict(place.address)


    @staticmethod
    def _convert_name_dict(names):
        """ Convert a dictionary of names into a list of PlaceNames.
            The dictionary key is split into the primary part of the key
            and the suffix (the part after an optional colon).
        """
        out = []

        if names:
            for key, value in names.items():
                parts = key.split(':', 1)
                out.append(PlaceName(value.strip(),
                                     parts[0].strip(),
                                     parts[1].strip() if len(parts) > 1 else None))

        return out


class PlaceSanitizer:
    """ Controller class which applies sanitizer functions on the place
        names and address before they are used by the token analysers.
    """

    def __init__(self, rules):
        self.handlers = []

        if rules:
            for func in rules:
                if 'step' not in func:
                    raise UsageError("Sanitizer rule is missing the 'step' attribute.")
                module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
                handler_module = importlib.import_module(module_name)
                self.handlers.append(handler_module.create(func))


    def process_names(self, place):
        """ Extract a sanitized list of names and address parts from the
            given place. The function returns a tuple
            (list of names, list of address names).
        """
        obj = _ProcessInfo(place)

        for func in self.handlers:
            func(obj)

        return obj.names, obj.address
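Putting the sanitizer to work takes a rule list of 'step' entries; each step name maps to a module under nominatim/tokenizer/sanitizers/. The configuration below is an assumed example, and `place` is assumed to be a PlaceInfo object from the indexer:

```
rules = [{'step': 'split-name-list'},       # split 'a;b' style name lists
         {'step': 'strip-brace-terms'}]     # 'Foo (Bar)' also yields 'Foo'
sanitizer = PlaceSanitizer(rules)

names, address = sanitizer.process_names(place)
```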
Some files were not shown because too many files have changed in this diff.