forked from hans/Nominatim

Compare commits: helm-chart...v4.0.2 (175 commits)
Commits in this comparison, by SHA1:

```
e943a2c8a4 95958458c6 3c703c3f14 cb66887c3b e56add9888
9628df3031 423f338d04 3a2597e5c4 641f261495 5884a6e7a6
10e979e841 8dc1441635 c79dcfad9a 1886952666 7326b246b7
345c812e43 fd4ba3989e e2d2571ad0 d479a0585d addfae31b6
ccf61db726 5b86b2078a a069479340 d11bf9288e 86eeb4d2ed
2275fe59ab 48be8c33ba d3d07128b2 37eeccbf4c 1722fc537f
b240b182cb c0f347fc8c 53dbe58ada 2c4b798f9b 1cf14a8e94
4864bf1509 9934421442 d7267c1603 5c778c6d32 85797acf1e
c4f5c11a4e 5a1c3dbea3 8e439d3dd9 9ebf921c53 7bd9094aaa
16cc395f78 13e7398566 8b90ee4364 1098ab732f 507fdd4f40
0ae8d7ac08 c77df2d1eb cefae021db 771aee8cd8 2d13d8b3b6
c1fa70639b 12643c5986 a0f5613a23 824562357b ec7184c533
e8e2502e2f c86cfefc48 2635fe8b4c 632436d54d 74be6828dd
f4acfed48f 91e1c1bea8 bbb9a41ea4 f6418887b2 a3f8a097a1
751563644f e52b801cd0 445a6428a6 d59b26dad7 47417d1871
381aecb952 45344575c6 83381625bd 552fb16cb2 75c631f080
e2464fdf62 9ff98073db 98ee5def37 3649487f5e 4b007ae740
6c79a60e19 2a94bfc703 299934fd2a b18d042832 97a10ec218
d35400a7d7 92f6ec2328 9ba2019470 c171d88194 7cfcbacfc7
52847b61a3 5a36559834 19d4e047f6 6b348d43c6 732cd27d2e
8171fe4571 16daa57e47 5e5addcdbf be65c8303f 231250f2eb
d44a428b74 40f9d52ad8 7f3b05c179 09c9fad6c3 bb18479d5b
779ea8ac62 bd7c7ddad0 c6fdcf9b0d 59fe74ddf6 6d7c067461
316205e455 834ae0a93f d562f11298 972628c751 09b1db63f4
e9d54f752c c335025167 2b2109c89a 56124546a6 336258ecf8
b894d2c04a 8e1d4818ac 28c98584c1 1c42780bb5 18554dfed7
2e493fec46 98c2e08add 94d3dee369 7e7dd769fd 79da96b369
78fcabade8 284645f505 0b349761a8 d18794931a b7d4ff3201
4c6d674e03 2c97af8021 832f75a55e 4e77969545 6ebbbfee61
0fabeefc3e c70d72f06b cc141bf1a5 199532c802 28ee3d0949
925195725d f6d22df76e 118858a55e 656c1291b1 f00b8dd1c3
5f2b9e317a 4ae5ba7fc4 3656eed9ad 2e82a6ce03 c4b8a3b768
1147b83b22 0fb8eade13 78d11fe628 90b40fc3e6 e25e268e2e
68bff31cc9 31d9545702 e449071a35 23e3724abb 75a5c7013f
56d24085f9 95b82af42a 87dedde5d6 8b6489c60e bf4f05fff3
```
@@ -7,6 +7,8 @@ assignees: ''
 
 ---
 
+<!-- Note: this template is for reporting problems with searching. If you have found an issue with the data, you need to report/fix the issue directly in OpenStreetMap. See https://www.openstreetmap.org/fixthemap for details. -->
+
 ## What did you search for?
 
 <!-- Please try to provide a link to your search. You can go to https://nominatim.openstreetmap.org and repeat your search there. If you originally found the issue somewhere else, please tell us what software/website you were using. -->
@@ -15,11 +17,11 @@ assignees: ''
 
 ## What result did you expect?
 
-**Is the result in the right place and just named wrongly?**
+**When the result is in the right place and just named wrongly:**
 
 <!-- Please tell us the display name you expected. -->
 
-**Is the result missing completely?**
+**When the result is missing completely:**
 
 <!-- Make sure that the data you are looking for is in OpenStreetMap. Provide a link to the OpenStreetMap object or if you cannot get it, a link to the map on https://openstreetmap.org where you expect the result to be.
.github/workflows/ci-tests.yml (vendored, 235 lines changed)
@@ -3,7 +3,38 @@ name: CI Tests
 on: [ push, pull_request ]
 
 jobs:
+  create-archive:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          submodules: true
+
+      - uses: actions/cache@v2
+        with:
+          path: |
+            data/country_osm_grid.sql.gz
+          key: nominatim-country-data-1
+
+      - name: Package tarball
+        run: |
+          if [ ! -f data/country_osm_grid.sql.gz ]; then
+            wget --no-verbose -O data/country_osm_grid.sql.gz https://www.nominatim.org/data/country_grid.sql.gz
+          fi
+          cd ..
+          tar czf nominatim-src.tar.bz2 Nominatim
+          mv nominatim-src.tar.bz2 Nominatim
+
+      - name: 'Upload Artifact'
+        uses: actions/upload-artifact@v2
+        with:
+          name: full-source
+          path: nominatim-src.tar.bz2
+          retention-days: 1
+
   tests:
+    needs: create-archive
     strategy:
       matrix:
         ubuntu: [18, 20]
@@ -22,10 +53,12 @@ jobs:
     runs-on: ubuntu-${{ matrix.ubuntu }}.04
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/download-artifact@v2
         with:
-          submodules: true
+          name: full-source
+          path: Nominatim
+
+      - name: Unpack Nominatim
+        run: tar xf nominatim-src.tar.bz2
 
       - name: Setup PHP
         uses: shivammathur/setup-php@v2
@@ -39,18 +72,6 @@ jobs:
           python-version: 3.6
         if: matrix.ubuntu == 18
 
-      - name: Get Date
-        id: get-date
-        run: |
-          echo "::set-output name=date::$(/bin/date -u "+%Y%W")"
-        shell: bash
-
-      - uses: actions/cache@v2
-        with:
-          path: |
-            country_grid.sql.gz
-          key: nominatim-country-data-${{ steps.get-date.outputs.date }}
-
       - uses: ./Nominatim/.github/actions/setup-postgresql
         with:
           postgresql-version: ${{ matrix.postgresql }}
@@ -65,8 +86,7 @@ jobs:
         if: matrix.ubuntu == 20
 
       - name: Install test prerequsites
-        run: |
-          pip3 install pylint==2.6.0 pytest pytest-cov behave==1.2.6
+        run: pip3 install pylint==2.6.0 pytest pytest-cov behave==1.2.6
         if: matrix.ubuntu == 18
 
       - name: PHP linting
@@ -103,11 +123,6 @@ jobs:
         working-directory: Nominatim/test/bdd
         if: matrix.ubuntu == 18
 
-      - name: BDD tests (legacy_icu tokenizer)
-        run: |
-          behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=legacy_icu --format=progress3
-        working-directory: Nominatim/test/bdd
-
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v1
         with:
@@ -119,43 +134,35 @@ jobs:
           verbose: true
         if: matrix.ubuntu == 20
 
-  import:
+  icu-test:
+    needs: create-archive
     strategy:
       matrix:
-        ubuntu: [18, 20]
+        ubuntu: [20]
         include:
-          - ubuntu: 18
-            postgresql: 9.5
-            postgis: 2.5
           - ubuntu: 20
             postgresql: 13
             postgis: 3
+            pytest: py.test-3
+            php: 7.4
 
     runs-on: ubuntu-${{ matrix.ubuntu }}.04
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/download-artifact@v2
         with:
-          submodules: true
+          name: full-source
+          path: Nominatim
 
-      - name: Get Date
-        id: get-date
-        run: |
-          echo "::set-output name=date::$(/bin/date -u "+%Y%W")"
-        shell: bash
+      - name: Unpack Nominatim
+        run: tar xf nominatim-src.tar.bz2
 
-      - uses: actions/cache@v2
+      - name: Setup PHP
+        uses: shivammathur/setup-php@v2
         with:
-          path: |
-            country_grid.sql.gz
-          key: nominatim-country-data-${{ steps.get-date.outputs.date }}
-
-      - uses: actions/cache@v2
-        with:
-          path: |
-            monaco-latest.osm.pbf
-          key: nominatim-test-data-${{ steps.get-date.outputs.date }}
+          php-version: ${{ matrix.php }}
+          coverage: xdebug
+          tools: phpunit, phpcs, composer
 
       - uses: actions/setup-python@v2
         with:
@@ -166,52 +173,148 @@ jobs:
         with:
           postgresql-version: ${{ matrix.postgresql }}
           postgis-version: ${{ matrix.postgis }}
 
       - uses: ./Nominatim/.github/actions/build-nominatim
         with:
           ubuntu: ${{ matrix.ubuntu }}
 
-      - name: Clean installation
-        run: rm -rf Nominatim build
+      - name: Install test prerequsites
+        run: sudo apt-get install -y -qq python3-behave
+        if: matrix.ubuntu == 20
+
+      - name: Install test prerequsites
+        run: pip3 install behave==1.2.6
+        if: matrix.ubuntu == 18
+
+      - name: BDD tests (icu tokenizer)
+        run: |
+          behave -DREMOVE_TEMPLATE=1 -DBUILDDIR=$GITHUB_WORKSPACE/build -DTOKENIZER=icu --format=progress3
+        working-directory: Nominatim/test/bdd
+
+
+  install:
+    runs-on: ubuntu-latest
+    needs: create-archive
+
+    strategy:
+      matrix:
+        name: [Ubuntu-18, Ubuntu-20, Centos-8]
+        include:
+          - name: Ubuntu-18
+            flavour: ubuntu
+            image: "ubuntu:18.04"
+            ubuntu: 18
+            install_mode: install-nginx
+          - name: Ubuntu-20
+            flavour: ubuntu
+            image: "ubuntu:20.04"
+            ubuntu: 20
+            install_mode: install-apache
+          - name: Centos-8
+            flavour: centos
+            image: "centos:8"
+
+    container:
+      image: ${{ matrix.image }}
+      env:
+        LANG: en_US.UTF-8
+
+    defaults:
+      run:
+        shell: sudo -Hu nominatim bash --noprofile --norc -eo pipefail {0}
+
+    steps:
+      - name: Prepare container (Ubuntu)
+        run: |
+          export APT_LISTCHANGES_FRONTEND=none
+          export DEBIAN_FRONTEND=noninteractive
+          apt-get update -qq
+          apt-get install -y git sudo wget
+          ln -snf /usr/share/zoneinfo/$CONTAINER_TIMEZONE /etc/localtime && echo $CONTAINER_TIMEZONE > /etc/timezone
         shell: bash
+        if: matrix.flavour == 'ubuntu'
+
+      - name: Prepare container (CentOS)
+        run: |
+          dnf update -y
+          dnf install -y sudo glibc-langpack-en
+        shell: bash
+        if: matrix.flavour == 'centos'
+
+      - name: Setup import user
+        run: |
+          useradd -m nominatim
+          echo 'nominatim ALL=(ALL:ALL) NOPASSWD: ALL' > /etc/sudoers.d/nominiatim
+          echo "/home/nominatim/Nominatim/vagrant/Install-on-${OS}.sh no $INSTALL_MODE" > /home/nominatim/vagrant.sh
+        shell: bash
+        env:
+          OS: ${{ matrix.name }}
+          INSTALL_MODE: ${{ matrix.install_mode }}
+
+      - uses: actions/download-artifact@v2
+        with:
+          name: full-source
+          path: /home/nominatim
+
+      - name: Install Nominatim
+        run: |
+          export USERNAME=nominatim
+          export USERHOME=/home/nominatim
+          export NOSYSTEMD=yes
+          export HAVE_SELINUX=no
+          tar xf nominatim-src.tar.bz2
+          . vagrant.sh
+        working-directory: /home/nominatim
 
       - name: Prepare import environment
         run: |
-          if [ ! -f monaco-latest.osm.pbf ]; then
-            wget --no-verbose https://download.geofabrik.de/europe/monaco-latest.osm.pbf
-          fi
-          mkdir data-env
-          cd data-env
-        shell: bash
+          mv Nominatim/test/testdb/apidb-test-data.pbf test.pbf
+          rm -rf Nominatim
+          mkdir data-env-reverse
+        working-directory: /home/nominatim
+
+      - name: Prepare import environment (CentOS)
+        run: |
+          sudo ln -s /usr/local/bin/nominatim /usr/bin/nominatim
+          echo NOMINATIM_DATABASE_WEBUSER="apache" > nominatim-project/.env
+          cp nominatim-project/.env data-env-reverse/.env
+        working-directory: /home/nominatim
+        if: matrix.flavour == 'centos'
 
       - name: Import
-        run: nominatim import --osm-file ../monaco-latest.osm.pbf
-        shell: bash
-        working-directory: data-env
+        run: nominatim import --osm-file ../test.pbf
+        working-directory: /home/nominatim/nominatim-project
 
       - name: Import special phrases
         run: nominatim special-phrases --import-from-wiki
-        working-directory: data-env
+        working-directory: /home/nominatim/nominatim-project
 
       - name: Check full import
         run: nominatim admin --check-database
-        working-directory: data-env
+        working-directory: /home/nominatim/nominatim-project
 
       - name: Warm up database
         run: nominatim admin --warm
-        working-directory: data-env
+        working-directory: /home/nominatim/nominatim-project
+
+      - name: Prepare update (Ubuntu)
+        run: apt-get install -y python3-pip
+        shell: bash
+        if: matrix.flavour == 'ubuntu'
 
       - name: Run update
         run: |
+          pip3 install --user osmium
           nominatim replication --init
-          nominatim replication --once
-        working-directory: data-env
+          NOMINATIM_REPLICATION_MAX_DIFF=1 nominatim replication --once
+        working-directory: /home/nominatim/nominatim-project
 
       - name: Run reverse-only import
-        run : nominatim import --osm-file ../monaco-latest.osm.pbf --reverse-only --no-updates
-        working-directory: data-env
-        env:
-          NOMINATIM_DATABASE_DSN: pgsql:dbname=reverse
+        run : |
+          echo 'NOMINATIM_DATABASE_DSN="pgsql:dbname=reverse"' >> .env
+          nominatim import --osm-file ../test.pbf --reverse-only --no-updates
+        working-directory: /home/nominatim/data-env-reverse
 
       - name: Check reverse import
         run: nominatim admin --check-database
-        working-directory: data-env
+        working-directory: /home/nominatim/data-env-reverse
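One detail worth noting in the new `create-archive` job: the tarball is written with gzip compression (`tar czf`) even though the file is named `.tar.bz2`. The consumer jobs are unaffected, because extraction auto-detects the compression format; a sketch of what each downstream job effectively runs:

```sh
# `tar xf` detects the actual compression on read, so the gzip-compressed data
# behind the .tar.bz2 name still unpacks to a top-level Nominatim/ directory.
tar xf nominatim-src.tar.bz2
```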
.gitignore (vendored, 7 lines changed)
@@ -1,12 +1,9 @@
 *.log
 *.pyc
 
-build
-settings/local.php
+docs/develop/*.png
 
-data/wiki_import.sql
-data/wiki_specialphrases.sql
-data/osmosischange.osc
+build
 
 .vagrant
 data/country_osm_grid.sql.gz
@@ -18,9 +18,9 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
 
 project(nominatim)
 
-set(NOMINATIM_VERSION_MAJOR 3)
-set(NOMINATIM_VERSION_MINOR 7)
-set(NOMINATIM_VERSION_PATCH 0)
+set(NOMINATIM_VERSION_MAJOR 4)
+set(NOMINATIM_VERSION_MINOR 0)
+set(NOMINATIM_VERSION_PATCH 2)
 
 set(NOMINATIM_VERSION "${NOMINATIM_VERSION_MAJOR}.${NOMINATIM_VERSION_MINOR}.${NOMINATIM_VERSION_PATCH}")
 
@@ -38,6 +38,7 @@ set(BUILD_TESTS on CACHE BOOL "Build test suite")
 set(BUILD_DOCS on CACHE BOOL "Build documentation")
 set(BUILD_MANPAGE on CACHE BOOL "Build Manual Page")
 set(BUILD_OSM2PGSQL on CACHE BOOL "Build osm2pgsql (expert only)")
+set(INSTALL_MUNIN_PLUGINS on CACHE BOOL "Install Munin plugins for supervising Nominatim")
 
 #-----------------------------------------------------------------------------
 # osm2pgsql (imports/updates only)
@@ -153,7 +154,7 @@ if (BUILD_TESTS)
     if (PHPCS)
         message(STATUS "Using phpcs binary ${PHPCS}")
         add_test(NAME phpcs
-                 COMMAND ${PHPCS} --report-width=120 --colors lib website utils
+                 COMMAND ${PHPCS} --report-width=120 --colors lib-php
                  WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
     else()
         message(WARNING "phpcs not found. PHP linting tests disabled." )
@@ -199,7 +200,7 @@ endif()
 #-----------------------------------------------------------------------------
 
 if (BUILD_MANPAGE)
-   add_subdirectory(manual)
+   add_subdirectory(man)
 endif()
 
 #-----------------------------------------------------------------------------
@@ -211,6 +212,7 @@ include(GNUInstallDirs)
 set(NOMINATIM_DATADIR ${CMAKE_INSTALL_FULL_DATADIR}/${PROJECT_NAME})
 set(NOMINATIM_LIBDIR ${CMAKE_INSTALL_FULL_LIBDIR}/${PROJECT_NAME})
 set(NOMINATIM_CONFIGDIR ${CMAKE_INSTALL_FULL_SYSCONFDIR}/${PROJECT_NAME})
+set(NOMINATIM_MUNINDIR ${CMAKE_INSTALL_FULL_DATADIR}/munin/plugins)
 
 if (BUILD_IMPORTER)
    configure_file(${PROJECT_SOURCE_DIR}/cmake/tool-installed.tmpl installed.bin)
@@ -258,6 +260,16 @@ install(FILES settings/env.defaults
               settings/import-address.style
               settings/import-full.style
               settings/import-extratags.style
-              settings/legacy_icu_tokenizer.yaml
-              settings/icu-rules/extended-unicode-to-asccii.yaml
+              settings/icu_tokenizer.yaml
+              settings/country_settings.yaml
        DESTINATION ${NOMINATIM_CONFIGDIR})
+
+install(DIRECTORY settings/icu-rules
+        DESTINATION ${NOMINATIM_CONFIGDIR})
+
+if (INSTALL_MUNIN_PLUGINS)
+   install(FILES munin/nominatim_importlag
+                 munin/nominatim_query_speed
+                 munin/nominatim_requests
+           DESTINATION ${NOMINATIM_MUNINDIR})
+endif()
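The new `INSTALL_MUNIN_PLUGINS` option is a cached boolean like the other `BUILD_*` switches, so it can be toggled at configure time; a sketch (the out-of-source `build` directory name is an assumption, not part of this changeset):

```sh
# Configure without installing the Munin plugins:
cmake -DINSTALL_MUNIN_PLUGINS=off ..
```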
ChangeLog (63 lines changed)
@@ -1,3 +1,65 @@
+4.0.2
+
+ * fix XSS vulnerability in debug view
+
+4.0.1
+
+ * fix initialisation error in replication script
+ * ICU tokenizer: avoid any special characters in word tokens
+ * better error message when API php script does not exist
+ * fix quoting of house numbers in SQL queries
+ * small fixes and improvements in search query parsing
+ * add documentation for moving the database to a different machine
+
+4.0.0
+
+ * refactor name token computation and introduce ICU tokenizer
+   * name processing now happens in the indexer outside the DB
+   * reorganizes abbreviation handling and moves it to the indexing phases
+   * adds preprocessing of names
+ * add country-specific ranking for Spain, Slovakia
+ * partially switch to using SP-GIST indexes
+ * better updating of dependent addresses for name changes in streets
+ * remove unused/broken tables for external housenumbers
+ * move external postcodes to CSV format and no longer save them in tables
+   (adds support for postcodes for arbitrary countries)
+ * remove postcode helper entries from placex (thanks @AntoJvlt)
+ * change required format for TIGER data to CSV
+ * move configuration of default languages from wiki into config file
+ * expect customized configuration files in project directory by default
+ * disable search API for reverse-only import (thanks @darkshredder)
+ * port most of maintenance/import code to Python and remove PHP utils
+ * add catch-up mode for replication
+ * add updating of special phrases (thanks @AntoJvlt)
+ * add support for special phrases in CSV files (thanks @AntoJvlt)
+ * switch to case-independent matching between place and boundary names
+ * remove disabling of reverse query parsing
+ * minor tweaks to search algorithm to avoid more false positives
+ * major overhaul of the administrator and developer documentation
+ * add security disclosure policy
+ * add testing of installation scripts via CI
+ * drop support for Python < 3.6 and Postgresql < 9.5
+
+3.7.3
+
+ * fix XSS vulnerability in debug view
+
+3.7.2
+
+ * fix database check for reverse-only imports
+ * do not error out in status API result when import date is missing
+ * add array_key_last function for PHP < 7.3 (thanks to @woodpeck)
+ * fix more url when server name is unknown (thanks to @mogita)
+ * commit changes to replication log table
+
+3.7.1
+
+ * fix smaller issues with special phrases import (thanks @AntoJvlt)
+ * add index to speed up continued indexing during import
+ * fix index on location_property_tiger(parent_place_id) (thanks @changpingc)
+ * make sure Python code is backward-compatible with Python 3.5
+ * various documentation fixes
+
 3.7.0
 
 * switch to dotenv for configuration file
@@ -20,7 +82,6 @@
 * add non-key indexes to speed up housenumber + street searches
 * switch housenumber field in placex to save transliterated names
 
-
 3.6.0
 
 * add full support for searching by and displaying of addr:* tags
@@ -20,14 +20,6 @@ https://nominatim.org/release-docs/develop/ .
 Installation
 ============
 
-**Nominatim is a complex piece of software and runs in a complex environment.
-Installing and running Nominatim is something for experienced system
-administrators only who can do some trouble-shooting themselves. We are sorry,
-but we can not provide installation support. We are all doing this in our free
-time and there is just so much of that time to go around. Do not open issues in
-our bug tracker if you need help. Use the discussions forum
-or ask for help on [help.openstreetmap.org](https://help.openstreetmap.org/).**
-
 The latest stable release can be downloaded from https://nominatim.org.
 There you can also find [installation instructions for the release](https://nominatim.org/release-docs/latest/admin/Installation), as well as an extensive [Troubleshooting/FAQ section](https://nominatim.org/release-docs/latest/admin/Faq/).
File diff suppressed because one or more lines are too long
@@ -10,6 +10,7 @@ set (DOC_SOURCES
      admin
      develop
      api
+     customize
      index.md
      extra.css
      styles.css
@@ -26,7 +27,10 @@ ADD_CUSTOM_TARGET(doc
    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Centos-8.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Centos-8.md
    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Ubuntu-18.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Ubuntu-18.md
    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Ubuntu-20.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Ubuntu-20.md
-   COMMAND mkdocs build -d ${CMAKE_CURRENT_BINARY_DIR}/../site-html -f ${CMAKE_CURRENT_BINARY_DIR}/../mkdocs.yml
+   COMMAND PYTHONPATH=${PROJECT_SOURCE_DIR} mkdocs build -d ${CMAKE_CURRENT_BINARY_DIR}/../site-html -f ${CMAKE_CURRENT_BINARY_DIR}/../mkdocs.yml
 )
 
+ADD_CUSTOM_TARGET(serve-doc
+   COMMAND PYTHONPATH=${PROJECT_SOURCE_DIR} mkdocs serve
+   WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+)
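Since `ADD_CUSTOM_TARGET` makes `serve-doc` an ordinary build target, the documentation can presumably be previewed locally once a build directory is configured; a sketch (the `build` directory name is an assumption):

```sh
cd build
make serve-doc   # runs `mkdocs serve`, which listens on http://127.0.0.1:8000 by default
```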
@@ -5,9 +5,34 @@ your Nominatim database. It is assumed that you have already successfully
 installed the Nominatim software itself, if not return to the
 [installation page](Installation.md).
 
-## Importing multiple regions
+## Importing multiple regions (without updates)
 
-To import multiple regions in your database, you need to configure and run `utils/import_multiple_regions.sh` file. This script will set up the update directory which has the following structure:
+To import multiple regions in your database you can simply give multiple
+OSM files to the import command:
+
+```
+nominatim import --osm-file file1.pbf --osm-file file2.pbf
+```
+
+If you already have imported a file and want to add another one, you can
+use the add-data function to import the additional data as follows:
+
+```
+nominatim add-data --file <FILE>
+nominatim refresh --postcodes
+nominatim index -j <NUMBER OF THREADS>
+```
+
+Please note that adding additional data is always significantly slower than
+the original import.
+
+## Importing multiple regions (with updates)
+
+If you want to import multiple regions _and_ be able to keep them up-to-date
+with updates, then you can use the scripts provided in the `utils` directory.
+
+These scripts will set up an `update` directory in your project directory,
+which has the following structure:
 
 ```bash
 update
@@ -17,7 +42,6 @@ update
  │   └── monaco
  │       └── sequence.state
  └── tmp
-     ├── combined.osm.pbf
      └── europe
          ├── andorra-latest.osm.pbf
          └── monaco-latest.osm.pbf
@@ -25,87 +49,59 @@ update
 
 ```
 
-The `sequence.state` files will contain the sequence ID, which will be used by pyosmium to get updates. The tmp folder is used for import dump.
+The `sequence.state` files contain the sequence ID for each region. They will
+be used by pyosmium to get updates. The `tmp` folder is used for import dump and
+can be deleted once the import is complete.
 
-### Configuring multiple regions
-
-The file `import_multiple_regions.sh` needs to be edited as per your requirement:
+### Setting up multiple regions
 
-1. List of countries. eg:
+Create a project directory as described for the
+[simple import](Import.md#creating-the-project-directory). If necessary,
+you can also add an `.env` configuration with customized options. In particular,
+you need to make sure that `NOMINATIM_REPLICATION_UPDATE_INTERVAL` and
+`NOMINATIM_REPLICATION_RECHECK_INTERVAL` are set according to the update
+interval of the extract server you use.
+
+Copy the scripts `utils/import_multiple_regions.sh` and `utils/update_database.sh`
+into the project directory.
+
+Now customize both files as per your requirements
+
+1. List of countries. e.g.
 
     COUNTRIES="europe/monaco europe/andorra"
 
-2. Path to Build directory. eg:
-
-    NOMINATIMBUILD="/srv/nominatim/build"
-
-3. Path to Update directory. eg:
-
-    UPDATEDIR="/srv/nominatim/update"
-
-4. Replication URL. eg:
+2. URL to the service providing the extracts and updates. eg:
 
     BASEURL="https://download.geofabrik.de"
     DOWNCOUNTRYPOSTFIX="-latest.osm.pbf"
 
-### Setting up multiple regions
-
-!!! tip
-    If your database already exists and you want to add more countries,
-    replace the setting up part
-    `${SETUPFILE} --osm-file ${UPDATEDIR}/tmp/combined.osm.pbf --all 2>&1`
-    with `${UPDATEFILE} --import-file ${UPDATEDIR}/tmp/combined.osm.pbf --index --index-instances N 2>&1`
-    where N is the numbers of CPUs in your system.
-
-Run the following command from your Nominatim directory after configuring the file.
-
-    bash ./utils/import_multiple_regions.sh
-
-!!! danger "Important"
-    This file uses osmium-tool. It must be installed before executing the import script.
-    Installation instructions can be found [here](https://osmcode.org/osmium-tool/manual.html#installation).
-
-### Updating multiple regions
-
-To import multiple regions in your database, you need to configure and run ```utils/update_database.sh```.
-This uses the update directory set up while setting up the DB.
-
-### Configuring multiple regions
-
-The file `update_database.sh` needs to be edited as per your requirement:
-
-1. List of countries. eg:
-
-    COUNTRIES="europe/monaco europe/andorra"
-
-2. Path to Build directory. eg:
-
-    NOMINATIMBUILD="/srv/nominatim/build"
-
-3. Path to Update directory. eg:
-
-    UPDATEDIR="/srv/nominatim/update"
-
-4. Replication URL. eg:
-
-    BASEURL="https://download.geofabrik.de"
-    DOWNCOUNTRYPOSTFIX="-updates"
-
-5. Followup can be set according to your installation. eg: For Photon,
+5. Followup in the update script can be set according to your installation.
+   E.g. for Photon,
 
     FOLLOWUP="curl http://localhost:2322/nominatim-update"
 
    will handle the indexing.
 
+To start the initial import, change into the project directory and run
+
+```
+bash import_multiple_regions.sh
+```
+
 ### Updating the database
 
-Run the following command from your Nominatim directory after configuring the file.
+Change into the project directory and run the following command:
 
-    bash ./utils/update_database.sh
+    bash update_database.sh
 
-This will get diffs from the replication server, import diffs and index the database. The default replication server in the script([Geofabrik](https://download.geofabrik.de)) provides daily updates.
+This will get diffs from the replication server, import diffs and index
+the database. The default replication server in the
+script([Geofabrik](https://download.geofabrik.de)) provides daily updates.
 
-## Importing Nominatim to an external PostgreSQL database
+## Using an external PostgreSQL database
 
 You can install Nominatim using a database that runs on a different server when
 you have physical access to the file system on the other server. Nominatim
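The two replication settings named above are plain `.env` entries measured in seconds; a sketch with illustrative values for a server that publishes daily extracts (the exact numbers are assumptions, not recommendations from this changeset):

```sh
# Illustrative values only: match your extract server's publication schedule.
echo 'NOMINATIM_REPLICATION_UPDATE_INTERVAL=86400' >> .env   # expect new diffs daily
echo 'NOMINATIM_REPLICATION_RECHECK_INTERVAL=900' >> .env    # retry after 15 min when none found
```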
@@ -113,6 +109,11 @@ uses a custom normalization library that needs to be made accessible to the
 PostgreSQL server. This section explains how to set up the normalization
 library.
 
+!!! note
+    The external module is only needed when using the legacy tokenizer.
+    If you have chosen the ICU tokenizer, then you can ignore this section
+    and follow the standard import documentation.
+
 ### Option 1: Compiling the library on the database server
 
 The most sure way to get a working library is to compile it on the database
@@ -170,4 +171,45 @@ NOMINATIM_DATABASE_MODULE_PATH="<directory on the database server where nominati
 ```
 
 Now change the `NOMINATIM_DATABASE_DSN` to point to your remote server and continue
-to follow the [standard instructions for importing](/admin/Import).
+to follow the [standard instructions for importing](Import.md).
+
+
+## Moving the database to another machine
+
+For some configurations it may be useful to run the import on one machine, then
+move the database to another machine and run the Nominatim service from there.
+For example, you might want to use a large machine to be able to run the import
+quickly but only want a smaller machine for production because there is not so
+much load. Or you might want to do the import once and then replicate the
+database to many machines.
+
+The important thing to keep in mind when transferring the Nominatim installation
+is that you need to transfer the database _and the project directory_. Both
+parts are essential for your installation.
+
+The Nominatim database can be transferred using the `pg_dump`/`pg_restore` tool.
+Make sure to use the same version of PostgreSQL and PostGIS on source and
+target machine.
+
+!!! note
+    Before creating a dump of your Nominatim database, consider running
+    `nominatim freeze` first. Your database loses the ability to receive further
+    data updates but the resulting database is only about a third of the size
+    of a full database.
+
+Next install Nominatim on the target machine by following the standard installation
+instructions. Again make sure to use the same version as the source machine.
+
+You can now copy the project directory from the source machine to the new machine.
+If necessary, edit the `.env` file to point it to the restored database.
+Finally run
+
+    nominatim refresh --website
+
+to make sure that the local installation of Nominatim will be used.
+
+If you are using the legacy tokenizer you might also have to switch to the
+PostgreSQL module that was compiled on your target machine. If you get errors
+that PostgreSQL cannot find or access `nominatim.so` then copy the installed
+version into the `module` directory of your project directory. The installed
+copy can usually be found under `/usr/local/lib/nominatim/module/nominatim.so`.
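A minimal sketch of the `pg_dump`/`pg_restore` transfer the new section describes, assuming the database is called `nominatim` (the name and the custom-format choice are illustrative, not part of the changeset):

```sh
# On the source machine: custom-format dump (compressed, restorable with pg_restore)
pg_dump -Fc -d nominatim -f nominatim.dump

# On the target machine: create an empty database, then restore into it
createdb nominatim
pg_restore -d nominatim nominatim.dump
```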
@@ -1,101 +0,0 @@
-# Customization of the Database
-
-This section explains in detail how to configure a Nominatim import and
-the various means to use external data.
-
-## External postcode data
-
-Nominatim creates a table of known postcode centroids during import. This table
-is used for searches of postcodes and for adding postcodes to places where the
-OSM data does not provide one. These postcode centroids are mainly computed
-from the OSM data itself. In addition, Nominatim supports reading postcode
-information from an external CSV file, to supplement the postcodes that are
-missing in OSM.
-
-To enable external postcode support, simply put one CSV file per country into
-your project directory and name it `<CC>_postcodes.csv`. `<CC>` must be the
-two-letter country code for which to apply the file. The file may also be
-gzipped. Then it must be called `<CC>_postcodes.csv.gz`.
-
-The CSV file must use commas as a delimiter and have a header line. Nominatim
-expects three columns to be present: `postcode`, `lat` and `lon`. All other
-columns are ignored. `lon` and `lat` must describe the x and y coordinates of the
-postcode centroids in WGS84.
-
-The postcode files are loaded only when there is data for the given country
-in your database. For example, if there is a `us_postcodes.csv` file in your
-project directory but you import only an excerpt of Italy, then the US postcodes
-will simply be ignored.
-
-As a rule, the external postcode data should be put into the project directory
-**before** starting the initial import. Still, you can add, remove and update the
-external postcode data at any time. Simply
-run:
-
-```
-nominatim refresh --postcodes
-```
-
-to make the changes visible in your database. Be aware, however, that the changes
-only have an immediate effect on searches for postcodes. Postcodes that were
-added to places are only updated, when they are reindexed. That usually happens
-only during replication updates.
-
-## Installing Tiger housenumber data for the US
-
-Nominatim is able to use the official [TIGER](https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
-address set to complement the OSM house number data in the US. You can add
-TIGER data to your own Nominatim instance by following these steps. The
-entire US adds about 10GB to your database.
-
-1. Get preprocessed TIGER 2020 data:
-
-    cd $PROJECT_DIR
-    wget https://nominatim.org/data/tiger2020-nominatim-preprocessed.csv.tar.gz
-
-2. Import the data into your Nominatim database:
-
-    nominatim add-data --tiger-data tiger2020-nominatim-preprocessed.csv.tar.gz
-
-3. Enable use of the Tiger data in your `.env` by adding:
-
-    echo NOMINATIM_USE_US_TIGER_DATA=yes >> .env
-
-4. Apply the new settings:
-
-    nominatim refresh --functions
-
-See the [developer's guide](../develop/data-sources.md#us-census-tiger) for more
-information on how the data got preprocessed.
-
-## Special phrases import
-
-As described in the [Importation chapter](Import.md), it is possible to
-import special phrases from the wiki with the following command:
-
-```sh
-nominatim special-phrases --import-from-wiki
-```
-
-But, it is also possible to import some phrases from a csv file.
-To do so, you have access to the following command:
-
-```sh
-nominatim special-phrases --import-from-csv <csv file>
-```
-
-Note that the two previous import commands will update the phrases from your database.
-This means that if you import some phrases from a csv file, only the phrases
-present in the csv file will be kept into the database. All other phrases will
-be removed.
-
-If you want to only add new phrases and not update the other ones you can add
-the argument `--no-replace` to the import command. For example:
-
-```sh
-nominatim special-phrases --import-from-csv <csv file> --no-replace
-```
-
-This will add the phrases present in the csv file into the database without
-removing the other ones.
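The postcode-CSV convention deleted here (the content moves into the new `customize` chapter) is easy to satisfy; a sketch with a hypothetical file name and made-up coordinates:

```sh
# Hypothetical us_postcodes.csv; columns postcode, lat and lon are required
# (WGS84), any extra columns are ignored.
cat > us_postcodes.csv <<'EOF'
postcode,lat,lon
99501,61.2181,-149.8584
10001,40.7506,-73.9972
EOF
nominatim refresh --postcodes   # load or refresh the external postcodes
```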
@@ -134,7 +134,7 @@ On CentOS v7 the PostgreSQL server is started with `systemd`. Check if
 `/usr/lib/systemd/system/httpd.service` contains a line `PrivateTmp=true`. If
 so then Apache cannot see the `/tmp/.s.PGSQL.5432` file. It's a good security
 feature, so use the
-[preferred solution](../appendix/Install-on-Centos-7/#adding-selinux-security-settings).
+[preferred solution](../appendix/Install-on-Centos-7.md#adding-selinux-security-settings).
 
 However, you can solve this the quick and dirty way by commenting out that line and then run
 
@@ -182,7 +182,7 @@ by everybody, e.g.
 Try `chmod a+r nominatim.so; chmod a+x nominatim.so`.
 
 When running SELinux, make sure that the
-[context is set up correctly](../appendix/Install-on-Centos-7/#adding-selinux-security-settings).
+[context is set up correctly](../appendix/Install-on-Centos-7.md#adding-selinux-security-settings).
 
 When you recently updated your operating system, updated PostgreSQL to
 a new version or moved files (e.g. the build directory) you should
@@ -47,8 +47,9 @@ You can also set the same configuration via environment variables. All
 settings have a `NOMINATIM_` prefix to avoid conflicts with other environment
 variables.
 
-There are lots of configuration settings you can tweak. Have a look
-at `Nominatim/settings/env.default` for a full list. Most should have a sensible default.
+There are lots of configuration settings you can tweak. A full reference
+can be found in the chapter [Configuration Settings](../customize/Settings.md).
+Most should have a sensible default.
 
 #### Flatnode files
 
@@ -95,7 +96,7 @@ This data can be optionally downloaded into the project directory:
     wget https://www.nominatim.org/data/us_postcodes.csv.gz
 
 You can also add your own custom postcode sources, see
-[Customization of postcodes](Customization.md#external-postcode-data).
+[Customization of postcodes](../customize/Postcodes.md).
 
 ## Choosing the data to import
 
@@ -111,7 +112,7 @@ If you only need geocoding for a smaller region, then precomputed OSM extracts
 are a good way to reduce the database size and import time.
 [Geofabrik](https://download.geofabrik.de) offers extracts for most countries.
 They even have daily updates which can be used with the update process described
-[in the next section](../Update). There are also
+[in the next section](Update.md). There are also
 [other providers for extracts](https://wiki.openstreetmap.org/wiki/Planet.osm#Downloading).
 
 Please be aware that some extracts are not cut exactly along the country
@@ -137,6 +138,14 @@ Note that you still need to provide for sufficient disk space for the initial
 import. So this option is particularly interesting if you plan to transfer the
 database or reuse the space later.
 
+!!! warning
+    The datastructure for updates are also required when adding additional data
+    after the import, for example [TIGER housenumber data](../customize/Tiger.md).
+    If you plan to use those, you must not use the `--no-updates` parameter.
+    Do a normal import, add the external data and once you are done with
+    everything run `nominatim freeze`.
+
+
 ### Reverse-only Imports
 
 If you only want to use the Nominatim database for reverse lookups or
@@ -152,15 +161,15 @@ Nominatim normally sets up a full search database containing administrative
 boundaries, places, streets, addresses and POI data. There are also other
 import styles available which only read selected data:
 
-* **settings/import-admin.style**
+* **admin**
   Only import administrative boundaries and places.
-* **settings/import-street.style**
+* **street**
   Like the admin style but also adds streets.
-* **settings/import-address.style**
+* **address**
   Import all data necessary to compute addresses down to house number level.
-* **settings/import-full.style**
+* **full**
   Default style that also includes points of interest.
-* **settings/import-extratags.style**
+* **extratags**
   Like the full style but also adds most of the OSM tags into the extratags
   column.
 
@@ -183,8 +192,8 @@ full | 54h | 640 GB | 330 GB
 extratags | 54h | 650 GB | 340 GB
 
 You can also customize the styles further.
-A [description of the style format](../develop/Import.md#configuring-the-import)
-can be found in the development section.
+A [description of the style format](../customize/Import-Styles.md)
+can be found in the customization guide.
 
 ## Initial import of the data
 
@@ -200,7 +209,7 @@ nominatim import --osm-file <data file> 2>&1 | tee setup.log
 ```
 
 The **project directory** is the one that you have set up at the beginning.
-See [creating the project directory](Import#creating-the-project-directory).
+See [creating the project directory](#creating-the-project-directory).
 
 ### Notes on full planet imports
 
@@ -219,7 +228,7 @@ to load the OSM data into the PostgreSQL database. This step is very demanding
 in terms of RAM usage. osm2pgsql and PostgreSQL are running in parallel at
 this point. PostgreSQL blocks at least the part of RAM that has been configured
 with the `shared_buffers` parameter during
-[PostgreSQL tuning](Installation#postgresql-tuning)
+[PostgreSQL tuning](Installation.md#postgresql-tuning)
 and needs some memory on top of that. osm2pgsql needs at least 2GB of RAM for
 its internal data structures, potentially more when it has to process very large
 relations. In addition it needs to maintain a cache for node locations. The size
@@ -238,7 +247,8 @@ reduce the cache size or even consider using a flatnode file.
 
 ### Testing the installation
 
-Run this script to verify all required tables and indices got created successfully.
+Run this script to verify that all required tables and indices got created
+successfully.
 
 ```sh
 nominatim admin --check-database
@@ -261,23 +271,10 @@ reverse query, e.g. `http://localhost:8088/reverse.php?lat=27.1750090510034&lon=
 To run Nominatim via webservers like Apache or nginx, please read the
 [Deployment chapter](Deployment.md).
 
-## Tuning the database
-
-Accurate word frequency information for search terms helps PostgreSQL's query
-planner to make the right decisions. Recomputing them can improve the performance
-of forward geocoding in particular under high load. To recompute word counts run:
-
-```sh
-nominatim refresh --word-counts
-```
-
-This will take a couple of hours for a full planet installation. You can
-also defer that step to a later point in time when you realise that
-performance becomes an issue. Just make sure that updates are stopped before
-running this function.
+## Adding search through category phrases
 
 If you want to be able to search for places by their type through
-[special key phrases](https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases)
+[special phrases](https://wiki.openstreetmap.org/wiki/Nominatim/Special_Phrases)
 you also need to import these key phrases like this:
 
 ```sh
@@ -288,4 +285,4 @@ Note that this command downloads the phrases from the wiki link above. You
 need internet access for the step.
 
 You can also import special phrases from a csv file, for more
-information please read the [Customization chapter](Customization.md).
+information please see the [Customization part](../customize/Special-Phrases.md).
@@ -24,6 +24,10 @@ and can't offer support.

### Software

!!! Warning
    For larger installations you **must have** PostgreSQL 11+ and PostGIS 3+,
    otherwise import and queries will be slow to the point of being unusable.

For compiling:

* [cmake](https://cmake.org/)
@@ -39,7 +43,7 @@ For compiling:

For running Nominatim:

* [PostgreSQL](https://www.postgresql.org) (9.5+ will work, 11+ strongly recommended)
* [PostGIS](https://postgis.net) (2.2+ will work, 3.0+ strongly recommended)
* [Python 3](https://www.python.org/) (3.6+)
* [Psycopg2](https://www.psycopg.org) (2.7+)
* [Python Dotenv](https://github.com/theskumar/python-dotenv)
51 docs/admin/Maintenance.md Normal file
@@ -0,0 +1,51 @@

This chapter describes the various operations the Nominatim database administrator
may use to clean and maintain the database. None of these operations is mandatory
but they may help improve the performance and accuracy of results.

## Updating postcodes

Command: `nominatim refresh --postcodes`

Postcode centroids (aka 'calculated postcodes') are generated by looking at all
postcodes of a country, grouping them and calculating the geometric centroid.
There is currently no logic to deal with extreme outliers (typos or other
mistakes in OSM data). There is also no check whether a postcode adheres to a
country's format, e.g. if Swiss postcodes are 4 digits.

When running regular updates, postcode results can be improved by running
this command on a regular basis. Note that only the postcode table and the
postcode search terms are updated. The postcode that is assigned to each place
is only updated when the place itself is updated.

The command takes around 70min to run on the planet and needs ca. 40GB of
temporary disk space.

## Updating word counts

Command: `nominatim refresh --word-counts`

Nominatim keeps frequency statistics about all search terms it indexes. These
statistics are currently used to optimise queries to the database. Thus better
statistics mean better performance. Word counts are created once after import
and are usually sufficient even when running regular updates. You might want
to rerun the statistics computation when adding larger amounts of new data,
for example, when adding an additional country via `nominatim add-data`.

## Removing large deleted objects

Nominatim refuses to delete very large areas because often these deletions are
accidental and are reverted within hours. Instead the deletions are logged in
the `import_polygon_delete` table and left to the administrator to clean up.

There is currently no command to do that. You can use the following SQL
query to force a deletion of all objects that have been deleted more than
a certain timespan ago (here: 1 month):

```sql
SELECT place_force_delete(p.place_id) FROM import_polygon_delete d, placex p
 WHERE p.osm_type = d.osm_type and p.osm_id = d.osm_id
   and age(p.indexed_date) > '1 month'::interval
```
@@ -15,6 +15,27 @@ breaking changes. **Please read them before running the migration.**

If you are migrating from a version <3.6, then you still have to follow
the manual migration steps up to 3.6.

## 3.7.0 -> 4.0.0

### NOMINATIM_PHRASE_CONFIG removed

Custom blacklist configurations for special phrases now need to be passed
with the `--config` parameter to `nominatim special-phrases`. Alternatively
you can put your custom configuration in the project directory in a file
named `phrase-settings.json`.
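
For example, a call with a custom configuration file could look like this
(the file name `my-phrase-settings.json` is made up for illustration):

```sh
nominatim special-phrases --import-from-wiki --config my-phrase-settings.json
```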

Version 4.0 also removes the automatic converter for the php format of
the configuration used in older versions. If you are updating from Nominatim < 3.7
and still work with a custom `phrase-settings.php`, you need to manually
convert it into JSON format.

### PHP utils removed

The old PHP utils have now been removed completely. You need to switch to
the appropriate functions of the nominatim command line tool. See
[Introducing `nominatim` command line tool](#introducing-nominatim-command-line-tool)
below.

## 3.6.0 -> 3.7.0

### New format and name of configuration file
@@ -80,7 +101,7 @@ done

The debugging UI is no longer directly provided with Nominatim. Instead we
now provide a simple Javascript application. Please refer to
[Setting up the Nominatim UI](Setup-Nominatim-UI.md) for details on how to
set up the UI.

The icons served together with the API responses have been moved to the
@@ -16,13 +16,14 @@ and run it. Grab the latest release from

[nominatim-ui's Github release page](https://github.com/osm-search/nominatim-ui/releases)
and unpack it. You can use `nominatim-ui-x.x.x.tar.gz` or `nominatim-ui-x.x.x.zip`.

Next you need to adapt the UI to your installation. Custom settings need to be
put into `dist/theme/config.theme.js`. At a minimum you need to
set `Nominatim_API_Endpoint` to point to your Nominatim installation:

    cd nominatim-ui
    echo "Nominatim_Config.Nominatim_API_Endpoint='https://myserver.org/nominatim';" > dist/theme/config.theme.js

For the full set of available settings, have a look at `dist/config.defaults.js`.

Then you can just test it locally by spinning up a webserver in the `dist`
directory. For example, with Python:
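
```sh
# a minimal sketch, as the hunk ends here; port 8000 is an arbitrary choice
cd nominatim-ui/dist
python3 -m http.server 8000
```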

@@ -10,18 +10,21 @@ For a list of other methods to add or update data see the output of

If you have configured a flatnode file for the import, then you
need to keep this flatnode file around for updates.

### Installing the newest version of Pyosmium

The replication process uses
[Pyosmium](https://docs.osmcode.org/pyosmium/latest/updating_osm_data.html)
to download update data from the server.
It is recommended to install Pyosmium via pip.
Run (as the same user who will later run the updates):

```sh
pip3 install --user osmium
```

### Setting up the update process

Next the update process needs to be initialised. By default Nominatim is configured
to update using the global minutely diffs.
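In its simplest form, initialisation is a single command run from the
project directory (using the `/srv/nominatim` directory from the examples
below; adapt the path to your setup):

```sh
cd /srv/nominatim
nominatim replication --init
```
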
If you want a different update source you will need to add some settings

@@ -45,12 +48,119 @@ what you expect.

The `replication --init` command needs to be rerun whenever the replication
service is changed.

### Updating Nominatim

Nominatim supports different modes for retrieving the update data from the
server. Which one you want to use depends on your exact setup and how often you
want to retrieve updates.

These instructions are for using a single source of updates. If you have
imported multiple country extracts and want to keep them
up-to-date, the [Advanced installations section](Advanced-Installations.md)
contains instructions to set up and update multiple country extracts.

#### Continuous updates

This is the easiest mode. Simply run the replication command without any
parameters:

    nominatim replication

The update application keeps running forever and retrieves and applies
new updates from the server as they are published.

You can run this command as a simple systemd service. Create a service
description like the following in `/etc/systemd/system/nominatim-updates.service`:

```
[Unit]
Description=Continuous updates of Nominatim

[Service]
WorkingDirectory=/srv/nominatim
ExecStart=nominatim replication
StandardOutput=append:/var/log/nominatim-updates.log
StandardError=append:/var/log/nominatim-updates.error.log
User=nominatim
Group=nominatim
Type=simple

[Install]
WantedBy=multi-user.target
```

Replace the `WorkingDirectory` with your project directory. Also adapt user
and group names as required.

Now activate the service and start the updates:

```
sudo systemctl daemon-reload
sudo systemctl enable nominatim-updates
sudo systemctl start nominatim-updates
```

#### One-time mode

When the `--once` parameter is given, then Nominatim will download exactly one
batch of updates and then exit. This one-time mode still respects the
`NOMINATIM_REPLICATION_UPDATE_INTERVAL` that you have set. If according to
the update interval no new data has been published yet, it will go to sleep
until the next expected update and only then attempt to download the next batch.

The one-time mode is particularly useful if you want to run updates continuously
but need to schedule other work in between updates. For example, the main
service at osm.org uses it to regularly recompute postcodes -- a process that
must not be run while updates are in progress. Its update script
looks like this:

```sh
#!/bin/bash

# Switch to your project directory.
cd /srv/nominatim

while true; do
  nominatim replication --once
  if [ -f "/srv/nominatim/schedule-maintenance" ]; then
    rm /srv/nominatim/schedule-maintenance
    nominatim refresh --postcodes
  fi
done
```

A cron job then creates the file `/srv/nominatim/schedule-maintenance` once per night.
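
A matching crontab entry for the nightly trigger could look like this
(the 3am time is an arbitrary choice):

```
0 3 * * * touch /srv/nominatim/schedule-maintenance
```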

#### Catch-up mode

With the `--catch-up` parameter, Nominatim will immediately try to download
all changes from the server until the database is up-to-date. The catch-up mode
still respects the parameter `NOMINATIM_REPLICATION_MAX_DIFF`. It downloads and
applies the changes in appropriate batches until all is done.

The catch-up mode is foremost useful to bring the database up to speed after the
initial import. Given that the service usually is not in production at this
point, you can temporarily be a bit more generous with the batch size and
number of threads you use for the updates by running catch-up like this:

```
cd /srv/nominatim
NOMINATIM_REPLICATION_MAX_DIFF=5000 nominatim replication --catch-up --threads 15
```

The catch-up mode is also useful when you want to apply updates at a lower
frequency than what the source publishes. You can set up a cron job to run
replication catch-up at whatever interval you desire.
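
For example, a daily catch-up run might be scheduled like this (time and
log file chosen for illustration):

```
0 4 * * * cd /srv/nominatim && nominatim replication --catch-up >> /var/log/nominatim-updates.log 2>&1
```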

!!! hint
    When running scheduled updates with catch-up, it is a good idea to choose
    a replication source with an update frequency that is an order of magnitude
    lower. For example, if you want to update once a day, use an hourly updated
    source. This makes sure that you don't miss an entire day of updates when
    the source is unexpectedly late to publish its update.

    If you want to use a source with the same update frequency (e.g. a daily
    updated source with daily updates), use the continuous update mode. It
    keeps re-requesting the newest update until it is published.
@@ -35,7 +35,7 @@ it contains the county/state/country across the border.

#### 3. I get different counties/states/countries when I change the zoom parameter in the reverse query. How is that possible?

This is basically the same problem as in the previous answer.
The zoom level influences at which [search rank](../customize/Ranking.md#search-rank) Nominatim starts looking
for the closest object. So the closest house number may be on one side of the
border while the closest street is on the other. As the address details contain
the address of the closest object found, you might sometimes get one result,
@@ -290,6 +290,7 @@ with a designation label. Per default the following labels may appear:

* emergency, historic, military, natural, landuse, place, railway,
  man_made, aerialway, boundary, amenity, aeroway, club, craft, leisure,
  office, mountain_pass, shop, tourism, bridge, tunnel, waterway
* postcode

They roughly correspond to the classification of the OpenStreetMap data
according to either the `place` tag or the main key of the object.
@@ -27,8 +27,8 @@ The search term may be specified with two different sets of parameters:

Free-form query string to search for.
Free-form queries are processed first left-to-right and then right-to-left if that fails. So you may search for
[pilkington avenue, birmingham](https://nominatim.openstreetmap.org/search?q=pilkington+avenue,birmingham) as well as for
[birmingham, pilkington avenue](https://nominatim.openstreetmap.org/search?q=birmingham,+pilkington+avenue).
Commas are optional, but improve performance by reducing the complexity of the search.
@@ -1,38 +1,24 @@

# OSM Data Import

OSM data is initially imported using [osm2pgsql](https://osm2pgsql.org).
Nominatim uses its own data output style 'gazetteer', which differs from the
output style created for map rendering.

## Database Layout

The gazetteer style produces a single table `place` with the following columns:

* `osm_type` - kind of OSM object (**N** - node, **W** - way, **R** - relation)
* `osm_id` - original OSM ID
* `class` - key of the principal tag defining the object type
* `type` - value of the principal tag defining the object type
* `name` - collection of tags that contain a name or reference
* `admin_level` - numerical value of the tagged administrative level
* `address` - collection of tags defining the address of an object
* `extratags` - collection of additional interesting tags that are not
  directly relevant for searching
* `geometry` - geometry of the object (in WGS84)

A single OSM object may appear multiple times in this table when it is tagged
with multiple tags that may constitute a principal tag. Take for example a
motorway bridge. In OSM, this would be a way which is tagged with
`highway=motorway` and `bridge=yes`. This way would appear in the `place` table
once with a `class` of `highway` and once with a `class` of `bridge`. Thus the
*unique key* for `place` is (`osm_type`, `osm_id`, `class`).
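
As an illustration, a query along the following lines returns two rows for
such a bridge (the way id is invented for the example):

```sql
SELECT osm_type, osm_id, class, type
  FROM place
 WHERE osm_type = 'W' AND osm_id = 123456;
-- W | 123456 | highway | motorway
-- W | 123456 | bridge  | yes
```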

## Configuring the Import

Which OSM objects are added to the database and which of the tags are used
can be configured via the import style configuration file. This
is a JSON file which contains a list of rules which are matched against every
tag of every object and then assign the tag its specific role.

The style to use is given by the `NOMINATIM_IMPORT_STYLE` configuration
option. There are a number of default styles, which are explained in detail
in the [Import section](../admin/Import.md#filtering-imported-data). These
standard styles may be referenced by their name.

You can also create your own custom style. Put the style file into your
project directory and then set `NOMINATIM_IMPORT_STYLE` to the name of the file.
It is always recommended to start with one of the standard styles and customize
from there. You find the standard styles under the name `import-<stylename>.style`
in the standard Nominatim configuration path (usually `/etc/nominatim` or
`/usr/local/etc/nominatim`).

The remainder of the page describes the format of the file.
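
For instance, a custom style file placed in the project directory would be
selected in your `.env` like this (the file name is made up):

```
NOMINATIM_IMPORT_STYLE=my-own.style
```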

### Configuration Rules

A single rule looks like this:

@@ -159,9 +145,6 @@ A rule can define as many of these properties for one match as it likes. For

example, if the property is `"main,extra"` then the tag will open a new row
but also have the tag appear in the list of extra tags.

### Changing the Style of Existing Databases

There is normally no issue changing the style of a database that is already
20 docs/customize/Overview.md Normal file
@@ -0,0 +1,20 @@

Nominatim comes with a predefined set of configuration options that should
work for most standard installations. If you have special requirements, there
are many places where the configuration can be adapted. This chapter describes
the following configurable parts:

* [Global Settings](Settings.md) has a detailed description of all parameters that
  can be set in your local `.env` configuration
* [Import styles](Import-Styles.md) explains how to write your own import style
  in order to control what kind of OSM data will be imported
* [Place ranking](Ranking.md) describes the configuration around classifying
  places in terms of their importance and their role in an address
* [Tokenizers](Tokenizers.md) describes the configuration of the module
  responsible for analysing and indexing names
* [Special Phrases](Special-Phrases.md) are common nouns or phrases that
  can be used in search to identify a class of places

There are also guides for adding the following external data:

* [US house numbers from the TIGER dataset](Tiger.md)
* [External postcodes](Postcodes.md)
37 docs/customize/Postcodes.md Normal file
@@ -0,0 +1,37 @@

# External postcode data

Nominatim creates a table of known postcode centroids during import. This table
is used for searches of postcodes and for adding postcodes to places where the
OSM data does not provide one. These postcode centroids are mainly computed
from the OSM data itself. In addition, Nominatim supports reading postcode
information from an external CSV file, to supplement the postcodes that are
missing in OSM.

To enable external postcode support, simply put one CSV file per country into
your project directory and name it `<CC>_postcodes.csv`. `<CC>` must be the
two-letter country code for which to apply the file. The file may also be
gzipped. Then it must be called `<CC>_postcodes.csv.gz`.

The CSV file must use commas as a delimiter and have a header line. Nominatim
expects three columns to be present: `postcode`, `lat` and `lon`. All other
columns are ignored. `lon` and `lat` must describe the x and y coordinates of the
postcode centroids in WGS84.
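
A minimal `us_postcodes.csv` could therefore look like this (values invented
for illustration):

```
postcode,lat,lon
36925,32.32,-88.27
99999,64.83,-147.72
```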

The postcode files are loaded only when there is data for the given country
in your database. For example, if there is a `us_postcodes.csv` file in your
project directory but you import only an excerpt of Italy, then the US postcodes
will simply be ignored.

As a rule, the external postcode data should be put into the project directory
**before** starting the initial import. Still, you can add, remove and update the
external postcode data at any time. Simply run:

```
nominatim refresh --postcodes
```

to make the changes visible in your database. Be aware, however, that the changes
only have an immediate effect on searches for postcodes. Postcodes that were
added to places are only updated when they are reindexed. That usually happens
only during replication updates.
@@ -1,8 +1,7 @@

# Place Ranking in Nominatim

Nominatim uses two metrics to rank a place: search rank and address rank.
This chapter explains what place ranking means and how it can be customized.

## Search rank
649 docs/customize/Settings.md Normal file
@@ -0,0 +1,649 @@

This section provides a reference of all configuration parameters that can
be used with Nominatim.

# Configuring Nominatim

Nominatim uses [dotenv](https://github.com/theskumar/python-dotenv) to manage
its configuration settings. There are two means to set configuration
variables: through an `.env` configuration file or through an environment
variable.

The `.env` configuration file needs to be placed into the
[project directory](../admin/Import.md#creating-the-project-directory). It
must contain configuration parameters in `<parameter>=<value>` format.
Please refer to the dotenv documentation for details.

The configuration options may also be set in the form of shell environment
variables. This is particularly useful when you want to temporarily change
a configuration option. For example, to force the replication service to
download the next change, you can temporarily disable the update interval:

    NOMINATIM_REPLICATION_UPDATE_INTERVAL=0 nominatim replication --once

If a configuration option is defined through both the `.env` file and an
environment variable, then the latter takes precedence.

## Configuration Parameter Reference

### Import and Database Settings

#### NOMINATIM_DATABASE_DSN

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Database connection string |
| **Format:** | string: `pgsql:<param1>=<value1>;<param2>=<value2>;...` |
| **Default:** | pgsql:dbname=nominatim |
| **After Changes:** | run `nominatim refresh --website` |

Sets the connection parameters for the Nominatim database. At a minimum
the name of the database (`dbname`) is required. You can set any additional
parameter that is understood by libpq. See the [Postgres documentation](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-PARAMKEYWORDS) for a full list.
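
For example, a connection to a database on a separate server might be
configured like this (host and user invented for illustration):

```
NOMINATIM_DATABASE_DSN="pgsql:dbname=nominatim;host=dbserver.example.com;user=nominatim"
```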

!!! note
    It is usually recommended not to set the password directly in this
    configuration parameter. Use a
    [password file](https://www.postgresql.org/docs/current/libpq-pgpass.html)
    instead.


#### NOMINATIM_DATABASE_WEBUSER

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Database query user |
| **Format:** | string |
| **Default:** | www-data |
| **After Changes:** | cannot be changed after import |

Defines the name of the database user that will run search queries. Usually
this is the user under which the webserver is executed. When running Nominatim
via php-fpm, you can also define a separate query user. The Postgres user
needs to be set up before starting the import.

Nominatim grants minimal rights to this user to all tables that are needed
for running geocoding queries.


#### NOMINATIM_DATABASE_MODULE_PATH

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Directory where to find the PostgreSQL server module |
| **Format:** | path |
| **Default:** | _empty_ (use `<project_directory>/module`) |
| **After Changes:** | run `nominatim refresh --functions` |
| **Comment:** | Legacy tokenizer only |

Defines the directory in which the PostgreSQL server module `nominatim.so`
is stored. The directory and module must be accessible by the PostgreSQL
server.

For information on how to use this setting when working with external databases,
see [Advanced Installations](../admin/Advanced-Installations.md).

The option is only used by the Legacy tokenizer and ignored otherwise.


#### NOMINATIM_TOKENIZER

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Tokenizer used for normalizing and parsing queries and names |
| **Format:** | string |
| **Default:** | legacy |
| **After Changes:** | cannot be changed after import |

Sets the tokenizer type to use for the import. For more information on
available tokenizers and how they are configured, see
[Tokenizers](../customize/Tokenizers.md).


#### NOMINATIM_TOKENIZER_CONFIG

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Configuration file for the tokenizer |
| **Format:** | path |
| **Default:** | _empty_ (default file depends on tokenizer) |
| **After Changes:** | see documentation for each tokenizer |

Points to the file with additional configuration for the tokenizer.
See the [Tokenizer](../customize/Tokenizers.md) descriptions for details
on the file format.

If a relative path is given, then the file is searched first relative to the
project directory and then in the global settings directory.

#### NOMINATIM_MAX_WORD_FREQUENCY

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Number of occurrences before a word is considered frequent |
| **Format:** | int |
| **Default:** | 50000 |
| **After Changes:** | cannot be changed after import |
| **Comment:** | Legacy tokenizer only |

The word frequency count is used by the Legacy tokenizer to automatically
identify _stop words_. Any partial term that occurs more often than what
is defined in this setting is effectively ignored during search.


#### NOMINATIM_LIMIT_REINDEXING

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Avoid invalidating large areas |
| **Format:** | bool |
| **Default:** | yes |

Nominatim computes the address of each place at indexing time. This has the
advantage of making search faster but also means that more objects need to
be invalidated when the data changes. For example, changing the name of
the state of Florida would require recomputing every single address point
in the state to make the new name searchable in conjunction with addresses.

Setting this option to 'yes' means that Nominatim skips reindexing of contained
objects when the area becomes too large.


#### NOMINATIM_LANGUAGES

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Restrict search languages |
| **Format:** | string: comma-separated list of language codes |
| **Default:** | _empty_ |

Normally Nominatim will include all language variants of name:XX
in the search index. Set this to a comma-separated list of language
codes to restrict the import to a subset of languages.

Currently only affects the initial import of country names and special phrases.


#### NOMINATIM_TERM_NORMALIZATION

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Rules for normalizing terms for comparisons |
| **Format:** | string: semicolon-separated list of ICU rules |
| **Default:** | :: NFD (); [[:Nonspacing Mark:] [:Cf:]] >; :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC (); |
| **Comment:** | Legacy tokenizer only |

[Special phrases](Special-Phrases.md) have stricter matching requirements than
normal search terms. They must appear exactly in the query after this term
normalization has been applied.

Only has an effect on the Legacy tokenizer. For the ICU tokenizer the rules
defined in the
[normalization section](Tokenizers.md#normalization-and-transliteration)
will be used.


#### NOMINATIM_USE_US_TIGER_DATA

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Enable searching for Tiger house number data |
| **Format:** | boolean |
| **Default:** | no |
| **After Changes:** | run `nominatim refresh --functions` |

When this setting is enabled, search and reverse queries also take data
from [Tiger house number data](Tiger.md) into account.


#### NOMINATIM_USE_AUX_LOCATION_DATA

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Enable searching in external house number tables |
| **Format:** | boolean |
| **Default:** | no |
| **After Changes:** | run `nominatim refresh --functions` |
| **Comment:** | Do not use. |

When this setting is enabled, search queries also take data from external
house number tables into account.

*Warning:* This feature is currently unmaintained and should not be used.


#### NOMINATIM_HTTP_PROXY

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Use HTTP proxy when downloading data |
| **Format:** | boolean |
| **Default:** | no |

When this setting is enabled and at least
[NOMINATIM_HTTP_PROXY_HOST](#nominatim_http_proxy_host) and
[NOMINATIM_HTTP_PROXY_PORT](#nominatim_http_proxy_port) are set, the
configured proxy will be used when downloading external data like
replication diffs.


#### NOMINATIM_HTTP_PROXY_HOST

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Host name of the proxy to use |
| **Format:** | string |
| **Default:** | _empty_ |

When [NOMINATIM_HTTP_PROXY](#nominatim_http_proxy) is enabled, this setting
configures the proxy host name.


#### NOMINATIM_HTTP_PROXY_PORT

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Port number of the proxy to use |
| **Format:** | integer |
| **Default:** | 3128 |

When [NOMINATIM_HTTP_PROXY](#nominatim_http_proxy) is enabled, this setting
configures the port number to use with the proxy.


#### NOMINATIM_HTTP_PROXY_LOGIN

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Username for proxies that require login |
| **Format:** | string |
| **Default:** | _empty_ |

When [NOMINATIM_HTTP_PROXY](#nominatim_http_proxy) is enabled, use this
setting to define the username for proxies that require a login.


#### NOMINATIM_HTTP_PROXY_PASSWORD

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Password for proxies that require login |
| **Format:** | string |
| **Default:** | _empty_ |

When [NOMINATIM_HTTP_PROXY](#nominatim_http_proxy) is enabled, use this
setting to define the password for proxies that require a login.


#### NOMINATIM_OSM2PGSQL_BINARY

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Location of the osm2pgsql binary |
| **Format:** | path |
| **Default:** | _empty_ (use binary shipped with Nominatim) |
| **Comment:** | EXPERT ONLY |

Nominatim uses [osm2pgsql](https://osm2pgsql.org) to load the OSM data
initially into the database. Nominatim comes bundled with a version of
osm2pgsql that is guaranteed to be compatible. Use this setting to use
a different binary instead. You should do this only when you know exactly
what you are doing. If the osm2pgsql version is not compatible, then the
result is undefined.


#### NOMINATIM_WIKIPEDIA_DATA_PATH

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Directory with the wikipedia importance data |
| **Format:** | path |
| **Default:** | _empty_ (project directory) |

Set a custom location for the
[wikipedia ranking file](../admin/Import.md#wikipediawikidata-rankings). When
unset, Nominatim expects the data to be saved in the project directory.

#### NOMINATIM_ADDRESS_LEVEL_CONFIG

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Configuration file for rank assignments |
| **Format:** | path |
| **Default:** | address-levels.json |

The _address level configuration_ defines the rank assignments for places. See
[Place Ranking](Ranking.md) for a detailed explanation of what rank assignments
are and what the configuration file must look like.

When a relative path is given, then the file is searched first relative to the
project directory and then in the global settings directory.


#### NOMINATIM_IMPORT_STYLE

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Configuration to use for the initial OSM data import |
| **Format:** | string or path |
| **Default:** | extratags |

The _style configuration_ describes which OSM objects and tags are taken
into consideration for the search database. Nominatim comes with a set
of pre-configured styles that may be selected here.

You can also write your own custom style and point the setting to the file
with the style. When a relative path is given, then the style file is searched
first relative to the project directory and then in the global settings
directory.

See [Import Styles](Import-Styles.md)
for more information on the available internal styles and the format of the
configuration file.

#### NOMINATIM_FLATNODE_FILE

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Location of osm2pgsql flatnode file |
| **Format:** | path |
| **Default:** | _empty_ (do not use a flatnode file) |
| **After Changes:** | Only change when moving the file physically. |

The `osm2pgsql flatnode file` is a file that efficiently stores the geographic
location of OSM nodes. For larger imports it can significantly speed up
the import. When this option is unset, then osm2pgsql uses a PostgreSQL table
to store the locations.

When a relative path is given, then the flatnode file is created/searched
relative to the project directory.
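
A typical entry in `.env` thus uses a simple relative name (the file name is
an arbitrary choice):

```
NOMINATIM_FLATNODE_FILE=flatnode.file
```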

!!! warning

    The flatnode file is not only used during the initial import but also
    when adding new data with `nominatim add-data` or `nominatim replication`.
    Make sure you keep the flatnode file around and this setting unmodified,
    if you plan to add more data or run regular updates.


#### NOMINATIM_TABLESPACE_*

| Summary | |
| -------------- | --------------------------------------------------- |
| **Description:** | Group of settings for distributing the database over tablespaces |
| **Format:** | string |
| **Default:** | _empty_ (do not use a tablespace) |
| **After Changes:** | no effect after initial import |

Nominatim allows you to distribute the search database over up to 10 different
[PostgreSQL tablespaces](https://www.postgresql.org/docs/current/manage-ag-tablespaces.html).
If you use this option, make sure that the tablespaces exist before starting
the import.

The available tablespace groups are:

NOMINATIM_TABLESPACE_SEARCH_DATA
: Data used by the geocoding frontend.

NOMINATIM_TABLESPACE_SEARCH_INDEX
: Indexes used by the geocoding frontend.

NOMINATIM_TABLESPACE_OSM_DATA
: Raw OSM data cache used for import and updates.

NOMINATIM_TABLESPACE_OSM_INDEX
: Indexes on the raw OSM data cache.

NOMINATIM_TABLESPACE_PLACE_DATA
: Data table with the pre-filtered but still unprocessed OSM data.
  Used only during imports and updates.

NOMINATIM_TABLESPACE_PLACE_INDEX
: Indexes on the raw data table. Used only during imports and updates.

NOMINATIM_TABLESPACE_ADDRESS_DATA
: Data tables used for computing search terms and addresses of places
  during import and updates.

NOMINATIM_TABLESPACE_ADDRESS_INDEX
: Indexes on the data tables for search term and address computation.
  Used only for import and updates.

NOMINATIM_TABLESPACE_AUX_DATA
: Auxiliary data tables for non-OSM data, e.g. for Tiger house number data.

NOMINATIM_TABLESPACE_AUX_INDEX
: Indexes on auxiliary data tables.
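
For example, frontend data might be placed on fast storage like this (the
tablespace names are invented and must already exist in PostgreSQL):

```
NOMINATIM_TABLESPACE_SEARCH_DATA=ssd_data
NOMINATIM_TABLESPACE_SEARCH_INDEX=ssd_index
```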
|
|
||||||
|
|
||||||
|
### Replication Update Settings
|
||||||
|
|
||||||
|
#### NOMINATIM_REPLICATION_URL
|
||||||
|
|
||||||
|
| Summary | |
|
||||||
|
| -------------- | --------------------------------------------------- |
|
||||||
|
| **Description:** | Base URL of the replication service |
|
||||||
|
| **Format:** | url |
|
||||||
|
| **Default:** | https://planet.openstreetmap.org/replication/minute |
|
||||||
|
| **After Changes:** | run `nominatim replication --init` |
|
||||||
|
|
||||||
|
Replication services deliver updates to OSM data. Use this setting to choose
|
||||||
|
which replication service to use. See [Updates](../admin/Update.md) for more
|
||||||
|
information on how to set up regular updates.
|
||||||
|
|
||||||
|
#### NOMINATIM_REPLICATION_MAX_DIFF
|
||||||
|
|
||||||
|
| Summary | |
|
||||||
|
| -------------- | --------------------------------------------------- |
|
||||||
|
| **Description:** | Maximum amount of data to download per update cycle (in MB) |
|
||||||
|
| **Format:** | integer |
|
||||||
|
| **Default:** | 50 |
|
||||||
|
| **After Changes:** | restart the replication process |
|
||||||
|
|
||||||
|
At each update cycle Nominatim downloads diffs until either no more diffs
|
||||||
|
are available on the server (i.e. the database is up-to-date) or the limit
|
||||||
|
given in this setting is exceeded. Nominatim guarantees to downloads at least
|
||||||
|
one diff, if one is available, no matter how small the setting.
|
||||||
|
|
||||||
|
The default for this setting is fairly conservative because Nominatim keeps
|
||||||
|
all data downloaded in one cycle in RAM. Using large values in a production
|
||||||
|
server may interfere badly with the search frontend because it evicts data
|
||||||
|
from RAM that is needed for speedy answers to incoming requests. It is usually
|
||||||
|
a better idea to keep this setting lower and run multiple update cycles
|
||||||
|
to catch up with updates.
|
||||||
|
|
||||||
|
When catching up in non-production mode, for example after the initial import,
|
||||||
|
the setting can easily be changed temporarily on the command line:
|
||||||
|
|
||||||
|
NOMINATIM_REPLICATION_MAX_DIFF=3000 nominatim replication
|
||||||
|
|
||||||
|
|
||||||
|
#### NOMINATIM_REPLICATION_UPDATE_INTERVAL
|
||||||
|
|
||||||
|
| Summary | |
|
||||||
|
| -------------- | --------------------------------------------------- |
|
||||||
|
| **Description:** | Publication interval of the replication service (in seconds) |
|
||||||
|
| **Format:** | integer |
|
||||||
|
| **Default:** | 75 |
|
||||||
|
| **After Changes:** | restart the replication process |
|
||||||
|
|
||||||
|
This setting determines when Nominatim will attempt to download again a new
|
||||||
|
update. The time is computed from the publication date of the last diff
|
||||||
|
downloaded. Setting this to a slightly higher value than the actual
|
||||||
|
publication interval avoids unnecessary rechecks.
|
||||||
|
|
||||||
|
|
||||||
|
#### NOMINATIM_REPLICATION_RECHECK_INTERVAL
|
||||||
|
|
||||||
|
| Summary | |
|
||||||
|
| -------------- | --------------------------------------------------- |
|
||||||
|
| **Description:** | Wait time to recheck for a pending update (in seconds) |
|
||||||
|
| **Format:** | integer |
|
||||||
|
| **Default:** | 60 |
|
||||||
|
| **After Changes:** | restart the replication process |
|
||||||
|
|
||||||
|
When replication updates are run in continuous mode (using `nominatim replication`),
|
||||||
|
this setting determines how long Nominatim waits until it looks for updates
|
||||||
|
again when updates were not available on the server.
|
||||||
|
|
||||||
|
Note that this is different from
|
||||||
|
[NOMINATIM_REPLICATION_UPDATE_INTERVAL](#nominatim_replication_update_interval).
|
||||||
|
Nominatim will never attempt to query for new updates for UPDATE_INTERVAL
|
||||||
|
seconds after the current database date. Only after the update interval has
|
||||||
|
passed it asks for new data. If then no new data is found, it waits for
|
||||||
|
RECHECK_INTERVAL seconds before it attempts again.
|
||||||
|
|
||||||
|
### API Settings
|
||||||
|
|
||||||
|
#### NOMINATIM_CORS_NOACCESSCONTROL
|
||||||
|
|
||||||
|
| Summary | |
|
||||||
|
| -------------- | --------------------------------------------------- |
|
||||||
|
| **Description:** | Send permissive CORS access headers |
|
||||||
|
| **Format:** | boolean |
|
||||||
|
| **Default:** | yes |
|
||||||
|
| **After Changes:** | run `nominatim refresh --website` |
|
||||||
|
|
||||||
|
When this setting is enabled, API HTTP responses include the HTTP
|
||||||
|
[CORS](https://en.wikipedia.org/wiki/CORS) headers
|
||||||
|
`access-control-allow-origin: *` and `access-control-allow-methods: OPTIONS,GET`.
|
||||||
|
|
||||||
|
#### NOMINATIM_MAPICON_URL
|
||||||
|
|
||||||
|
| Summary | |
|
||||||
|
| -------------- | --------------------------------------------------- |
|
||||||
|
| **Description:** | URL prefix for static icon images |
|
||||||
|
| **Format:** | url |
|
||||||
|
| **Default:** | _empty_ |
|
||||||
|
| **After Changes:** | run `nominatim refresh --website` |
|
||||||
|
|
||||||
|
When a mapicon URL is configured, then Nominatim includes an additional `icon`
|
||||||
|
field in the responses, pointing to an appropriate icon for the place type.
|
||||||
|
|
||||||
|
Map icons used to be included in Nominatim itself but now have moved to the
|
||||||
|
[nominatim-ui](https://github.com/osm-search/nominatim-ui/) project. If you
|
||||||
|
want the URL to be included in API responses, make the `/mapicon`
|
||||||
|
directory of the project available under a public URL and point this setting
|
||||||
|
to the directory.
|
||||||
|
|
||||||
|
|
||||||
|
#### NOMINATIM_DEFAULT_LANGUAGE
|
||||||
|
|
||||||
|
| Summary | |
|
||||||
|
| -------------- | --------------------------------------------------- |
|
||||||
|
| **Description:** | Language of responses when no language is requested |
|
||||||
|
| **Format:** | language code |
|
||||||
|
| **Default:** | _empty_ (use the local language of the feature) |
|
||||||
|
| **After Changes:** | run `nominatim refresh --website` |
|
||||||
|
|
||||||
|
Nominatim localizes the place names in responses when the corresponding
|
||||||
|
translation is available. Users can request a custom language setting through
|
||||||
|
the HTTP accept-languages header or through the explicit parameter
|
||||||
|
[accept-languages](../api/Search.md#language-of-results). If neither is
|
||||||
|
given, it falls back to this setting. If the setting is also empty, then
|
||||||
|
the local languages (in OSM: the name tag without any language suffix) is
|
||||||
|
used.
|
||||||
|
|
||||||
|
|
||||||
|
#### NOMINATIM_SEARCH_BATCH_MODE
|
||||||
|
|
||||||
|
| Summary | |
|
||||||
|
| -------------- | --------------------------------------------------- |
|
||||||
|
| **Description:** | Enable a special batch query mode |
|
||||||
|
| **Format:** | boolean |
|
||||||
|
| **Default:** | no |
|
||||||
|
| **After Changes:** | run `nominatim refresh --website` |
|
||||||
|
|
||||||
|
This feature is currently undocumented and potentially broken.
|
||||||
|
|
||||||
|
|
||||||
|
#### NOMINATIM_SEARCH_NAME_ONLY_THRESHOLD
|
||||||
|
|
||||||
|
| Summary | |
|
||||||
|
| -------------- | --------------------------------------------------- |
|
||||||
|
| **Description:** | Threshold for switching the search index lookup strategy |
|
||||||
|
| **Format:** | integer |
|
||||||
|
| **Default:** | 500 |
|
||||||
|
| **After Changes:** | run `nominatim refresh --website` |
|
||||||
|
|
||||||
|
This setting defines the threshold over which a name is no longer considered
|
||||||
|
as rare. When searching for places with rare names, only the name is used
|
||||||
|
for place lookups. Otherwise the name and any address information is used.
|
||||||
|
|
||||||
|
This setting only has an effect after `nominatim refresh --word-counts` has
|
||||||
|
been called to compute the word frequencies.
|
||||||
|
|
||||||
|
|
||||||
|
#### NOMINATIM_LOOKUP_MAX_COUNT
|
||||||
|
|
||||||
|
| Summary | |
|
||||||
|
| -------------- | --------------------------------------------------- |
|
||||||
|
| **Description:** | Maximum number of OSM ids accepted by /lookup |
|
||||||
|
| **Format:** | integer |
|
||||||
|
| **Default:** | 50 |
|
||||||
|
| **After Changes:** | run `nominatim refresh --website` |
|
||||||
|
|
||||||
|
The /lookup point accepts list of ids to look up address details for. This
|
||||||
|
setting restricts the number of places a user may look up with a single
|
||||||
|
request.
|
||||||
|
|
||||||
|
|
||||||
|
#### NOMINATIM_POLYGON_OUTPUT_MAX_TYPES
|
||||||
|
|
||||||
|
| Summary | |
|
||||||
|
| -------------- | --------------------------------------------------- |
|
||||||
|
| **Description:** | Number of different geometry formats that may be returned |
|
||||||
|
| **Format:** | integer |
|
||||||
|
| **Default:** | 1 |
|
||||||
|
| **After Changes:** | run `nominatim refresh --website` |
|
||||||
|
|
||||||
|
Nominatim supports returning full geometries of places. The geometries may
|
||||||
|
be requested in different formats with one of the
|
||||||
|
[`polygon_*` parameters](../api/Search.md#polygon-output). Use this
|
||||||
|
setting to restrict the number of geometry types that may be requested
|
||||||
|
with a single query.
|
||||||
|
|
||||||
|
Setting this parameter to 0 disables polygon output completely.
|
||||||
|
|
||||||
|
### Logging Settings

#### NOMINATIM_LOG_DB

| Summary            |                                                     |
| ------------------ | --------------------------------------------------- |
| **Description:**   | Log requests into the database                      |
| **Format:**        | boolean                                             |
| **Default:**       | no                                                  |
| **After Changes:** | run `nominatim refresh --website`                   |

Enable logging of requests into a database table with this setting. The logs
can be found in the table `new_query_log`.

When using this logging method, it is advisable to set up a job that
regularly clears out old logging information. Nominatim will not do that
on its own.

Can be used at the same time as NOMINATIM_LOG_FILE.

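A clean-up job can be a simple SQL statement run regularly, e.g. from cron.
The sketch below assumes that `new_query_log` has a `starttime` column; check
the actual table schema of your installation first:

```sql
-- Remove log entries older than 30 days.
DELETE FROM new_query_log WHERE starttime < now() - INTERVAL '30 days';
```
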
#### NOMINATIM_LOG_FILE

| Summary            |                                                     |
| ------------------ | --------------------------------------------------- |
| **Description:**   | Log requests into a file                            |
| **Format:**        | path                                                |
| **Default:**       | _empty_ (logging disabled)                          |
| **After Changes:** | run `nominatim refresh --website`                   |

Enable logging of requests into a file by setting this to the name of the
log file. A relative file name is assumed to be relative to
the project directory.

The entries in the log file have the following format:

    <request time> <execution time in s> <number of results> <type> "<query string>"

The request time is the time when the request was started. The execution time
is given in seconds and corresponds to the time the query took executing in
PHP. The type field contains the name of the endpoint used.

Can be used at the same time as NOMINATIM_LOG_DB.

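A hypothetical entry following this format, with all values invented for
illustration, would look like:

    2021-09-20 09:33:27 0.164 5 search "pilotengasse, wien"
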
# Special phrases

## Importing OSM user-maintained special phrases

As described in the [Import section](../admin/Import.md), it is possible to
import special phrases from the wiki with the following command:

```sh
nominatim special-phrases --import-from-wiki
```

## Importing custom special phrases

It is also possible to import phrases from a CSV file with the following
command:

```sh
nominatim special-phrases --import-from-csv <csv file>
```

Note that the two import commands above replace the phrases in your database.
This means that if you import some phrases from a CSV file, only the phrases
present in the CSV file will be kept in the database. All other phrases will
be removed.

If you only want to add new phrases without updating the existing ones, add
the argument `--no-replace` to the import command. For example:

```sh
nominatim special-phrases --import-from-csv <csv file> --no-replace
```

This will add the phrases present in the CSV file to the database without
removing the other ones.

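For illustration, such a CSV file might look like the sketch below. The column
layout `phrase,class,type,operator,plural` is an assumption made for this
example; check the format expected by your Nominatim version before importing:

```
phrase,class,type,operator,plural
zip line,aerialway,zip_line,-,N
zip lines,aerialway,zip_line,-,Y
```
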
# Installing TIGER housenumber data for the US

Nominatim is able to use the official [TIGER](https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
address set to complement the OSM house number data in the US. You can add
TIGER data to your own Nominatim instance by following these steps. The
entire US adds about 10GB to your database.

1. Get the preprocessed TIGER 2021 data:

        cd $PROJECT_DIR
        wget https://nominatim.org/data/tiger2021-nominatim-preprocessed.csv.tar.gz

2. Import the data into your Nominatim database:

        nominatim add-data --tiger-data tiger2021-nominatim-preprocessed.csv.tar.gz

3. Enable use of the Tiger data in your `.env` by adding:

        echo NOMINATIM_USE_US_TIGER_DATA=yes >> .env

4. Apply the new settings:

        nominatim refresh --functions

See the [TIGER-data project](https://github.com/osm-search/TIGER-data) for more
information on how the data was preprocessed.

    NOMINATIM_DATABASE_MODULE_PATH=<path to directory where nominatim.so resides>

This is in particular useful when the database runs on a different server.
See [Advanced installations](../admin/Advanced-Installations.md#importing-nominatim-to-an-external-postgresql-database) for details.

There are no other configuration options for the legacy tokenizer. All
normalization functions are hard-coded.

## ICU tokenizer

!!! danger
    This tokenizer is currently in active development and still subject
    to backwards-incompatible changes.

The ICU tokenizer uses the [ICU library](http://site.icu-project.org/) to
normalize names and queries. It also offers configurable decomposition and
abbreviation handling.

To enable the tokenizer add the following line to your project configuration:

```
NOMINATIM_TOKENIZER=icu
```

### How it works

On import the tokenizer processes names in the following three stages:

1. During the **Sanitizer step** incoming names are cleaned up and converted to
   **full names**. This step can be used to regularize spelling, split multi-name
   tags into their parts and tag names with additional attributes. See the
   [Sanitizers section](#sanitizers) below for available cleaning routines.
2. The **Normalization** part removes all information from the full names
   that is not relevant for search.
3. The **Token analysis** step takes the normalized full names and creates
   all transliterated variants under which the name should be searchable.
   See the [Token analysis](#token-analysis) section below for more
   information.

During query time, only normalization and transliteration are relevant.
An incoming query is first split into name chunks (this usually means splitting
the string at the commas) and each part is normalised and transliterated.
The result is used to look up places in the search index.

### Configuration

```
normalization:
    # ... normalization rules elided in this excerpt ...
transliteration:
    - !include /etc/nominatim/icu-rules/extended-unicode-to-asccii.yaml
    - ":: Ascii ()"
sanitizers:
    - step: split-name-list
token-analysis:
    - analyzer: generic
      variants:
          - !include icu-rules/variants-ca.yaml
          - words:
              - road -> rd
              - bridge -> bdge,br,brdg,bri,brg
```

The configuration file contains four sections:
`normalization`, `transliteration`, `sanitizers` and `token-analysis`.

#### Normalization and Transliteration

The normalization and transliteration sections each define a set of
ICU rules that are applied to the names.

The **normalisation** rules are applied after sanitation. They should remove
any information that is not relevant for search at all. Usual rules to be
applied here are: lower-casing, removal of special characters, cleanup of
spaces.

The **transliteration** rules are applied at the end of the tokenization
process to transfer the name into an ASCII representation. Transliteration can
be useful to allow for further fuzzy matching, especially between different
scripts.

Each section must contain a list of
[ICU transformation rules](https://unicode-org.github.io/icu/userguide/transforms/general/rules.html).
The rules are applied in the order in which they appear in the file.
You can also include additional rules from an external yaml file using the
`!include` tag. The included file may again include other files.

Note that ICU rules may contain characters that have a special meaning in
YAML syntax. You should therefore always enclose the ICU rules in
double-quotes.

#### Sanitizers

The sanitizers section defines an ordered list of functions that are applied
to the name and address tags before they are further processed by the tokenizer.
They allow cleaning up the tagging and bringing it to a standardized form more
suitable for building the search index.

!!! hint
    Sanitizers only have an effect on how the search index is built. They
    do not change the information about each place that is saved in the
    database. In particular, they have no influence on how the results are
    displayed. The returned results always show the original information as
    stored in the OpenStreetMap database.

Each entry describes one sanitizer to be applied. It has a
mandatory parameter `step` which gives the name of the sanitizer. Depending
on the type, it may have additional parameters to configure its operation.

The order of the list matters. The sanitizers are applied exactly in the order
that is configured. Each sanitizer works on the results of the previous one.

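A minimal configuration applying two of the shipped sanitizers described
below, one after the other, might look like this sketch:

```
sanitizers:
    - step: split-name-list
    - step: strip-brace-terms
```
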
The following is a list of sanitizers that are shipped with Nominatim.

##### split-name-list

::: nominatim.tokenizer.sanitizers.split_name_list
    selection:
        members: False
    rendering:
        heading_level: 6

##### strip-brace-terms

::: nominatim.tokenizer.sanitizers.strip_brace_terms
    selection:
        members: False
    rendering:
        heading_level: 6

##### tag-analyzer-by-language

::: nominatim.tokenizer.sanitizers.tag_analyzer_by_language
    selection:
        members: False
    rendering:
        heading_level: 6

#### Token Analysis

Token analyzers take a full name and transform it into one or more normalized
forms that are then saved in the search index. In its simplest form, the
analyzer only applies the transliteration rules. More complex analyzers
create additional spelling variants of a name. This is useful to handle
decomposition and abbreviation.

The ICU tokenizer may use different analyzers for different names. To select
the analyzer to be used, the name must be tagged with the `analyzer` attribute
by a sanitizer (see for example the
[tag-analyzer-by-language sanitizer](#tag-analyzer-by-language)).

The token-analysis section contains the list of configured analyzers. Each
analyzer must have an `id` parameter that uniquely identifies the analyzer.
The only exception is the default analyzer that is used when no special
analyzer was selected.

Different analyzer implementations may exist. To select the implementation,
the `analyzer` parameter must be set. Currently there is only one
implementation, `generic`, which is described in the following.

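Putting this together, a token-analysis section with a default analyzer and
one additional analyzer selected by id might look like the following sketch
(the `de` id and the variants file name are invented for illustration):

```
token-analysis:
    - analyzer: generic
    - id: de
      analyzer: generic
      variants:
          - !include icu-rules/variants-de.yaml
```
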
##### Generic token analyzer

The generic analyzer is able to create variants from a list of given
abbreviation and decomposition replacements. It takes one optional parameter
`variants` which lists the replacements to apply. If the section is
omitted, then the generic analyzer becomes a simple analyzer that only
applies the transliteration.

The variants section defines lists of replacements which create alternative
spellings of a name. To create the variants, a name is scanned from left to
right and the longest matching replacement is applied until the end of the
term. It is advisable to use the normalized form of
words in the configuration because then it is possible to change the
rules for normalization later without having to adapt the variant rules.

###### Decomposition

In its standard form, only full words match against the source. There
is a special notation to match the prefix and suffix of a word.
To avoid automatic decomposition, use the '|' notation; such a rule
simply changes "hauptstrasse" to "hauptstr" and "rote strasse" to "rote str".

###### Initial and final terms

It is also possible to restrict replacements to the beginning and end of a
name. A replacement anchored to the beginning of a name would, for example,
trigger for "south 45th street" but not for "the south beach restaurant".

###### Replacements vs. variants

The replacement syntax `source => target` works as a pure replacement. It changes
the name instead of creating a variant. To create an additional version, you'd
use the variant syntax `source -> target` instead, as seen in the configuration
example above.

# Database Layout

## Import tables

OSM data is initially imported using [osm2pgsql](https://osm2pgsql.org).
Nominatim uses its own data output style 'gazetteer', which differs from the
output style created for map rendering.

The import process creates the following tables:

![osm2pgsql tables](osm2pgsql-tables.svg)

The `planet_osm_*` tables are the usual backing tables for OSM data. Note
that Nominatim uses them to look up special relations and to find nodes on
ways.

The gazetteer style produces a single table `place` as output with the following
columns:

* `osm_type` - kind of OSM object (**N** - node, **W** - way, **R** - relation)
* `osm_id` - original OSM ID
* `class` - key of the principal tag defining the object type
* `type` - value of the principal tag defining the object type
* `name` - collection of tags that contain a name or reference
* `admin_level` - numerical value of the tagged administrative level
* `address` - collection of tags defining the address of an object
* `extratags` - collection of additional interesting tags that are not
  directly relevant for searching
* `geometry` - geometry of the object (in WGS84)

A single OSM object may appear multiple times in this table when it is tagged
with multiple tags that may constitute a principal tag. Take for example a
motorway bridge. In OSM, this would be a way which is tagged with
`highway=motorway` and `bridge=yes`. This way would appear in the `place` table
once with a `class` of `highway` and once with a `class` of `bridge`. Thus the
*unique key* for `place` is (`osm_type`, `osm_id`, `class`).

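As an illustration, the following SQL sketch lists objects that appear in the
place table under more than one class, such as the motorway bridge above:

```sql
-- Find OSM objects that were entered under several classes.
SELECT osm_type, osm_id, array_agg(class) AS classes
  FROM place
 GROUP BY osm_type, osm_id
HAVING count(*) > 1;
```
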
How raw OSM tags are mapped to the columns in the place table is to a certain
degree configurable. See [Customizing Import Styles](../customize/Import-Styles.md)
for more information.

## Search tables

The following tables carry all information needed to do the search:

![search tables](search-tables.svg)

The **placex** table is the central table that saves all information about the
searchable places in Nominatim. The basic columns are the same as for the
place table and have the same meaning. The placex table adds the following
additional columns:

* `place_id` - the internal unique ID to identify the place
* `partition` - the id to use with partitioned tables (see below)
* `geometry_sector` - a location hash used for geographically close ordering
* `parent_place_id` - the next higher place in the address hierarchy, only
  relevant for POI-type places (with rank 30)
* `linked_place_id` - place ID of the place this object has been merged with.
  When this ID is set, then the place is invisible for search.
* `importance` - measure of how well known the place is
* `rank_search`, `rank_address` - search and address rank (see [Customizing ranking](../customize/Ranking.md))
* `wikipedia` - the wikipedia page used for computing the importance of the place
* `country_code` - the country the place is located in
* `housenumber` - normalized house number, if the place has one
* `postcode` - computed postcode for the place
* `indexed_status` - processing status of the place (0 - ready, 1 - freshly inserted, 2 - needs updating, 100 - needs deletion)
* `indexed_date` - timestamp when the place was processed last
* `centroid` - a point feature for the place

The **location_property_osmline** table is a special table for
[address interpolations](https://wiki.openstreetmap.org/wiki/Addresses#Using_interpolation).
The columns have the same meaning and use as the columns with the same name in
the placex table. Only three columns are special:

* `startnumber` and `endnumber` - beginning and end of the number range
  for the interpolation
* `interpolationtype` - a string `odd`, `even` or `all` to indicate
  the interval between the numbers

Address interpolations are always ways in OSM, which is why there is no column
`osm_type`.

The **location_postcode** table holds computed centroids of all postcodes that
can be found in the OSM data. The meaning of the columns is again the same
as that of the placex table.

Every place needs an address, a set of surrounding places that describe the
location of the place. The set of address places is made up of OSM places
themselves. The **place_addressline** table cross-references for each place
all the places that make up its address. Two columns define the address
relation:

* `place_id` - reference to the place being addressed
* `address_place_id` - reference to the place serving as an address part

Most of the columns cache information from the placex entry of the address
part. The exceptions are:

* `fromarea` - is true if the address part has an area geometry and can
  therefore be considered precise
* `isaddress` - is true if the address part should show up in the address
  output. Sometimes there are multiple places competing for the same address
  type (e.g. multiple cities) and this field resolves the tie.

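As a sketch, the address parts of a single place can be inspected with a query
like the following, where `12345` stands in for an actual place ID:

```sql
-- List the address parts of one place, most specific parts first.
SELECT a.address_place_id, p.class, p.type, a.isaddress, a.fromarea
  FROM place_addressline a
  JOIN placex p ON p.place_id = a.address_place_id
 WHERE a.place_id = 12345
 ORDER BY p.rank_address DESC;
```
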
The **search_name** table contains the search index proper. It saves for each
place the terms with which the place can be found. The terms are split into
the name itself and all terms that make up the address. The table mirrors some
of the columns from placex for faster lookup.

Search terms are not saved as strings. Each term is assigned an integer and those
integers are saved in the name and address vectors of the search_name table. The
**word** table serves as the lookup table from string to such a word ID. The
exact content of the word table depends on the [tokenizer](Tokenizers.md) used.

## Address computation tables

Next to the main search tables, there is a set of secondary helper tables used
to compute the address relations between places. These tables are partitioned.
Each country is assigned a partition number in the country_name table (see
below) and the data is then split between a set of tables, one for each
partition. Note that Nominatim still manually manages partitioned tables.
Native support for partitions in PostgreSQL only became usable with version 13.
It will be a little while before Nominatim drops support for older versions.

![address tables](address-tables.svg)

The **search_name_X** tables are used to look up streets that appear in the
`addr:street` tag.

The **location_area_large_X** tables are used to look up larger areas
(administrative boundaries and place nodes) either through their geographic
closeness or through `addr:*` entries.

The **location_road_X** tables are used to find the closest street for a
dependent place.

All three tables cache specific information from the placex table for their
selected subset of places:

* `keywords` and `name_vector` contain lists of term ids (from the word table)
  that the full name of the place should match against
* `isguess` is true for places that are not described by an area

All other columns reflect their counterpart in the placex table.

## Static data tables

Nominatim also creates a number of static tables at import:

* `nominatim_properties` saves settings that must not be changed after
  import
* `address_levels` saves the rank information from the
  [ranking configuration](../customize/Ranking.md)
* `country_name` contains a fallback of names for all countries, their
  default languages and saves the assignment of countries to partitions.
* `country_osm_grid` provides a fallback for country geometries

## Auxiliary data tables

Finally there are some tables for auxiliary data:

* `location_property_tiger` - saves house numbers from the Tiger import. Its
  layout is similar to that of `location_property_osmline`.
* `place_class_*` tables are helper tables to facilitate lookup of POIs
  by their class and type. They exist because it is not possible to create
  combined indexes with geometries.

The documentation is built with mkdocs:

* [mkdocs](https://www.mkdocs.org/) >= 1.1.2
* [mkdocstrings](https://mkdocstrings.github.io/)

### Installing prerequisites on Ubuntu/Debian

To install all necessary packages run:

```
sudo apt install php-cgi phpunit php-codesniffer \
                 python3-pip python3-setuptools python3-dev pylint

pip3 install --user behave mkdocs mkdocstrings pytest
```

The `mkdocs` executable will be located in `.local/bin`. You may have to add
this directory to your `PATH`.

Now you can start a webserver for local testing:

```
build> make serve-doc
[server:296] Serving on http://127.0.0.1:8000
[handlers:62] Start watching changes
```

If you develop inside a Vagrant virtual machine, use a port that is forwarded
to your host:

```
build> PYTHONPATH=$SRCDIR mkdocs serve --dev-addr 0.0.0.0:8088
[server:296] Serving on http://0.0.0.0:8088
[handlers:62] Start watching changes
```

# Indexing Places

In Nominatim, the word __indexing__ refers to the process that takes the raw
OpenStreetMap data from the place table, enriches it with address information
and creates the search indexes. This section explains the basic data flow.

## Initial import

After osm2pgsql has loaded the raw OSM data into the place table,
the data is copied to the final search tables placex and location_property_osmline.
While they are copied, some basic properties are added:

* country_code, geometry_sector and partition
* initial search and address rank

In addition the column `indexed_status` is set to `1`, marking the place as one
that needs to be indexed.

All this happens in the triggers `placex_insert` and `osmline_insert`.

## Indexing

The main work horse of the data import is the indexing step, where Nominatim
takes every place from the placex and location_property_osmline tables where
the indexed_status != 0 and computes the search terms and the address parts
of the place.

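In SQL terms, the pending work can be sketched as a query like the following
(a simplification; the actual indexer batches and orders the work itself):

```sql
-- Places that still need to be indexed, lowest ranks first.
SELECT place_id
  FROM placex
 WHERE indexed_status > 0
 ORDER BY rank_address, geometry_sector;
```
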
The indexing happens in three major steps:

1. **Data preparation** - The indexer gets the data for the place to be indexed
   from the database.

2. **Search name processing** - The prepared data is given to the
   tokenizer which computes the search terms from the names
   and potentially other information.

3. **Address processing** - The indexer then hands the prepared data and the
   tokenizer information back to the database via an `INSERT` statement which
   also sets the indexed_status to `0`. This triggers the update triggers
   `placex_update`/`osmline_update` which do the work of computing address
   parts and filling all the search tables.

When computing the address terms of a place, Nominatim relies on the processed
search names of all the address parts. That is why places are processed in rank
order, from smallest rank to largest. To ensure correct handling of linked
place nodes, administrative boundaries are processed before all other places.

Apart from these restrictions, each place can be indexed independently
from the others. This allows a large degree of parallelization during the indexing.
It also means that the indexing process can be interrupted at any time and
will simply pick up where it left off when restarted.

### Data preparation

The data preparation step computes and retrieves all data for a place that
might be needed for the next step of processing the search name. That includes

* location information (country code)
* place classification (class, type, ranks)
* names (including names of linked places)
* address information (`addr:*` tags)

Data preparation is implemented in pl/PgSQL mostly in the functions
`placex_indexing_prepare()` and `get_interpolation_address()`.

#### `addr:*` tag inheritance

Nominatim has limited support for inheriting address tags from a building
to POIs inside the building. This only works when the address tags are on the
building outline. Any rank 30 object inside such a building or on its outline
inherits all address tags when it does not have any address tags of its own.

The inheritance is computed in the data preparation step.

### Search name processing

The prepared place information is handed to the tokenizer next. This is a
Python module responsible for processing the names from both name and address
terms and building up the word index from them. The process is explained in
more detail in the [Tokenizer chapter](Tokenizers.md).

### Address processing

Finally, the preprocessed place information and the results of the search name
processing are written back to the database. At this point the update triggers
of the placex/location_property_osmline tables take over and fill all the
dependent tables. This makes up the most work-intensive part of the indexing.

Nominatim distinguishes between dependent and independent places.
**Dependent places** are all places on rank 30: house numbers, POIs etc. These
places don't have a full address of their own. Instead they are attached to
a parent street or place and use the information of the parent for searching
and displaying information. Everything else is an **independent place**: streets,
parks, water bodies, suburbs, cities, states etc. They receive a full address
of their own.

The address processing for the two types of places is very different.

#### Independent places

To compute the address of an independent place, Nominatim searches for all
places that at least partially cover the place whose address is being computed.
For places with an area, that area is used to check for coverage. For place
nodes an artificial square area is computed according to the rank of
the place. The lower the rank, the larger the area. The `location_area_large_X`
tables are there to facilitate the lookup. All places that can function as
the address of another place are saved in those tables.

`addr:*` and `isin:*` tags are taken into account to compute the address, too.
Nominatim will give preference to places with the same name as in these tags
when looking for places in the vicinity. If there are no matching place names
at all, then the tags are at least added to the search index. That means that
the names will not be shown in the result as the 'address' of the place, but
searching by them still works.

Independent places are always added to the global search index `search_name`.

#### Dependent places

Dependent places skip the full address computation for performance reasons.
Instead they just find a parent place to attach themselves to.

![parenting of dependent places](parenting-flow.svg)

By default a POI
or house number will be attached to the closest street. That can be any major
or minor street indexed by Nominatim. In the default configuration that means
that it can attach itself to a footway, but only when the footway has a name.

When the dependent place has an `addr:street` tag, then Nominatim will first
try to find a street with the same name before falling back to the closest
street.

There are also addresses in OSM where the house number does not belong
to a street at all. These have an `addr:place` tag. For these places, Nominatim
tries to find a place with the given name among the indexed places with an
address rank between 16 and 25. If none is found, then the dependent place
is attached to the closest place in that category and the addr:place name is
added as an *unlisted* place, which indicates to Nominatim that it needs to add
it to the address output, no matter what. This special case is necessary to
cover addresses that don't really refer to an existing object.

When an address has both the `addr:street` and `addr:place` tags, then Nominatim
assumes that the `addr:place` tag in fact should be the city part of the address
and gives the POI the usual street number address.

Dependent places are only added to the global search index `search_name` when
they have either a name themselves or when they have address tags that are not
covered by the places that make up their address. The latter ensures that
addresses are always searchable by those address tags.

# Postcodes in Nominatim

The blog post
[Nominatim and Postcodes](https://www.openstreetmap.org/user/lonvia/diary/43143)
describes the handling implemented since Nominatim 3.1.

Postcode centroids (aka 'calculated postcodes') are generated by looking at all
postcodes of a country, grouping them and calculating the geometric centroid.
There is currently no logic to deal with extreme outliers (typos or other
mistakes in OSM data). There is also no check if a postcode adheres to a
country's format, e.g. if Swiss postcodes are 4 digits.

## Regularly updating calculated postcodes

The script to rerun the calculation is
`nominatim refresh --postcodes`
and runs once per night on nominatim.openstreetmap.org.

## Finding places that share a specific postcode

In the Nominatim database run:

```sql
SELECT address->'postcode' as pc,
       osm_type, osm_id, class, type,
       st_x(centroid) as lon, st_y(centroid) as lat
  FROM placex
 WHERE country_code='fr'
   AND upper(trim (both ' ' from address->'postcode')) = '33210';
```

Alternatively, on [Overpass](https://overpass-turbo.eu/) run the following query:

```
[out:json][timeout:250];
area["name"="France"]->.boundaryarea;
(
  nwr(area.boundaryarea)["addr:postcode"="33210"];
);
out body;
>;
out skel qt;
```

# Tokenizers

The tokenizer is the component of Nominatim that is responsible for
analysing names of OSM objects and queries. Nominatim provides different
tokenizers that use different strategies for normalisation. This page describes
how tokenizers are expected to work and the public API that needs to be
implemented when creating a new tokenizer. For information on how to configure
a specific tokenizer for a database see the
[tokenizer chapter in the Customization Guide](../customize/Tokenizers.md).

## Generic Architecture

### About Search Tokens

Search in Nominatim is organised around search tokens. Such a token represents
a string that can be part of the search query. Tokens are used so that the search
index does not need to be organised around strings. Instead the database saves
for each place which tokens match this place's name, address, house number etc.
To be able to distinguish between these different types of information stored
with the place, a search token also always has a certain type: name, house number,
postcode etc.

During search an incoming query is transformed into an ordered list of such
search tokens (or rather many lists, see below) and this list is then converted
into a database query to find the right place.

It is the core task of the tokenizer to create, manage and assign the search
tokens. The tokenizer is involved in two distinct operations:

* __at import time__: scanning names of OSM objects, normalizing them and
  building up the list of search tokens.
* __at query time__: scanning the query and returning the appropriate search
  tokens.

### Importing

The indexer is responsible for enriching an OSM object (or place) with all data
required for geocoding. It is split into two parts: the controller collects
the places that require updating, enriches the place information as required
and hands the place to Postgresql. The controller is part of the Nominatim
library written in Python. Within Postgresql, the `placex_update`
trigger is responsible for filling out all secondary tables with extra geocoding
information. This part is written in PL/pgSQL.

The tokenizer is involved in both parts. When the indexer prepares a place,
it hands it over to the tokenizer to inspect the names and create all the
search tokens applicable for the place. This usually involves updating the
tokenizer's internal token lists and creating a list of all token IDs for
the specific place. This list is later needed in the PL/pgSQL part where the
indexer needs to add the token IDs to the appropriate search tables. To be
able to communicate the list between the Python part and the pl/pgSQL trigger,
the `placex` table contains a special JSONB column `token_info` which is there
for the exclusive use of the tokenizer.

The Python part of the tokenizer returns structured information about the
tokens of a place to the indexer, which converts it to JSON and inserts it into
the `token_info` column. The content of the column is then handed to the PL/pgSQL
callbacks of the tokenizer which extract the required information. Usually
the tokenizer then removes all information from the `token_info` structure,
so that no information is ever persistently saved in the table. All information
that went in should have been processed after all and put into secondary tables.
This is however not a hard requirement. If the tokenizer needs to store
additional information about a place permanently, it may do so in the
`token_info` column. It just must never execute searches over it and
consequently not create any special indexes on it.

### Querying

At query time, Nominatim builds up multiple _interpretations_ of the search
query. Each of these interpretations is tried against the database in order
of the likelihood with which they match the search query. The first
interpretation that yields results wins.

The interpretations are encapsulated in the `SearchDescription` class. An
instance of this class is created by applying a sequence of
_search tokens_ to an initially empty SearchDescription. It is the
responsibility of the tokenizer to parse the search query and derive all
possible sequences of search tokens. To that end the tokenizer needs to parse
the search query and look up matching words in its own data structures.

## Tokenizer API

The following section describes the functions that need to be implemented
for a custom tokenizer implementation.

!!! warning
    This API is currently in early alpha status. While this API is meant to
    be a public API on which other tokenizers may be implemented, the API is
    far away from being stable at the moment.

### Directory Structure

Nominatim expects two files for a tokenizer:

* `nominatim/tokenizer/<NAME>_tokenizer.py` containing the Python part of the
  implementation
* `lib-php/tokenizer/<NAME>_tokenizer.php` with the PHP part of the
  implementation

where `<NAME>` is a unique name for the tokenizer consisting of only lower-case
letters, digits and underscore. A tokenizer also needs to install some SQL
functions. By convention, these should be placed in `lib-sql/tokenizer`.

If the tokenizer has a default configuration file, this should be saved in
`settings/<NAME>_tokenizer.<SUFFIX>`.

### Configuration and Persistence

Tokenizers may define custom settings for their configuration. All settings
must be prefixed with `NOMINATIM_TOKENIZER_`. Settings may be transient or
persistent. Transient settings are loaded from the configuration file when
Nominatim is started and may thus be changed at any time. Persistent settings
are tied to a database installation and must only be read during installation
time. If they are needed at runtime then they must be saved into the
`nominatim_properties` table and later loaded from there.

### The Python module

The Python module is expected to export a single factory function:

```python
def create(dsn: str, data_dir: Path) -> AbstractTokenizer
```

The `dsn` parameter contains the DSN of the Nominatim database. The `data_dir`
is a directory in the project directory that the tokenizer may use to save
database-specific data. The function must return the instance of the tokenizer
class as defined below.

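A minimal module skeleton might therefore look like the following sketch
(`MyTokenizer` is a hypothetical implementation; the abstract functions of the
base class still need to be filled in):

```python
# nominatim/tokenizer/my_tokenizer.py -- illustrative sketch only
from pathlib import Path

from nominatim.tokenizer.base import AbstractTokenizer


class MyTokenizer(AbstractTokenizer):
    """Hypothetical tokenizer; the abstract functions of
       AbstractTokenizer must be implemented here."""

    def __init__(self, dsn: str, data_dir: Path) -> None:
        self.dsn = dsn
        self.data_dir = data_dir


def create(dsn: str, data_dir: Path) -> AbstractTokenizer:
    """Factory function called by Nominatim to instantiate the tokenizer."""
    return MyTokenizer(dsn, data_dir)
```
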
### Python Tokenizer Class
|
||||||
|
|
||||||
|
All tokenizers must inherit from `nominatim.tokenizer.base.AbstractTokenizer`
|
||||||
|
and implement the abstract functions defined there.
|
||||||
|
|
||||||
|
::: nominatim.tokenizer.base.AbstractTokenizer
|
||||||
|
rendering:
|
||||||
|
heading_level: 4
|
||||||
|
|
||||||
|
### Python Analyzer Class
|
||||||
|
|
||||||
|
::: nominatim.tokenizer.base.AbstractAnalyzer
|
||||||
|
rendering:
|
||||||
|
heading_level: 4
|
||||||
|
|
||||||
|
### PL/pgSQL Functions
|
||||||
|
|
||||||
|
The tokenizer must provide access functions for the `token_info` column
|
||||||
|
to the indexer which extracts the necessary information for the global
|
||||||
|
search tables. If the tokenizer needs additional SQL functions for private
|
||||||
|
use, then these functions must be prefixed with `token_` in order to ensure
|
||||||
|
that there are no naming conflicts with the SQL indexer code.
|
||||||
|
|
||||||
|
The following functions are expected:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
FUNCTION token_get_name_search_tokens(info JSONB) RETURNS INTEGER[]
|
||||||
|
```
|
||||||
|
|
||||||
|
Return an array of token IDs of search terms that should match
|
||||||
|
the name(s) for the given place. These tokens are used to look up the place
|
||||||
|
by name and, where the place functions as part of an address for another place,
|
||||||
|
by address. Must return NULL when the place has no name.
|
||||||
|
|
||||||
|
```sql
|
||||||
|
FUNCTION token_get_name_match_tokens(info JSONB) RETURNS INTEGER[]
|
||||||
|
```
|
||||||
|
|
||||||
|
Return an array of token IDs of full names of the place that should be used
|
||||||
|
to match addresses. The list of match tokens is usually more strict than
|
||||||
|
search tokens as it is used to find a match between two OSM tag values which
|
||||||
|
are expected to contain matching full names. Partial terms should not be
|
||||||
|
used for match tokens. Must return NULL when the place has no name.
|
||||||
|
|
||||||
|
```sql
|
||||||
|
FUNCTION token_get_housenumber_search_tokens(info JSONB) RETURNS INTEGER[]
|
||||||
|
```
|
||||||
|
|
||||||
|
Return an array of token IDs of house number tokens that apply to the place.
|
||||||
|
Note that a place may have multiple house numbers, for example when apartments
|
||||||
|
each have their own number. Must be NULL when the place has no house numbers.
|
||||||
|
|
||||||
|
```sql
|
||||||
|
FUNCTION token_normalized_housenumber(info JSONB) RETURNS TEXT
|
||||||
|
```
|
||||||
|
|
||||||
|
Return the house number(s) in the normalized form that can be matched against
|
||||||
|
a house number token text. If a place has multiple house numbers they must
|
||||||
|
be listed with a semicolon as delimiter. Must be NULL when the place has no
|
||||||
|
house numbers.
|
||||||
|
|
||||||
|
```sql
|
||||||
|
FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[]) RETURNS BOOLEAN
|
||||||
|
```
|
||||||
|
|
||||||
|
Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
|
||||||
|
match against the `addr:street` tag name. Must return either NULL or FALSE
|
||||||
|
when the place has no `addr:street` tag.
|
||||||
|
|
||||||
|
```sql
|
||||||
|
FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[]) RETURNS BOOLEAN
|
||||||
|
```
|
||||||
|
|
||||||
|
Check if the given tokens (previously saved from `token_get_name_match_tokens()`)
|
||||||
|
match against the `addr:place` tag name. Must return either NULL or FALSE
|
||||||
|
when the place has no `addr:place` tag.
|
||||||
|
|
||||||
|
|
||||||
|
```sql
|
||||||
|
FUNCTION token_addr_place_search_tokens(info JSONB) RETURNS INTEGER[]
|
||||||
|
```
|
||||||
|
|
||||||
|
Return the search token IDs extracted from the `addr:place` tag. These tokens
|
||||||
|
are used for searches by address when no matching place can be found in the
|
||||||
|
database. Must be NULL when the place has no `addr:place` tag.
|
||||||
|
|
||||||
|
```sql
|
||||||
|
FUNCTION token_get_address_keys(info JSONB) RETURNS SETOF TEXT
|
||||||
|
```
|
||||||
|
|
||||||
|
Return the set of keys for which address information is provided. This
|
||||||
|
should correspond to the list of (relevant) `addr:*` tags with the `addr:`
|
||||||
|
prefix removed or the keys used in the `address` dictionary of the place info.
|
||||||
|
|
||||||
|
```sql
|
||||||
|
FUNCTION token_get_address_search_tokens(info JSONB, key TEXT) RETURNS INTEGER[]
|
||||||
|
```
|
||||||
|
|
||||||
|
Return the array of search tokens for the given address part. `key` can be
|
||||||
|
expected to be one of those returned with `token_get_address_keys()`. The
|
||||||
|
search tokens are added to the address search vector of the place, when no
|
||||||
|
corresponding OSM object could be found for the given address part from which
|
||||||
|
to copy the name information.
|
||||||
|
|
||||||
|
```sql
|
||||||
|
FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
|
||||||
|
```
|
||||||
|
|
||||||
|
Check if the given tokens match against the address part `key`.
|
||||||
|
|
||||||
|
__Warning:__ the tokens that are handed in are the lists previously saved
|
||||||
|
from `token_get_name_search_tokens()`, _not_ from the match token list. This
|
||||||
|
is an historical oddity which will be fixed at some point in the future.
|
||||||
|
Currently, tokenizers are encouraged to make sure that matching works against
|
||||||
|
both the search token list and the match token list.
|
||||||
|
|
||||||
|
```sql
|
||||||
|
FUNCTION token_normalized_postcode(postcode TEXT) RETURNS TEXT
|
||||||
|
```
|
||||||
|
|
||||||
|
Return the normalized version of the given postcode. This function must return
|
||||||
|
the same value as the Python function `AbstractAnalyzer->normalize_postcode()`.
|
||||||
|
|
||||||
|
```sql
|
||||||
|
FUNCTION token_strip_info(info JSONB) RETURNS JSONB
|
||||||
|
```
|
||||||
|
|
||||||
|
Return the part of the `token_info` field that should be stored in the database
|
||||||
|
permanently. The indexer calls this function when all processing is done and
|
||||||
|
replaces the content of the `token_info` column with the returned value before
|
||||||
|
the trigger stores the information in the database. May return NULL if no
|
||||||
|
information should be stored permanently.
|
||||||
|
|
||||||
|
### PHP Tokenizer class
|
||||||
|
|
||||||
|
The PHP tokenizer class is instantiated once per request and responsible for
|
||||||
|
analyzing the incoming query. Multiple requests may be in flight in
|
||||||
|
parallel.
|
||||||
|
|
||||||
|
The class is expected to be found under the
|
||||||
|
name of `\Nominatim\Tokenizer`. To find the class the PHP code includes the file
|
||||||
|
`tokenizer/tokenizer.php` in the project directory. This file must be created
|
||||||
|
when the tokenizer is first set up on import. The file should initialize any
|
||||||
|
configuration variables by setting PHP constants and then require the file
|
||||||
|
with the actual implementation of the tokenizer.
|
||||||
|
|
||||||
|
The tokenizer class must implement the following functions:
|
||||||
|
|
||||||
|
```php
|
||||||
|
public function __construct(object &$oDB)
|
||||||
|
```
|
||||||
|
|
||||||
|
The constructor of the class receives a database connection that can be used
|
||||||
|
to query persistent data in the database.
|
||||||
|
|
||||||
|
```php
|
||||||
|
public function checkStatus()
|
||||||
|
```
|
||||||
|
|
||||||
|
Check that the tokenizer can access its persistent data structures. If there
|
||||||
|
is an issue, throw an `\Exception`.
|
||||||
|
|
||||||
|
```php
|
||||||
|
public function normalizeString(string $sTerm) : string
|
||||||
|
```
|
||||||
|
|
||||||
|
Normalize string to a form to be used for comparisons when reordering results.
|
||||||
|
Nominatim reweighs results how well the final display string matches the actual
|
||||||
|
query. Before comparing result and query, names and query are normalised against
|
||||||
|
this function. The tokenizer can thus remove all properties that should not be
|
||||||
|
taken into account for reweighing, e.g. special characters or case.
|
||||||
|
|
||||||
|
```php
|
||||||
|
public function tokensForSpecialTerm(string $sTerm) : array
|
||||||
|
```
|
||||||
|
|
||||||
|
Return the list of special term tokens that match the given term.
|
||||||
|
|
||||||
|
```php
public function extractTokensFromPhrases(array &$aPhrases) : TokenList
```

Parse the given phrases, splitting them into word lists and retrieving the
matching tokens.

The phrase array may take on two forms. In unstructured searches (using the `q=`
parameter) the search query is split at the commas and the elements are
put into a sorted list. For structured searches the phrase array is an
associative array where the key designates the type of the term (street, city,
county, etc.). The tokenizer may ignore the phrase type at this stage of
parsing. Matching phrase type and appropriate search token type will be done
later, when the SearchDescription is built.

For each phrase in the list of phrases, the function must analyse the phrase
string and then call `setWordSets()` to communicate the result of the analysis.
A word set is a list of strings, where each string refers to a search token.
A phrase may have multiple interpretations, so a list of word sets is
usually attached to the phrase. The search tokens themselves are returned
by the function in an associative array, where the key corresponds to the
strings given in the word sets. The value is a list of search tokens. Thus
a single string in the list of word sets may refer to multiple search tokens.
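To make the structure concrete, here is a sketch of what the analysis of the
phrase `hauptstrasse 134` might produce; which segmentations actually appear
depends on the tokens present in the database:

```php
// Hypothetical word sets for the phrase 'hauptstrasse 134'.
$oPhrase->setWordSets(array(
    array('hauptstrasse 134'),     // interpreted as a single token
    array('hauptstrasse', '134'),  // split into two tokens
));

// The returned associative array then maps each such string to its
// search tokens, e.g. '134' may resolve to both a house-number token
// and a postcode token.
```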
35 docs/develop/address-tables.plantuml Normal file
@@ -0,0 +1,35 @@
@startuml
skinparam monochrome true
skinparam ObjectFontStyle bold

map search_name_X {
  place_id => BIGINT
  address_rank => SMALLINT
  name_vector => INT[]
  centroid => GEOMETRY
}

map location_area_large_X {
  place_id => BIGINT
  keywords => INT[]
  partition => SMALLINT
  rank_search => SMALLINT
  rank_address => SMALLINT
  country_code => VARCHAR(2)
  isguess => BOOLEAN
  postcode => TEXT
  centroid => POINT
  geometry => GEOMETRY
}

map location_road_X {
  place_id => BIGINT
  partition => SMALLINT
  country_code => VARCHAR(2)
  geometry => GEOMETRY
}

search_name_X -[hidden]> location_area_large_X
location_area_large_X -[hidden]> location_road_X

@enduml
47 docs/develop/address-tables.svg Normal file
File diff suppressed because one or more lines are too long (SVG image, 11 KiB)
44 docs/develop/osm2pgsql-tables.plantuml Normal file
@@ -0,0 +1,44 @@
@startuml
skinparam monochrome true
skinparam ObjectFontStyle bold

map planet_osm_nodes #eee {
  id => BIGINT
  lat => INT
  lon => INT
}

map planet_osm_ways #eee {
  id => BIGINT
  nodes => BIGINT[]
  tags => TEXT[]
}

map planet_osm_rels #eee {
  id => BIGINT
  parts => BIGINT[]
  members => TEXT[]
  tags => TEXT[]
  way_off => SMALLINT
  rel_off => SMALLINT
}

map place {
  osm_type => CHAR(1)
  osm_id => BIGINT
  class => TEXT
  type => TEXT
  name => HSTORE
  address => HSTORE
  extratags => HSTORE
  admin_level => SMALLINT
  geometry => GEOMETRY
}

planet_osm_nodes -[hidden]> planet_osm_ways
planet_osm_ways -[hidden]> planet_osm_rels
planet_osm_ways -[hidden]-> place

planet_osm_nodes::id <- planet_osm_ways::nodes

@enduml
58 docs/develop/osm2pgsql-tables.svg Normal file
File diff suppressed because one or more lines are too long (SVG image, 13 KiB)
31 docs/develop/parenting-flow.plantuml Normal file
@@ -0,0 +1,31 @@
@startuml
skinparam monochrome true

start

if (has 'addr:street'?) then (yes)
  if (street with that name\n nearby?) then (yes)
    :**Use closest street**
    **with same name**;
    kill
  else (no)
    :**Use closest**\n**street**;
    kill
  endif
elseif (has 'addr:place'?) then (yes)
  if (place with that name\n nearby?) then (yes)
    :**Use closest place**
    **with same name**;
    kill
  else (no)
    :add addr:place to address;
    :**Use closest place**\n**rank 16 to 25**;
    kill
  endif
else (otherwise)
  :**Use closest**\n**street**;
  kill
endif

@enduml
41 docs/develop/parenting-flow.svg Normal file
File diff suppressed because one or more lines are too long (SVG image, 9.8 KiB)
99 docs/develop/search-tables.plantuml Normal file
@@ -0,0 +1,99 @@
@startuml
skinparam monochrome true
skinparam ObjectFontStyle bold

left to right direction

map placex {
  place_id => BIGINT
  osm_type => CHAR(1)
  osm_id => BIGINT
  class => TEXT
  type => TEXT
  name => HSTORE
  address => HSTORE
  extratags => HSTORE
  admin_level => SMALLINT
  partition => SMALLINT
  geometry_sector => INT
  parent_place_id => BIGINT
  linked_place_id => BIGINT
  importance => DOUBLE
  rank_search => SMALLINT
  rank_address => SMALLINT
  wikipedia => TEXT
  country_code => VARCHAR(2)
  housenumber => TEXT
  postcode => TEXT
  indexed_status => SMALLINT
  indexed_date => TIMESTAMP
  centroid => GEOMETRY
  geometry => GEOMETRY
}

map search_name {
  place_id => BIGINT
  importance => DOUBLE
  search_rank => SMALLINT
  address_rank => SMALLINT
  name_vector => INT[]
  nameaddress_vector => INT[]
  country_code => VARCHAR(2)
  centroid => GEOMETRY
}

map word {
  word_id => INT
  word_token => TEXT
  ... =>
}

map location_property_osmline {
  place_id => BIGINT
  osm_id => BIGINT
  startnumber => INT
  endnumber => INT
  interpolationtype => TEXT
  address => HSTORE
  partition => SMALLINT
  geometry_sector => INT
  parent_place_id => BIGINT
  country_code => VARCHAR(2)
  postcode => TEXT
  indexed_status => SMALLINT
  indexed_date => TIMESTAMP
  linegeo => GEOMETRY
}

map place_addressline {
  place_id => BIGINT
  address_place_id => BIGINT
  distance => DOUBLE
  cached_rank_address => SMALLINT
  fromarea => BOOLEAN
  isaddress => BOOLEAN
}

map location_postcode {
  place_id => BIGINT
  postcode => TEXT
  parent_place_id => BIGINT
  rank_search => SMALLINT
  rank_address => SMALLINT
  indexed_status => SMALLINT
  indexed_date => TIMESTAMP
  geometry => GEOMETRY
}

placex::place_id <-- search_name::place_id
placex::place_id <-- place_addressline::place_id
placex::place_id <-- place_addressline::address_place_id

search_name::name_vector --> word::word_id
search_name::nameaddress_vector --> word::word_id

place_addressline -[hidden]> location_property_osmline
search_name -[hidden]> place_addressline
location_property_osmline -[hidden]-> location_postcode

@enduml
117 docs/develop/search-tables.svg Normal file
File diff suppressed because one or more lines are too long (SVG image, 35 KiB)
@@ -13,3 +13,11 @@ th, td {
 th {
     background-color: #eee;
 }
+
+
+/* Indentation for mkdocstrings.
+div.doc-contents:not(.first) {
+  padding-left: 25px;
+  border-left: 4px solid rgba(230, 230, 230);
+  margin-bottom: 60px;
+}*/

@@ -1,8 +1,10 @@
 Nominatim (from the Latin, 'by name') is a tool to search OSM data by name and address and to generate synthetic addresses of OSM points (reverse geocoding).

-This guide comes in three parts:
+This guide comes in four parts:

 * __[API reference](api/Overview.md)__ for users of Nominatim
 * __[Administration Guide](admin/Installation.md)__ for those who want
   to install their own Nominatim server
+* __[Customization Guide](customize/Overview.md)__ for those who want to
+  adapt their own installation to their special requirements
 * __[Developer's Guide](develop/overview.md)__ for developers of the software

@@ -19,18 +19,26 @@ pages:
     - 'Import' : 'admin/Import.md'
     - 'Update' : 'admin/Update.md'
     - 'Deploy' : 'admin/Deployment.md'
-    - 'Customize Imports' : 'admin/Customization.md'
-    - 'Tokenizers' : 'admin/Tokenizers.md'
     - 'Nominatim UI' : 'admin/Setup-Nominatim-UI.md'
     - 'Advanced Installations' : 'admin/Advanced-Installations.md'
+    - 'Maintenance' : 'admin/Maintenance.md'
     - 'Migration from older Versions' : 'admin/Migration.md'
     - 'Troubleshooting' : 'admin/Faq.md'
+- 'Customization Guide':
+    - 'Overview': 'customize/Overview.md'
+    - 'Import Styles': 'customize/Import-Styles.md'
+    - 'Configuration Settings': 'customize/Settings.md'
+    - 'Place Ranking' : 'customize/Ranking.md'
+    - 'Tokenizers' : 'customize/Tokenizers.md'
+    - 'Special Phrases': 'customize/Special-Phrases.md'
+    - 'External data: US housenumbers from TIGER': 'customize/Tiger.md'
+    - 'External data: Postcodes': 'customize/Postcodes.md'
 - 'Developers Guide':
-    - 'Setup for Development' : 'develop/Development-Environment.md'
     - 'Architecture Overview' : 'develop/overview.md'
-    - 'OSM Data Import' : 'develop/Import.md'
-    - 'Place Ranking' : 'develop/Ranking.md'
-    - 'Postcodes' : 'develop/Postcodes.md'
+    - 'Database Layout' : 'develop/Database-Layout.md'
+    - 'Indexing' : 'develop/Indexing.md'
+    - 'Tokenizers' : 'develop/Tokenizers.md'
+    - 'Setup for Development' : 'develop/Development-Environment.md'
     - 'Testing' : 'develop/Testing.md'
     - 'External Data Sources': 'develop/data-sources.md'
 - 'Appendix':
@@ -41,6 +49,15 @@ pages:
 markdown_extensions:
     - codehilite
     - admonition
+    - def_list
     - toc:
         permalink:
 extra_css: [extra.css, styles.css]
+
+plugins:
+    - search
+    - mkdocstrings:
+        handlers:
+          python:
+            rendering:
+              show_source: false
+              show_signature_annotations: false

@@ -127,7 +127,7 @@ class Debug

     public static function printSQL($sSQL)
     {
-        echo '<p><tt><font color="#aaa">'.$sSQL.'</font></tt></p>'."\n";
+        echo '<p><tt><font color="#aaa">'.htmlspecialchars($sSQL, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401).'</font></tt></p>'."\n";
     }

     private static function outputVar($mVar, $sPreNL)
@@ -170,11 +170,12 @@ class Debug
         }

         if (is_string($mVar)) {
-            echo "'$mVar'";
-            return strlen($mVar) + 2;
+            $sOut = "'$mVar'";
+        } else {
+            $sOut = (string)$mVar;
         }

-        echo (string)$mVar;
-        return strlen((string)$mVar);
+        echo htmlspecialchars($sOut, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401);
+        return strlen($sOut);
     }
 }

@@ -498,7 +498,6 @@ class Geocode
         if ($this->aCountryCodes) {
             $oCtx->setCountryList($this->aCountryCodes);
         }
-        $this->oTokenizer->setCountryRestriction($this->aCountryCodes);

         Debug::newSection('Query Preprocessing');

@@ -507,13 +506,6 @@ class Geocode
             userError('Query string is not UTF-8 encoded.');
         }

-        // Conflicts between US state abreviations and various words for 'the' in different languages
-        if (isset($this->aLangPrefOrder['name:en'])) {
-            $sQuery = preg_replace('/(^|,)\s*il\s*(,|$)/i', '\1illinois\2', $sQuery);
-            $sQuery = preg_replace('/(^|,)\s*al\s*(,|$)/i', '\1alabama\2', $sQuery);
-            $sQuery = preg_replace('/(^|,)\s*la\s*(,|$)/i', '\1louisiana\2', $sQuery);
-        }
-
         // Do we have anything that looks like a lat/lon pair?
         $sQuery = $oCtx->setNearPointFromQuery($sQuery);

@@ -9,29 +9,14 @@ namespace Nominatim;
  */
 class Phrase
 {
-    const MAX_WORDSET_LEN = 20;
-    const MAX_WORDSETS = 100;
-
-    // Complete phrase as a string.
+    // Complete phrase as a string (guaranteed to have no leading or trailing
+    // spaces).
     private $sPhrase;
     // Element type for structured searches.
     private $sPhraseType;
     // Possible segmentations of the phrase.
     private $aWordSets;

-    public static function cmpByArraylen($aA, $aB)
-    {
-        $iALen = count($aA);
-        $iBLen = count($aB);
-
-        if ($iALen == $iBLen) {
-            return 0;
-        }
-
-        return ($iALen < $iBLen) ? -1 : 1;
-    }
-
-
     public function __construct($sPhrase, $sPhraseType)
     {
         $this->sPhrase = trim($sPhrase);
@@ -57,6 +42,11 @@ class Phrase
         return $this->sPhraseType;
     }

+    public function setWordSets($aWordSets)
+    {
+        $this->aWordSets = $aWordSets;
+    }
+
     /**
      * Return the array of possible segmentations of the phrase.
      *
@@ -80,61 +70,6 @@ class Phrase
         }
     }

-    public function computeWordSets($aWords, $oTokens)
-    {
-        $iNumWords = count($aWords);
-
-        if ($iNumWords == 0) {
-            $this->aWordSets = null;
-            return;
-        }
-
-        // Caches the word set for the partial phrase up to word i.
-        $aSetCache = array_fill(0, $iNumWords, array());
-
-        // Initialise first element of cache. There can only be the word.
-        if ($oTokens->containsAny($aWords[0])) {
-            $aSetCache[0][] = array($aWords[0]);
-        }
-
-        // Now do the next elements using what we already have.
-        for ($i = 1; $i < $iNumWords; $i++) {
-            for ($j = $i; $j > 0; $j--) {
-                $sPartial = $j == $i ? $aWords[$j] : $aWords[$j].' '.$sPartial;
-                if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
-                    $aPartial = array($sPartial);
-                    foreach ($aSetCache[$j - 1] as $aSet) {
-                        if (count($aSet) < Phrase::MAX_WORDSET_LEN) {
-                            $aSetCache[$i][] = array_merge($aSet, $aPartial);
-                        }
-                    }
-                    if (count($aSetCache[$i]) > 2 * Phrase::MAX_WORDSETS) {
-                        usort(
-                            $aSetCache[$i],
-                            array('\Nominatim\Phrase', 'cmpByArraylen')
-                        );
-                        $aSetCache[$i] = array_slice(
-                            $aSetCache[$i],
-                            0,
-                            Phrase::MAX_WORDSETS
-                        );
-                    }
-                }
-            }
-
-            // finally the current full phrase
-            $sPartial = $aWords[0].' '.$sPartial;
-            if ($oTokens->containsAny($sPartial)) {
-                $aSetCache[$i][] = array($sPartial);
-            }
-        }
-
-        $this->aWordSets = $aSetCache[$iNumWords - 1];
-        usort($this->aWordSets, array('\Nominatim\Phrase', 'cmpByArraylen'));
-        $this->aWordSets = array_slice($this->aWordSets, 0, Phrase::MAX_WORDSETS);
-    }
-
-
     public function debugInfo()
     {
         return array(

@@ -111,6 +111,7 @@ class ReverseGeocode
             $sSQL .= ' FROM placex';
             $sSQL .= ' WHERE osm_type = \'N\'';
             $sSQL .= ' AND country_code = \''.$sCountryCode.'\'';
+            $sSQL .= ' AND rank_search < 26 '; // needed to select right index
             $sSQL .= ' AND rank_search between 5 and ' .min(25, $iMaxRank);
             $sSQL .= ' AND class = \'place\' AND type != \'postcode\'';
             $sSQL .= ' AND name IS NOT NULL ';
@@ -206,6 +207,7 @@ class ReverseGeocode
                 // for place nodes at rank_address 16
                 $sSQL .= ' AND rank_search > '.$iRankSearch;
                 $sSQL .= ' AND rank_search <= '.$iMaxRank;
+                $sSQL .= ' AND rank_search < 26 '; // needed to select right index
                 $sSQL .= ' AND rank_address > 0';
                 $sSQL .= ' AND class = \'place\'';
                 $sSQL .= ' AND type != \'postcode\'';

@@ -28,6 +28,8 @@ class SearchContext
     public $sqlViewboxLarge = '';
     /// Reference along a route (as SQL).
     public $sqlViewboxCentre = '';
+    /// List of countries to restrict search to (as array).
+    public $aCountryList = null;
     /// List of countries to restrict search to (as SQL).
     public $sqlCountryList = '';
     /// List of place IDs to exclude (as SQL).
@@ -187,6 +189,7 @@ class SearchContext
     public function setCountryList($aCountries)
     {
         $this->sqlCountryList = '('.join(',', array_map('addQuotes', $aCountries)).')';
+        $this->aCountryList = $aCountries;
     }

     /**
@@ -279,6 +282,19 @@ class SearchContext
         return '';
     }

+    /**
+     * Check if the given country is covered by the search context.
+     *
+     * @param string $sCountryCode  Country code of the country to check.
+     *
+     * @return True, if no country code restrictions are set or the
+     *         country is included in the country list.
+     */
+    public function isCountryApplicable($sCountryCode)
+    {
+        return $this->aCountryList === null || in_array($sCountryCode, $this->aCountryList);
+    }
+
     public function debugInfo()
     {
         return array(

@@ -19,6 +19,8 @@ class SearchDescription
     private $aName = array();
     /// True if the name is rare enough to force index use on name.
     private $bRareName = false;
+    /// True if the name requires to be accompanied by address terms.
+    private $bNameNeedsAddress = false;
     /// List of word ids making up the address of the object.
     private $aAddress = array();
     /// List of word ids that appear in the name but should be ignored.
@@ -113,6 +115,9 @@ class SearchDescription
                 return false;
             }
         }
+        if ($this->bNameNeedsAddress && empty($this->aAddress)) {
+            return false;
+        }

         return true;
     }
@@ -231,6 +236,7 @@ class SearchDescription
     {
         $this->aName[$iId] = $iId;
         $this->bRareName = $bRareName;
+        $this->bNameNeedsAddress = false;
     }

     /**
@@ -240,11 +246,19 @@ class SearchDescription
      * @param integer iID  ID of term to add.
      * @param bool bSearchable  Term should be used to search for result
      *                          (i.e. term is not a stop word).
+     * @param bool bNeedsAddress  True if the term is too unspecific to be used
+     *                            in a stand-alone search without an address
+     *                            to narrow down the search.
      * @param integer iPhraseNumber  Index of phrase, where the partial term
      *                               appears.
      */
-    public function addPartialNameToken($iId, $bSearchable, $iPhraseNumber)
+    public function addPartialNameToken($iId, $bSearchable, $bNeedsAddress, $iPhraseNumber)
     {
+        if (empty($this->aName)) {
+            $this->bNameNeedsAddress = $bNeedsAddress;
+        } else {
+            $this->bNameNeedsAddress &= $bNeedsAddress;
+        }
         if ($bSearchable) {
             $this->aName[$iId] = $iId;
         } else {
@@ -310,6 +324,7 @@ class SearchDescription
     {
         $this->aAddress = array_merge($this->aAddress, $this->aName);
         $this->bRareName = false;
+        $this->bNameNeedsAddress = true;
         $this->aName = array($iId => $iId);
         $this->iNamePhrase = -1;
     }
@@ -566,32 +581,37 @@ class SearchDescription

         // Sort by existence of the requested house number but only if not
         // too many results are expected for the street, i.e. if the result
-        // will be narrowed down by an address. Remeber that with ordering
+        // will be narrowed down by an address. Remember that with ordering
         // every single result has to be checked.
         if ($this->sHouseNumber && ($this->bRareName || !empty($this->aAddress) || $this->sPostcode)) {
-            $sHouseNumberRegex = '\\\\m'.$this->sHouseNumber.'\\\\M';
-            $aOrder[] = ' (';
-            $aOrder[0] .= 'EXISTS(';
-            $aOrder[0] .= ' SELECT place_id';
-            $aOrder[0] .= ' FROM placex';
-            $aOrder[0] .= ' WHERE parent_place_id = search_name.place_id';
-            $aOrder[0] .= " AND housenumber ~* E'".$sHouseNumberRegex."'";
-            $aOrder[0] .= ' LIMIT 1';
-            $aOrder[0] .= ') ';
-            // also housenumbers from interpolation lines table are needed
-            if (preg_match('/[0-9]+/', $this->sHouseNumber)) {
-                $iHouseNumber = intval($this->sHouseNumber);
-                $aOrder[0] .= 'OR EXISTS(';
-                $aOrder[0] .= ' SELECT place_id ';
-                $aOrder[0] .= ' FROM location_property_osmline ';
-                $aOrder[0] .= ' WHERE parent_place_id = search_name.place_id';
-                $aOrder[0] .= ' AND startnumber is not NULL';
-                $aOrder[0] .= ' AND '.$iHouseNumber.'>=startnumber ';
-                $aOrder[0] .= ' AND '.$iHouseNumber.'<=endnumber ';
-                $aOrder[0] .= ' LIMIT 1';
-                $aOrder[0] .= ')';
-            }
-            $aOrder[0] .= ') DESC';
+            $sHouseNumberRegex = $oDB->getDBQuoted('\\\\m'.$this->sHouseNumber.'\\\\M');
+
+            // Housenumbers on streets and places.
+            $sChildHnr = 'SELECT * FROM placex WHERE parent_place_id = search_name.place_id';
+            $sChildHnr .= ' AND housenumber ~* E'.$sHouseNumberRegex;
+
+            // Interpolations on streets and places.
+            if (preg_match('/^[0-9]+$/', $this->sHouseNumber)) {
+                $sIpolHnr = 'SELECT * FROM location_property_osmline ';
+                $sIpolHnr .= 'WHERE parent_place_id = search_name.place_id ';
+                $sIpolHnr .= ' AND startnumber is not NULL';
+                $sIpolHnr .= ' AND '.$this->sHouseNumber.'>=startnumber ';
+                $sIpolHnr .= ' AND '.$this->sHouseNumber.'<=endnumber ';
+            } else {
+                $sIpolHnr = false;
+            }
+
+            // Housenumbers on the object itself for unlisted places.
+            $sSelfHnr = 'SELECT * FROM placex WHERE place_id = search_name.place_id';
+            $sSelfHnr .= ' AND housenumber ~* E'.$sHouseNumberRegex;
+
+            $sSql = '(CASE WHEN address_rank = 30 THEN EXISTS('.$sSelfHnr.') ';
+            $sSql .= ' ELSE EXISTS('.$sChildHnr.') ';
+            if ($sIpolHnr) {
+                $sSql .= 'OR EXISTS('.$sIpolHnr.') ';
+            }
+            $sSql .= 'END) DESC';
+
+            $aOrder[] = $sSql;
         }

         if (!empty($this->aName)) {
@@ -624,7 +644,7 @@ class SearchDescription
             $aOrder[] = $this->oContext->distanceSQL('centroid');
         } elseif ($this->sPostcode) {
             if (empty($this->aAddress)) {
-                $aTerms[] = "EXISTS(SELECT place_id FROM location_postcode p WHERE p.postcode = '".$this->sPostcode."' AND ST_DWithin(search_name.centroid, p.geometry, 0.1))";
+                $aTerms[] = "EXISTS(SELECT place_id FROM location_postcode p WHERE p.postcode = '".$this->sPostcode."' AND ST_DWithin(search_name.centroid, p.geometry, 0.12))";
             } else {
                 $aOrder[] = "(SELECT min(ST_Distance(search_name.centroid, p.geometry)) FROM location_postcode p WHERE p.postcode = '".$this->sPostcode."')";
             }
@@ -719,9 +739,9 @@ class SearchDescription
             return $aResults;
         }

-        $sHouseNumberRegex = '\\\\m'.$this->sHouseNumber.'\\\\M';
+        $sHouseNumberRegex = $oDB->getDBQuoted('\\\\m'.$this->sHouseNumber.'\\\\M');
         $sSQL = 'SELECT place_id FROM placex WHERE';
-        $sSQL .= " housenumber ~* E'".$sHouseNumberRegex."'";
+        $sSQL .= ' housenumber ~* E'.$sHouseNumberRegex;
         $sSQL .= ' AND ('.join(' OR ', $aIDCondition).')';
        $sSQL .= $this->oContext->excludeSQL(' AND place_id');

131 lib-php/SimpleWordList.php Normal file
@@ -0,0 +1,131 @@
<?php

namespace Nominatim;

/**
 * A word list creator based on simple splitting by space.
 *
 * Creates possible permutations of split phrases by finding all combination
 * of splitting the phrase on space boundaries.
 */
class SimpleWordList
{
    const MAX_WORDSET_LEN = 20;
    const MAX_WORDSETS = 100;

    // The phrase as a list of simple terms (without spaces).
    private $aWords;

    /**
     * Create a new word list
     *
     * @param string sPhrase  Phrase to create the word list from. The phrase is
     *                        expected to be normalised, so that there are no
     *                        subsequent spaces.
     */
    public function __construct($sPhrase)
    {
        if (strlen($sPhrase) > 0) {
            $this->aWords = explode(' ', $sPhrase);
        } else {
            $this->aWords = array();
        }
    }

    /**
     * Get all possible tokens that are present in this word list.
     *
     * @return array The list of string tokens in the word list.
     */
    public function getTokens()
    {
        $aTokens = array();
        $iNumWords = count($this->aWords);

        for ($i = 0; $i < $iNumWords; $i++) {
            $sPhrase = $this->aWords[$i];
            $aTokens[$sPhrase] = $sPhrase;

            for ($j = $i + 1; $j < $iNumWords; $j++) {
                $sPhrase .= ' '.$this->aWords[$j];
                $aTokens[$sPhrase] = $sPhrase;
            }
        }

        return $aTokens;
    }

    /**
     * Compute all possible permutations of phrase splits that result in
     * words which are in the token list.
     */
    public function getWordSets($oTokens)
    {
        $iNumWords = count($this->aWords);

        if ($iNumWords == 0) {
            return null;
        }

        // Caches the word set for the partial phrase up to word i.
        $aSetCache = array_fill(0, $iNumWords, array());

        // Initialise first element of cache. There can only be the word.
        if ($oTokens->containsAny($this->aWords[0])) {
            $aSetCache[0][] = array($this->aWords[0]);
        }

        // Now do the next elements using what we already have.
        for ($i = 1; $i < $iNumWords; $i++) {
            for ($j = $i; $j > 0; $j--) {
                $sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial;
                if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) {
                    $aPartial = array($sPartial);
                    foreach ($aSetCache[$j - 1] as $aSet) {
                        if (count($aSet) < SimpleWordList::MAX_WORDSET_LEN) {
                            $aSetCache[$i][] = array_merge($aSet, $aPartial);
                        }
                    }
                    if (count($aSetCache[$i]) > 2 * SimpleWordList::MAX_WORDSETS) {
                        usort(
                            $aSetCache[$i],
                            array('\Nominatim\SimpleWordList', 'cmpByArraylen')
                        );
                        $aSetCache[$i] = array_slice(
                            $aSetCache[$i],
                            0,
                            SimpleWordList::MAX_WORDSETS
                        );
                    }
                }
            }

            // finally the current full phrase
            $sPartial = $this->aWords[0].' '.$sPartial;
            if ($oTokens->containsAny($sPartial)) {
                $aSetCache[$i][] = array($sPartial);
            }
        }

        $aWordSets = $aSetCache[$iNumWords - 1];
        usort($aWordSets, array('\Nominatim\SimpleWordList', 'cmpByArraylen'));
        return array_slice($aWordSets, 0, SimpleWordList::MAX_WORDSETS);
    }

    public static function cmpByArraylen($aA, $aB)
    {
        $iALen = count($aA);
        $iBLen = count($aB);

        if ($iALen == $iBLen) {
            return 0;
        }

        return ($iALen < $iBLen) ? -1 : 1;
    }

    public function debugInfo()
    {
        return $this->aWords;
    }
}
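For orientation, a small usage sketch of the new class; `$oValidTokens` stands
in for a `TokenList` whose `containsAny()` is assumed to know the tokens
`main`, `street` and `main street`:

```php
$oList = new \Nominatim\SimpleWordList('main street');

// All substring candidates: 'main', 'street', 'main street'.
$aTokens = $oList->getTokens();

// Possible segmentations, shortest word sets first:
// array(array('main street'), array('main', 'street'))
$aSets = $oList->getWordSets($oValidTokens);
```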
@@ -36,7 +36,9 @@ class Country
      */
     public function isExtendable($oSearch, $oPosition)
     {
-        return !$oSearch->hasCountry() && $oPosition->maybePhrase('country');
+        return !$oSearch->hasCountry()
+               && $oPosition->maybePhrase('country')
+               && $oSearch->getContext()->isCountryApplicable($this->sCountryCode);
     }

     /**
@@ -58,8 +58,8 @@ class HouseNumber
         // up of numbers, add a penalty
         $iSearchCost = 1;
         if (preg_match('/\\d/', $this->sToken) === 0
-            || preg_match_all('/[^0-9]/', $this->sToken, $aMatches) > 2) {
-            $iSearchCost++;
+            || preg_match_all('/[^0-9 ]/', $this->sToken, $aMatches) > 3) {
+            $iSearchCost += strlen($this->sToken) - 1;
         }
         if (!$oSearch->hasOperator(\Nominatim\Operator::NONE)) {
             $iSearchCost++;
@@ -90,6 +90,7 @@ class Partial
             $oNewSearch->addPartialNameToken(
                 $this->iId,
                 $this->iSearchNameCount < CONST_Max_Word_Frequency,
+                $this->iSearchNameCount > CONST_Search_NameOnlySearchFrequencyThreshold,
                 $oPosition->getPhrase()
             );
@@ -44,7 +44,10 @@ class SpecialTerm
      */
     public function isExtendable($oSearch, $oPosition)
     {
-        return !$oSearch->hasOperator() && $oPosition->isPhrase('');
+        return !$oSearch->hasOperator()
+               && $oPosition->isPhrase('')
+               && ($this->iOperator != \Nominatim\Operator::NONE
+                   || (!$oSearch->hasAddress() && !$oSearch->hasHousenumber() && !$oSearch->hasCountry()));
     }

     /**
@@ -66,8 +69,8 @@ class SpecialTerm
                 $iOp = \Nominatim\Operator::NAME;
             } else {
                 $iOp = \Nominatim\Operator::NEAR;
-            }
-            $iSearchCost += 2;
+                $iSearchCost += 2;
+            }
         } elseif (!$oPosition->isFirstToken() && !$oPosition->isLastToken()) {
             $iSearchCost += 2;
         }
@@ -1,34 +0,0 @@
<?php
@define('CONST_LibDir', dirname(dirname(__FILE__)));

require_once(CONST_LibDir.'/init-cmd.php');

ini_set('memory_limit', '800M');
ini_set('display_errors', 'stderr');

$aCMDOptions
 = array(
    'Import country language data from osm wiki',
    array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
    array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
    array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
    array('project-dir', '', 0, 1, 1, 1, 'realpath', 'Base directory of the Nominatim installation (default: .)'),
   );
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);

loadSettings($aCMDResult['project-dir'] ?? getcwd());
setupHTTPProxy();

if (true) {
    $sURL = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Country_Codes';
    $sWikiPageXML = file_get_contents($sURL);

    if (preg_match_all('#\\| ([a-z]{2}) \\|\\| [^|]+\\|\\| ([a-z,]+)#', $sWikiPageXML, $aMatches, PREG_SET_ORDER)) {
        foreach ($aMatches as $aMatch) {
            $aLanguages = explode(',', $aMatch[2]);
            foreach ($aLanguages as $i => $s) {
                $aLanguages[$i] = '"'.pg_escape_string($s).'"';
            }
            echo "UPDATE country_name set country_default_language_codes = '{".join(',', $aLanguages)."}' where country_code = '".pg_escape_string($aMatch[1])."';\n";
        }
    }
}
@@ -86,8 +86,13 @@ if (!$aResult['reverse-only']) {
     if ($bVerbose) {
         echo "\n";
     }
+
+    $oTokenizer = new \Nominatim\Tokenizer($oDB);
+
+    $aWords = $oTokenizer->mostFrequentWords(1000);
+
     $sSQL = 'SELECT word FROM word WHERE word is not null ORDER BY search_name_count DESC LIMIT 1000';
-    foreach ($oDB->getCol($sSQL) as $sWord) {
+    foreach ($aWords as $sWord) {
         if ($bVerbose) {
             echo "$sWord = ";
         }
@@ -1,21 +0,0 @@
<?php

$phpPhraseSettingsFile = $argv[1];
$jsonPhraseSettingsFile = dirname($phpPhraseSettingsFile).'/'.basename($phpPhraseSettingsFile, '.php').'.json';

if (file_exists($phpPhraseSettingsFile) && !file_exists($jsonPhraseSettingsFile)) {
    include $phpPhraseSettingsFile;

    $data = array();

    if (isset($aTagsBlacklist)) {
        $data['blackList'] = $aTagsBlacklist;
    }
    if (isset($aTagsWhitelist)) {
        $data['whiteList'] = $aTagsWhitelist;
    }

    $jsonFile = fopen($jsonPhraseSettingsFile, 'w');
    fwrite($jsonFile, json_encode($data));
    fclose($jsonFile);
}
@@ -2,13 +2,14 @@

 namespace Nominatim;

+require_once(CONST_LibDir.'/SimpleWordList.php');
+
 class Tokenizer
 {
     private $oDB;

     private $oNormalizer;
     private $oTransliterator;
-    private $aCountryRestriction;

     public function __construct(&$oDB)
     {
@@ -19,7 +20,7 @@ class Tokenizer

     public function checkStatus()
     {
-        $sSQL = 'SELECT word_id FROM word limit 1';
+        $sSQL = 'SELECT word_id FROM word WHERE word_id is not null limit 1';
         $iWordID = $this->oDB->getOne($sSQL);
         if ($iWordID === false) {
             throw new \Exception('Query failed', 703);
@@ -30,12 +31,6 @@ class Tokenizer
     }

-
-    public function setCountryRestriction($aCountries)
-    {
-        $this->aCountryRestriction = $aCountries;
-    }
-
     public function normalizeString($sTerm)
     {
         if ($this->oNormalizer === null) {
@@ -45,6 +40,15 @@ class Tokenizer
         return $this->oNormalizer->transliterate($sTerm);
     }

+
+    public function mostFrequentWords($iNum)
+    {
+        $sSQL = "SELECT word FROM word WHERE type = 'W'";
+        $sSQL .= " ORDER BY info->'count' DESC LIMIT ".$iNum;
+        return $this->oDB->getCol($sSQL);
+    }
+
+
     private function makeStandardWord($sTerm)
     {
         return trim($this->oTransliterator->transliterate(' '.$sTerm.' '));
@@ -88,13 +92,10 @@ class Tokenizer
             $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
             $sPhrase = $this->makeStandardWord($oPhrase->getPhrase());
             Debug::printVar('Phrase', $sPhrase);
-            if (strlen($sPhrase) > 0) {
-                $aWords = explode(' ', $sPhrase);
-                Tokenizer::addTokens($aTokens, $aWords);
-                $aWordLists[] = $aWords;
-            } else {
-                $aWordLists[] = array();
-            }
+
+            $oWordList = new SimpleWordList($sPhrase);
+            $aTokens = array_merge($aTokens, $oWordList->getTokens());
+            $aWordLists[] = $oWordList;
         }

         Debug::printVar('Tokens', $aTokens);
@@ -103,7 +104,7 @@ class Tokenizer
         $oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);

         foreach ($aPhrases as $iPhrase => $oPhrase) {
-            $oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
+            $oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens));
         }

         return $oValidTokens;
@@ -162,10 +163,7 @@ class Tokenizer

         switch ($aWord['type']) {
             case 'C':  // country name tokens
-                if ($aWord['word'] !== null
-                    && (!$this->aCountryRestriction
-                        || in_array($aWord['word'], $this->aCountryRestriction))
-                ) {
+                if ($aWord['word'] !== null) {
                     $oValidTokens->addToken(
                         $sTok,
                         new Token\Country($iId, $aWord['word'])
@@ -220,27 +218,4 @@ class Tokenizer
             }
         }
     }
-
-
-    /**
-     * Add the tokens from this phrase to the given list of tokens.
-     *
-     * @param string[] $aTokens List of tokens to append.
-     *
-     * @return void
-     */
-    private static function addTokens(&$aTokens, $aWords)
-    {
-        $iNumWords = count($aWords);
-
-        for ($i = 0; $i < $iNumWords; $i++) {
-            $sPhrase = $aWords[$i];
-            $aTokens[$sPhrase] = $sPhrase;
-
-            for ($j = $i + 1; $j < $iNumWords; $j++) {
-                $sPhrase .= ' '.$aWords[$j];
-                $aTokens[$sPhrase] = $sPhrase;
-            }
-        }
-    }
 }
@@ -2,12 +2,13 @@

 namespace Nominatim;

+require_once(CONST_LibDir.'/SimpleWordList.php');
+
 class Tokenizer
 {
     private $oDB;

     private $oNormalizer = null;
-    private $aCountryRestriction = null;

     public function __construct(&$oDB)
     {
@@ -37,12 +38,6 @@ class Tokenizer
     }

-
-    public function setCountryRestriction($aCountries)
-    {
-        $this->aCountryRestriction = $aCountries;
-    }
-
     public function normalizeString($sTerm)
     {
         if ($this->oNormalizer === null) {
@@ -53,6 +48,14 @@ class Tokenizer
     }

+
+    public function mostFrequentWords($iNum)
+    {
+        $sSQL = 'SELECT word FROM word WHERE word is not null ';
+        $sSQL .= 'ORDER BY search_name_count DESC LIMIT '.$iNum;
+        return $this->oDB->getCol($sSQL);
+    }
+
+
     public function tokensForSpecialTerm($sTerm)
     {
         $aResults = array();
@@ -92,6 +95,23 @@ class Tokenizer
             $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
             $sSQL .= 'make_standard_name(:' .$iPhrase.') as p'.$iPhrase.',';
             $aParams[':'.$iPhrase] = $oPhrase->getPhrase();
+
+            // Conflicts between US state abbreviations and various words
+            // for 'the' in different languages
+            switch (strtolower($oPhrase->getPhrase())) {
+                case 'il':
+                    $aParams[':'.$iPhrase] = 'illinois';
+                    break;
+                case 'al':
+                    $aParams[':'.$iPhrase] = 'alabama';
+                    break;
+                case 'la':
+                    $aParams[':'.$iPhrase] = 'louisiana';
+                    break;
+                default:
+                    $aParams[':'.$iPhrase] = $oPhrase->getPhrase();
+                    break;
+            }
         }
         $sSQL = substr($sSQL, 0, -1);

@@ -106,13 +126,14 @@ class Tokenizer
         $aWordLists = array();
         $aTokens = array();
         foreach ($aNormPhrases as $sPhrase) {
-            if (strlen($sPhrase) > 0) {
-                $aWords = explode(' ', $sPhrase);
-                Tokenizer::addTokens($aTokens, $aWords);
-                $aWordLists[] = $aWords;
-            } else {
-                $aWordLists[] = array();
-            }
+            $oWordList = new SimpleWordList($sPhrase);
+
+            foreach ($oWordList->getTokens() as $sToken) {
+                $aTokens[' '.$sToken] = ' '.$sToken;
+                $aTokens[$sToken] = $sToken;
+            }
+
+            $aWordLists[] = $oWordList;
         }

         Debug::printVar('Tokens', $aTokens);
@@ -121,7 +142,7 @@ class Tokenizer
         $oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery);

         foreach ($aPhrases as $iPhrase => $oPhrase) {
-            $oPhrase->computeWordSets($aWordLists[$iPhrase], $oValidTokens);
+            $oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens));
         }

         return $oValidTokens;
@@ -206,12 +227,7 @@ class Tokenizer
                 );
             }
         } elseif ($aWord['country_code']) {
-            // Filter country tokens that do not match restricted countries.
-            if (!$this->aCountryRestriction
-                || in_array($aWord['country_code'], $this->aCountryRestriction)
-            ) {
-                $oToken = new Token\Country($iId, $aWord['country_code']);
-            }
+            $oToken = new Token\Country($iId, $aWord['country_code']);
         } elseif ($aWord['word_token'][0] == ' ') {
             $oToken = new Token\Word(
                 $iId,
@@ -238,29 +254,4 @@ class Tokenizer
             }
         }
     }
-
-
-    /**
-     * Add the tokens from this phrase to the given list of tokens.
-     *
-     * @param string[] $aTokens List of tokens to append.
-     *
-     * @return void
-     */
-    private static function addTokens(&$aTokens, $aWords)
-    {
-        $iNumWords = count($aWords);
-
-        for ($i = 0; $i < $iNumWords; $i++) {
-            $sPhrase = $aWords[$i];
-            $aTokens[' '.$sPhrase] = ' '.$sPhrase;
-            $aTokens[$sPhrase] = $sPhrase;
-
-            for ($j = $i + 1; $j < $iNumWords; $j++) {
-                $sPhrase .= ' '.$aWords[$j];
-                $aTokens[' '.$sPhrase] = ' '.$sPhrase;
-                $aTokens[$sPhrase] = $sPhrase;
-            }
-        }
-    }
 }
@@ -223,11 +223,13 @@ BEGIN
                  OR placex.country_code = place.country_code)
         ORDER BY rank_address desc,
                  (place_addressline.place_id = in_place_id) desc,
-                 (fromarea and place.centroid is not null and not isaddress
-                  and (place.address is null or avals(name) && avals(place.address))
-                  and ST_Contains(geometry, place.centroid)) desc,
-                 isaddress desc, fromarea desc,
-                 distance asc, rank_search desc
+                 (CASE WHEN coalesce((avals(name) && avals(place.address)), False) THEN 2
+                       WHEN isaddress THEN 0
+                       WHEN fromarea
+                            and place.centroid is not null
+                            and ST_Contains(geometry, place.centroid) THEN 1
+                       ELSE -1 END) desc,
+                 fromarea desc, distance asc, rank_search desc
       LOOP
         -- RAISE WARNING '%',location;
         location_isaddress := location.rank_address != current_rank_address;
@@ -43,7 +43,7 @@ LANGUAGE plpgsql STABLE;


 -- find the parent road of the cut road parts
-CREATE OR REPLACE FUNCTION get_interpolation_parent(street INTEGER[], place INTEGER[],
+CREATE OR REPLACE FUNCTION get_interpolation_parent(token_info JSONB,
                                                     partition SMALLINT,
                                                     centroid GEOMETRY, geom GEOMETRY)
   RETURNS BIGINT
@@ -52,7 +52,7 @@ DECLARE
   parent_place_id BIGINT;
   location RECORD;
 BEGIN
-  parent_place_id := find_parent_for_address(street, place, partition, centroid);
+  parent_place_id := find_parent_for_address(token_info, partition, centroid);

   IF parent_place_id is null THEN
     FOR location IN SELECT place_id FROM placex
@@ -155,9 +155,8 @@ BEGIN
   NEW.interpolationtype = NEW.address->'interpolation';

   place_centroid := ST_PointOnSurface(NEW.linegeo);
-  NEW.parent_place_id = get_interpolation_parent(token_addr_street_match_tokens(NEW.token_info),
-                                                 token_addr_place_match_tokens(NEW.token_info),
-                                                 NEW.partition, place_centroid, NEW.linegeo);
+  NEW.parent_place_id = get_interpolation_parent(NEW.token_info, NEW.partition,
+                                                 place_centroid, NEW.linegeo);

   interpol_postcode := token_normalized_postcode(NEW.address->'postcode');
@@ -66,7 +66,7 @@ LANGUAGE plpgsql STABLE;

 CREATE OR REPLACE FUNCTION get_address_place(in_partition SMALLINT, feature GEOMETRY,
                                              from_rank SMALLINT, to_rank SMALLINT,
-                                             extent FLOAT, tokens INT[])
+                                             extent FLOAT, token_info JSONB, key TEXT)
   RETURNS nearfeaturecentr
   AS $$
 DECLARE
@@ -80,7 +80,7 @@ BEGIN
       FROM location_area_large_{{ partition }}
       WHERE geometry && ST_Expand(feature, extent)
             AND rank_address between from_rank and to_rank
-            AND tokens && keywords
+            AND token_matches_address(token_info, key, keywords)
       GROUP BY place_id, keywords, rank_address, rank_search, isguess, postcode, centroid
       ORDER BY bool_or(ST_Intersects(geometry, feature)), distance LIMIT 1;
     RETURN r;
@@ -148,18 +148,21 @@ LANGUAGE plpgsql;

 CREATE OR REPLACE FUNCTION getNearestNamedRoadPlaceId(in_partition INTEGER,
                                                       point GEOMETRY,
-                                                      isin_token INTEGER[])
+                                                      token_info JSONB)
   RETURNS BIGINT
   AS $$
 DECLARE
   parent BIGINT;
 BEGIN
+  IF not token_has_addr_street(token_info) THEN
+    RETURN NULL;
+  END IF;
+
 {% for partition in db.partitions %}
   IF in_partition = {{ partition }} THEN
     SELECT place_id FROM search_name_{{ partition }}
       INTO parent
-      WHERE name_vector && isin_token
+      WHERE token_matches_street(token_info, name_vector)
             AND centroid && ST_Expand(point, 0.015)
             AND address_rank between 26 and 27
       ORDER BY ST_Distance(centroid, point) ASC limit 1;
@@ -174,19 +177,22 @@ LANGUAGE plpgsql STABLE;

 CREATE OR REPLACE FUNCTION getNearestNamedPlacePlaceId(in_partition INTEGER,
                                                        point GEOMETRY,
-                                                       isin_token INTEGER[])
+                                                       token_info JSONB)
   RETURNS BIGINT
   AS $$
 DECLARE
   parent BIGINT;
 BEGIN
+  IF not token_has_addr_place(token_info) THEN
+    RETURN NULL;
+  END IF;
+
 {% for partition in db.partitions %}
   IF in_partition = {{ partition }} THEN
     SELECT place_id
       INTO parent
       FROM search_name_{{ partition }}
-      WHERE name_vector && isin_token
+      WHERE token_matches_place(token_info, name_vector)
            AND centroid && ST_Expand(point, 0.04)
            AND address_rank between 16 and 25
       ORDER BY ST_Distance(centroid, point) ASC limit 1;
@@ -247,6 +247,7 @@ BEGIN
         indexed_status = 2,
         geometry = NEW.geometry
         where place_id = existingplacex.place_id;
 
       -- if a node(=>house), which is part of a interpolation line, changes (e.g. the street attribute) => mark this line for reparenting
       -- (already here, because interpolation lines are reindexed before nodes, so in the second call it would be too late)
       IF NEW.osm_type='N'
@@ -270,6 +271,26 @@ BEGIN
             and x.class = p.class;
       END IF;
 
+      IF coalesce(existing.name::text, '') != coalesce(NEW.name::text, '')
+      THEN
+        IF existingplacex.rank_address between 26 and 27 THEN
+          -- When streets change their name, this may have an effect on POI objects
+          -- with addr:street tags.
+          UPDATE placex SET indexed_status = 2
+          WHERE indexed_status = 0 and address ? 'street'
+                and parent_place_id = existingplacex.place_id;
+          UPDATE placex SET indexed_status = 2
+          WHERE indexed_status = 0 and rank_search = 30 and address ? 'street'
+                and ST_DWithin(NEW.geometry, geometry, 0.002);
+        ELSEIF existingplacex.rank_address between 16 and 25 THEN
+          -- When places change their name, this may have an effect on POI objects
+          -- with addr:place tags.
+          UPDATE placex SET indexed_status = 2
+          WHERE indexed_status = 0 and address ? 'place' and rank_search = 30
+                and parent_place_id = existingplacex.place_id;
+          -- No update of surrounding objects, potentially too expensive.
+        END IF;
+      END IF;
     END IF;
 
     -- Abort the add (we modified the existing place instead)
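The name-change test above compares the hstore name columns via their text representation. For illustration only (hypothetical values):

    SELECT coalesce(('"name"=>"Old Rd"'::hstore)::text, '')
           != coalesce(('"name"=>"New Rd"'::hstore)::text, '');
    -- true, so dependent POIs get marked for reindexing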
@@ -1,27 +1,33 @@
 -- Trigger functions for the placex table.
 
+-- Information returned by update preparation.
+DROP TYPE IF EXISTS prepare_update_info CASCADE;
+CREATE TYPE prepare_update_info AS (
+  name HSTORE,
+  address HSTORE,
+  rank_address SMALLINT,
+  country_code TEXT,
+  class TEXT,
+  type TEXT,
+  linked_place_id BIGINT
+);
+
 -- Retrieve the data needed by the indexer for updating the place.
---
--- Return parameters:
---  name            list of names
---  address         list of address tags, either from the object or a surrounding
---                  building
---  country_feature If the place is a country feature, this contains the
---                  country code, otherwise it is null.
-CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
-                                                 OUT name HSTORE,
-                                                 OUT address HSTORE,
-                                                 OUT country_feature VARCHAR)
+CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex)
+  RETURNS prepare_update_info
   AS $$
+DECLARE
+  location RECORD;
+  result prepare_update_info;
 BEGIN
   -- For POI nodes, check if the address should be derived from a surrounding
   -- building.
   IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN
-    address := p.address;
+    result.address := p.address;
   ELSE
     -- The additional && condition works around the misguided query
     -- planner of postgis 3.0.
-    SELECT placex.address || hstore('_inherited', '') INTO address
+    SELECT placex.address || hstore('_inherited', '') INTO result.address
      FROM placex
     WHERE ST_Covers(geometry, p.centroid)
           and geometry && p.centroid
@@ -31,15 +37,26 @@ BEGIN
     LIMIT 1;
   END IF;
 
-  address := address - '_unlisted_place'::TEXT;
-  name := p.name;
+  result.address := result.address - '_unlisted_place'::TEXT;
+  result.name := p.name;
+  result.class := p.class;
+  result.type := p.type;
+  result.country_code := p.country_code;
+  result.rank_address := p.rank_address;
 
-  country_feature := CASE WHEN p.admin_level = 2
-                               and p.class = 'boundary' and p.type = 'administrative'
-                               and p.osm_type = 'R'
-                          THEN p.country_code
-                          ELSE null
-                     END;
+  -- Names of linked places need to be merged in, so search for a linkable
+  -- place already here.
+  SELECT * INTO location FROM find_linked_place(p);
+
+  IF location.place_id is not NULL THEN
+    result.linked_place_id := location.place_id;
+
+    IF NOT location.name IS NULL THEN
+      result.name := location.name || result.name;
+    END IF;
+  END IF;
+
+  RETURN result;
 END;
 $$
 LANGUAGE plpgsql STABLE;
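placex_indexing_prepare() now hands everything to the indexer in a single composite value. A sketch of a call site (the place_id is hypothetical):

    SELECT (placex_indexing_prepare(p)).*
      FROM placex p
     WHERE p.place_id = 12345;
    -- one row: name, address, rank_address, country_code, class, type, linked_place_id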
@@ -89,8 +106,7 @@ CREATE OR REPLACE FUNCTION find_parent_for_poi(poi_osm_type CHAR(1),
                                                poi_osm_id BIGINT,
                                                poi_partition SMALLINT,
                                                bbox GEOMETRY,
-                                               addr_street INTEGER[],
-                                               addr_place INTEGER[],
+                                               token_info JSONB,
                                                is_place_addr BOOLEAN)
   RETURNS BIGINT
   AS $$
@@ -104,8 +120,7 @@ BEGIN
     parent_place_id := find_associated_street(poi_osm_type, poi_osm_id);
 
     IF parent_place_id is null THEN
-      parent_place_id := find_parent_for_address(addr_street, addr_place,
-                                                 poi_partition, bbox);
+      parent_place_id := find_parent_for_address(token_info, poi_partition, bbox);
     END IF;
 
     IF parent_place_id is null and poi_osm_type = 'N' THEN
@@ -318,13 +333,14 @@ BEGIN
     WHERE s.place_id = parent_place_id;
 
   FOR addr_item IN
-    SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
-      FROM token_get_address_tokens(token_info)
-     WHERE not search_tokens <@ parent_address_vector
+    SELECT (get_addr_tag_rank(key, country)).*, key,
+           token_get_address_search_tokens(token_info, key) as search_tokens
+      FROM token_get_address_keys(token_info) as key
+     WHERE not token_get_address_search_tokens(token_info, key) <@ parent_address_vector
   LOOP
     addr_place := get_address_place(in_partition, geometry,
                                     addr_item.from_rank, addr_item.to_rank,
-                                    addr_item.extent, addr_item.match_tokens);
+                                    addr_item.extent, token_info, addr_item.key);
 
     IF addr_place is null THEN
       -- No place found in OSM that matches. Make it at least searchable.
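token_get_address_keys() only enumerates the addr:* keys; token arrays are fetched per key on demand. For example (hypothetical token_info):

    SELECT key
      FROM token_get_address_keys('{"addr": {"city": "{10}", "suburb": "{11}"}}'::jsonb) AS key;
    -- returns: city, suburb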
@@ -432,14 +448,16 @@ BEGIN
 
   FOR location IN
     SELECT (get_address_place(partition, geometry, from_rank, to_rank,
-                              extent, match_tokens)).*, search_tokens
-      FROM (SELECT (get_addr_tag_rank(key, country)).*, match_tokens, search_tokens
-              FROM token_get_address_tokens(token_info)) x
+                              extent, token_info, key)).*, key
+      FROM (SELECT (get_addr_tag_rank(key, country)).*, key
+              FROM token_get_address_keys(token_info) as key) x
     ORDER BY rank_address, distance, isguess desc
   LOOP
     IF location.place_id is null THEN
 {% if not db.reverse_only %}
-      nameaddress_vector := array_merge(nameaddress_vector, location.search_tokens);
+      nameaddress_vector := array_merge(nameaddress_vector,
+                                        token_get_address_search_tokens(token_info,
+                                                                        location.key));
 {% endif %}
     ELSE
 {% if not db.reverse_only %}
@@ -674,15 +692,14 @@ DECLARE
   parent_address_level SMALLINT;
   place_address_level SMALLINT;
 
-  addr_street INTEGER[];
-  addr_place INTEGER[];
 
   max_rank SMALLINT;
 
   name_vector INTEGER[];
   nameaddress_vector INTEGER[];
   addr_nameaddress_vector INTEGER[];
 
+  linked_place BIGINT;
+
   linked_node_id BIGINT;
   linked_importance FLOAT;
   linked_wikipedia TEXT;
@@ -718,9 +735,14 @@ BEGIN
 
   NEW.extratags := NEW.extratags - 'linked_place'::TEXT;
 
+  -- NEW.linked_place_id contains the precomputed linkee. Save this and restore
+  -- the previous link status.
+  linked_place := NEW.linked_place_id;
+  NEW.linked_place_id := OLD.linked_place_id;
+
   IF NEW.linked_place_id is not null THEN
     NEW.token_info := null;
-    {% if debug %}RAISE WARNING 'place already linked to %', NEW.linked_place_id;{% endif %}
+    {% if debug %}RAISE WARNING 'place already linked to %', OLD.linked_place_id;{% endif %}
     RETURN NEW;
   END IF;
 
@@ -838,8 +860,6 @@ BEGIN
   END IF;
 
   NEW.housenumber := token_normalized_housenumber(NEW.token_info);
-  addr_street := token_addr_street_match_tokens(NEW.token_info);
-  addr_place := token_addr_place_match_tokens(NEW.token_info);
 
   NEW.postcode := null;
 
@@ -885,7 +905,7 @@ BEGIN
       NEW.parent_place_id := find_parent_for_poi(NEW.osm_type, NEW.osm_id,
                                                  NEW.partition,
                                                  ST_Envelope(NEW.geometry),
-                                                 addr_street, addr_place,
+                                                 NEW.token_info,
                                                  is_place_address);
 
       -- If we found the road take a shortcut here.
@@ -956,8 +976,9 @@ BEGIN
   -- ---------------------------------------------------------------------------
   -- Full indexing
   {% if debug %}RAISE WARNING 'Using full index mode for % %', NEW.osm_type, NEW.osm_id;{% endif %}
-  SELECT * INTO location FROM find_linked_place(NEW);
-  IF location.place_id is not null THEN
+  IF linked_place is not null THEN
+    SELECT * INTO location FROM placex WHERE place_id = linked_place;
+
     {% if debug %}RAISE WARNING 'Linked %', location;{% endif %}
 
     -- Use the linked point as the centre point of the geometry,
@@ -974,11 +995,6 @@ BEGIN
       NEW.rank_address := location.rank_address;
     END IF;
 
-    -- merge in the label name
-    IF NOT location.name IS NULL THEN
-      NEW.name := location.name || NEW.name;
-    END IF;
-
     -- merge in extra tags
     NEW.extratags := hstore('linked_' || location.class, location.type)
                      || coalesce(location.extratags, ''::hstore)
@@ -215,13 +215,12 @@ LANGUAGE plpgsql STABLE;
 
 -- Find the parent of an address with addr:street/addr:place tag.
 --
--- \param street     Value of addr:street or NULL if tag is missing.
--- \param place      Value of addr:place or NULL if tag is missing.
+-- \param token_info Naming info with the address information.
 -- \param partition  Partition where to search the parent.
 -- \param centroid   Location of the address.
 --
 -- \return Place ID of the parent if one was found, NULL otherwise.
-CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEGER[],
+CREATE OR REPLACE FUNCTION find_parent_for_address(token_info JSONB,
                                                    partition SMALLINT,
                                                    centroid GEOMETRY)
   RETURNS BIGINT
@@ -229,30 +228,22 @@ CREATE OR REPLACE FUNCTION find_parent_for_address(street INTEGER[], place INTEG
 DECLARE
   parent_place_id BIGINT;
 BEGIN
-  IF street is not null THEN
-    -- Check for addr:street attributes
-    -- Note that addr:street links can only be indexed, once the street itself is indexed
-    parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, street);
-    IF parent_place_id is not null THEN
-      {% if debug %}RAISE WARNING 'Get parent form addr:street: %', parent_place_id;{% endif %}
-      RETURN parent_place_id;
-    END IF;
+  -- Check for addr:street attributes
+  parent_place_id := getNearestNamedRoadPlaceId(partition, centroid, token_info);
+  IF parent_place_id is not null THEN
+    {% if debug %}RAISE WARNING 'Get parent from addr:street: %', parent_place_id;{% endif %}
+    RETURN parent_place_id;
   END IF;
 
   -- Check for addr:place attributes.
-  IF place is not null THEN
-    parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, place);
-    IF parent_place_id is not null THEN
-      {% if debug %}RAISE WARNING 'Get parent form addr:place: %', parent_place_id;{% endif %}
-      RETURN parent_place_id;
-    END IF;
-  END IF;
-
-  RETURN NULL;
+  parent_place_id := getNearestNamedPlacePlaceId(partition, centroid, token_info);
+  {% if debug %}RAISE WARNING 'Get parent from addr:place: %', parent_place_id;{% endif %}
+  RETURN parent_place_id;
 END;
 $$
 LANGUAGE plpgsql STABLE;
 
 
 CREATE OR REPLACE FUNCTION delete_location(OLD_place_id BIGINT)
   RETURNS BOOLEAN
   AS $$
@@ -155,11 +155,11 @@ CREATE INDEX idx_placex_linked_place_id ON placex USING BTREE (linked_place_id)
 CREATE INDEX idx_placex_rank_search ON placex USING BTREE (rank_search, geometry_sector) {{db.tablespace.address_index}};
 CREATE INDEX idx_placex_geometry ON placex USING GIST (geometry) {{db.tablespace.search_index}};
 CREATE INDEX idx_placex_geometry_buildings ON placex
-  USING GIST (geometry) {{db.tablespace.search_index}}
+  USING {{postgres.spgist_geom}} (geometry) {{db.tablespace.search_index}}
   WHERE address is not null and rank_search = 30
         and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon');
 CREATE INDEX idx_placex_geometry_placenode ON placex
-  USING GIST (geometry) {{db.tablespace.search_index}}
+  USING {{postgres.spgist_geom}} (geometry) {{db.tablespace.search_index}}
   WHERE osm_type = 'N' and rank_search < 26
         and class = 'place' and type != 'postcode' and linked_place_id is null;
 CREATE INDEX idx_placex_wikidata on placex USING BTREE ((extratags -> 'wikidata')) {{db.tablespace.address_index}} WHERE extratags ? 'wikidata' and class = 'place' and osm_type = 'N' and rank_search < 26;
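{{postgres.spgist_geom}} is a template variable; presumably it expands to SPGIST where the server supports SP-GiST on geometry and falls back to GIST otherwise. After expansion (and with the tablespace clause omitted) the first index would read roughly:

    CREATE INDEX idx_placex_geometry_buildings ON placex
      USING SPGIST (geometry)
      WHERE address is not null and rank_search = 30
            and ST_GeometryType(geometry) in ('ST_Polygon','ST_MultiPolygon');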
@@ -14,7 +14,6 @@ DECLARE
   out_partition INTEGER;
   out_parent_place_id BIGINT;
   location RECORD;
-  address_street_word_ids INTEGER[];
 
 BEGIN
 
@@ -54,13 +53,9 @@ BEGIN
 
   place_centroid := ST_Centroid(linegeo);
   out_partition := get_partition('us');
-  out_parent_place_id := null;
 
-  address_street_word_ids := token_addr_street_match_tokens(token_info);
-  IF address_street_word_ids IS NOT NULL THEN
-    out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
-                                                      address_street_word_ids);
-  END IF;
+  out_parent_place_id := getNearestNamedRoadPlaceId(out_partition, place_centroid,
+                                                    token_info);
 
   IF out_parent_place_id IS NULL THEN
     SELECT getNearestParallelRoadFeature(out_partition, linegeo)
@@ -34,40 +34,59 @@
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
+  RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'street')::INTEGER[]
+  SELECT info->>'street' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
+  RETURNS BOOLEAN
+AS $$
+  SELECT info->>'place' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->>'street')::INTEGER[] <@ street_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
+  RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'place_match')::INTEGER[]
+  SELECT (info->>'place')::INTEGER[] <@ place_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
 CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
   RETURNS INTEGER[]
 AS $$
-  SELECT (info->>'place_search')::INTEGER[]
+  SELECT (info->>'place')::INTEGER[]
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-DROP TYPE IF EXISTS token_addresstoken CASCADE;
-CREATE TYPE token_addresstoken AS (
-  key TEXT,
-  match_tokens INT[],
-  search_tokens INT[]
-);
-
-CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
-  RETURNS SETOF token_addresstoken
+CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
+  RETURNS SETOF TEXT
 AS $$
-  SELECT key, (value->>1)::int[] as match_tokens,
-         (value->>0)::int[] as search_tokens
-  FROM jsonb_each(info->'addr');
+  SELECT * FROM jsonb_object_keys(info->'addr');
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->'addr'->>key)::INTEGER[];
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->'addr'->>key)::INTEGER[] <@ tokens;
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
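With the ICU layout, search and match tokens are the same array, so a single per-key lookup serves both purposes (values below are invented):

    SELECT token_get_address_search_tokens('{"addr": {"city": "{10,11}"}}'::jsonb, 'city');
    -- {10,11}
    SELECT token_matches_address('{"addr": {"city": "{10,11}"}}'::jsonb, 'city', ARRAY[10,11,12]);
    -- true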
@@ -127,15 +146,34 @@ BEGIN
         VALUES (term_id, term, 'w', json_build_object('count', term_count));
       END IF;
 
-      IF term_count < {{ max_word_freq }} THEN
-        partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
-      END IF;
+      partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
     END LOOP;
 END;
 $$
 LANGUAGE plpgsql;
 
 
+CREATE OR REPLACE FUNCTION getorcreate_partial_word(partial TEXT)
+  RETURNS INTEGER
+AS $$
+DECLARE
+  token INTEGER;
+BEGIN
+  SELECT min(word_id) INTO token
+    FROM word WHERE word_token = partial and type = 'w';
+
+  IF token IS NULL THEN
+    token := nextval('seq_word');
+    INSERT INTO word (word_id, word_token, type, info)
+      VALUES (token, partial, 'w', json_build_object('count', 0));
+  END IF;
+
+  RETURN token;
+END;
+$$
+LANGUAGE plpgsql;
+
+
 CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
   RETURNS INTEGER
 AS $$
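getorcreate_partial_word() reuses an existing partial-word entry or allocates a fresh id from seq_word. Typical usage:

    SELECT getorcreate_partial_word('main');
    -- returns the existing word_id for 'main', or inserts a new 'w' row with count 0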
@@ -34,17 +34,31 @@
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_has_addr_street(info JSONB)
+  RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'street')::INTEGER[]
+  SELECT info->>'street' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_has_addr_place(info JSONB)
+  RETURNS BOOLEAN
+AS $$
+  SELECT info->>'place_match' is not null;
+$$ LANGUAGE SQL IMMUTABLE;
+
+
+CREATE OR REPLACE FUNCTION token_matches_street(info JSONB, street_tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->>'street')::INTEGER[] && street_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
-  RETURNS INTEGER[]
+CREATE OR REPLACE FUNCTION token_matches_place(info JSONB, place_tokens INTEGER[])
+  RETURNS BOOLEAN
 AS $$
-  SELECT (info->>'place_match')::INTEGER[]
+  SELECT (info->>'place_match')::INTEGER[] && place_tokens
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
@@ -55,19 +69,24 @@ AS $$
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
-DROP TYPE IF EXISTS token_addresstoken CASCADE;
-CREATE TYPE token_addresstoken AS (
-  key TEXT,
-  match_tokens INT[],
-  search_tokens INT[]
-);
-
-CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
-  RETURNS SETOF token_addresstoken
+CREATE OR REPLACE FUNCTION token_get_address_keys(info JSONB)
+  RETURNS SETOF TEXT
 AS $$
-  SELECT key, (value->>1)::int[] as match_tokens,
-         (value->>0)::int[] as search_tokens
-  FROM jsonb_each(info->'addr');
+  SELECT * FROM jsonb_object_keys(info->'addr');
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_get_address_search_tokens(info JSONB, key TEXT)
+  RETURNS INTEGER[]
+AS $$
+  SELECT (info->'addr'->key->>0)::INTEGER[];
+$$ LANGUAGE SQL IMMUTABLE STRICT;
+
+
+CREATE OR REPLACE FUNCTION token_matches_address(info JSONB, key TEXT, tokens INTEGER[])
+  RETURNS BOOLEAN
+AS $$
+  SELECT (info->'addr'->key->>1)::INTEGER[] && tokens;
 $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 
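Note the deliberate difference between the two tokenizers: the ICU variant tests containment (<@) of the stored tokens in the candidate vector, while the legacy variant keeps the old overlap (&&) semantics and reads from two-element arrays under 'addr' (search tokens at index 0, match tokens at index 1). The operators compare as follows:

    SELECT ARRAY[1,2] <@ ARRAY[1,2,3];  -- true: every element contained
    SELECT ARRAY[1,9] && ARRAY[1,2,3];  -- true: at least one element overlaps
    SELECT ARRAY[1,9] <@ ARRAY[1,2,3];  -- false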
@@ -1,2 +1,3 @@
+-- Required for details lookup.
 CREATE INDEX IF NOT EXISTS idx_word_word_id
   ON word USING BTREE (word_id) {{db.tablespace.search_index}};
@@ -1,11 +0,0 @@
-DROP TABLE IF EXISTS word_frequencies;
-CREATE TABLE word_frequencies AS
-  SELECT unnest(name_vector) as id, count(*) FROM search_name GROUP BY id;
-
-CREATE INDEX idx_word_frequencies ON word_frequencies(id);
-
-UPDATE word SET search_name_count = count
-  FROM word_frequencies
-  WHERE word_token like ' %' and word_id = id;
-
-DROP TABLE word_frequencies;
@@ -1,6 +1,6 @@
 # Creates and installs manual page
 
-configure_file(${PROJECT_SOURCE_DIR}/manual/create-manpage.tmpl create_manpage.py)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/create-manpage.tmpl create_manpage.py)
 
 find_program(ARGPARSEMANPAGE argparse-manpage)
 
@@ -8,8 +8,8 @@ ADD_CUSTOM_TARGET(manpage
   COMMAND ${ARGPARSEMANPAGE} --pyfile ${CMAKE_CURRENT_BINARY_DIR}/create_manpage.py
           --function get_parser --project-name Nominatim
           --url https://nominatim.org > ${CMAKE_CURRENT_SOURCE_DIR}/nominatim.1
+          --author 'the Nominatim developer community'
+          --author-email info@nominatim.org
-  COMMAND sed -i '/.SH AUTHORS/I,+2 d' ${CMAKE_CURRENT_SOURCE_DIR}/nominatim.1
 )
 
 install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/nominatim.1 DESTINATION share/man/man1 )
@@ -6,7 +6,9 @@ nominatim
 [-h] {import,freeze,replication,special-phrases,add-data,index,refresh,admin,export,serve,search,reverse,lookup,details,status} ...
 .SH DESCRIPTION
 Command\-line tools for importing, updating, administrating and
+.br
 querying the Nominatim database.
+.br
 
 .SH OPTIONS
 
@@ -45,7 +47,7 @@ nominatim
 Start a simple web server for serving the API.
 .TP
 \fBnominatim\fR \fI\,search\/\fR
-Execute API search query.
+Execute a search query.
 .TP
 \fBnominatim\fR \fI\,reverse\/\fR
 Execute API reverse query.
@@ -66,6 +68,15 @@ usage: nominatim import [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
 [--index-noanalyse]
 
 Create a new Nominatim database from an OSM file.
+.br
+
+.br
+This sub\-command sets up a new Nominatim database from scratch starting
+.br
+with creating a new database in Postgresql. The user running this command
+.br
+needs superuser rights on the database.
+.br
 
 
 
@@ -88,7 +99,7 @@ Number of parallel threads to use
 
 .TP
 \fB\-\-osm\-file\fR FILE
-OSM file to be imported.
+OSM file to be imported (repeat for importing multiple files)
 
 .TP
 \fB\-\-continue\fR {load\-data,indexing,db\-postprocess}
@@ -116,19 +127,27 @@ Continue import even when errors in SQL are present
 
 .TP
 \fB\-\-index\-noanalyse\fR
-Do not perform analyse operations during index
+Do not perform analyse operations during index (expert only)
 
 .SH OPTIONS 'nominatim freeze'
 usage: nominatim freeze [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
 
 Make database read\-only.
+.br
 
+.br
 About half of data in the Nominatim database is kept only to be able to
+.br
 keep the data up\-to\-date with new changes made in OpenStreetMap. This
+.br
 command drops all this data and only keeps the part needed for geocoding
+.br
 itself.
+.br
 
+.br
 This command has the same effect as the `\-\-no\-updates` option for imports.
+.br
 
 
 
@@ -157,6 +176,33 @@ usage: nominatim replication [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
 [--socket-timeout SOCKET_TIMEOUT]
 
 Update the database using an online replication service.
+.br
+
+.br
+An OSM replication service is an online service that provides regular
+.br
+updates (OSM diff files) for the planet or update they provide. The OSMF
+.br
+provides the primary replication service for the full planet at
+.br
+https://planet.osm.org/replication/ but there are other providers of
+.br
+extracts of OSM data who provide such a service as well.
+.br
+
+.br
+This sub\-command allows to set up such a replication service and download
+.br
+and import updates at regular intervals. You need to call '\-\-init' once to
+.br
+set up the process or whenever you change the replication configuration
+.br
+parameters. Without any arguments, the sub\-command will go into a loop and
+.br
+continuously apply updates as they become available. Giving `\-\-once` just
+.br
+downloads and imports the next batch of updates.
+.br
 
 
 
@@ -195,7 +241,7 @@ Download and apply updates only once. When not set, updates are continuously app
 
 .TP
 \fB\-\-no\-index\fR
-Do not index the new data. Only applicable together with \-\-once
+Do not index the new data. Only usable together with \-\-once
 
 .TP
 \fB\-\-osm2pgsql\-cache\fR SIZE
@@ -203,13 +249,47 @@ Size of cache to be used by osm2pgsql (in MB)
 
 .TP
 \fB\-\-socket\-timeout\fR \fI\,SOCKET_TIMEOUT\/\fR
-Set timeout for file downloads.
+Set timeout for file downloads
 
 .SH OPTIONS 'nominatim special-phrases'
 usage: nominatim special-phrases [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
-[--import-from-wiki]
+[--import-from-wiki] [--import-from-csv FILE]
+[--no-replace]
 
 Import special phrases.
+.br
 
+.br
+Special phrases are search terms that narrow down the type of object
+.br
+that should be searched. For example, you might want to search for
+.br
+'Hotels in Barcelona'. The OSM wiki has a selection of special phrases
+.br
+in many languages, which can be imported with this command.
+.br
+
+.br
+You can also provide your own phrases in a CSV file. The file needs to have
+.br
+the following five columns:
+.br
+* phrase \- the term expected for searching
+.br
+* class \- the OSM tag key of the object type
+.br
+* type \- the OSM tag value of the object type
+.br
+* operator \- the kind of search to be done (one of: in, near, name, \-)
+.br
+* plural \- whether the term is a plural or not (Y/N)
+.br
+
+.br
+An example file can be found in the Nominatim sources at
+.br
+'test/testdb/full_en_phrases_test.csv'.
+.br
 
 
 
@@ -232,17 +312,48 @@ Number of parallel threads to use
 
 .TP
 \fB\-\-import\-from\-wiki\fR
-Import special phrases from the OSM wiki to the database.
+Import special phrases from the OSM wiki to the database
 
+.TP
+\fB\-\-import\-from\-csv\fR FILE
+Import special phrases from a CSV file
+
+.TP
+\fB\-\-no\-replace\fR
+Keep the old phrases and only add the new ones
+
 .SH OPTIONS 'nominatim add-data'
 usage: nominatim add-data [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
 (--file FILE | --diff FILE | --node ID | --way ID | --relation ID | --tiger-data DIR)
-[--use-main-api]
+[--use-main-api] [--osm2pgsql-cache SIZE]
+[--socket-timeout SOCKET_TIMEOUT]
 
 Add additional data from a file or an online source.
+.br
 
-Data is only imported, not indexed. You need to call `nominatim index`
-to complete the process.
+.br
+This command allows to add or update the search data in the database.
+.br
+The data can come either from an OSM file or single OSM objects can
+.br
+directly be downloaded from the OSM API. This function only loads the
+.br
+data into the database. Afterwards it still needs to be integrated
+.br
+in the search index. Use the `nominatim index` command for that.
+.br
+
+.br
+The command can also be used to add external non\-OSM data to the
+.br
+database. At the moment the only supported format is TIGER housenumber
+.br
+data. See the online documentation at
+.br
+https://nominatim.org/release\-docs/latest/admin/Import/#installing\-tiger\-housenumber\-data\-for\-the\-us
+.br
+for more information.
+.br
 
 
 
@@ -265,11 +376,11 @@ Number of parallel threads to use
 
 .TP
 \fB\-\-file\fR FILE
-Import data from an OSM file
+Import data from an OSM file or diff file
 
 .TP
 \fB\-\-diff\fR FILE
-Import data from an OSM diff file
+Import data from an OSM diff file (deprecated: use \-\-file)
 
 .TP
 \fB\-\-node\fR ID
@@ -285,18 +396,37 @@ Import a single relation from the API
 
 .TP
 \fB\-\-tiger\-data\fR DIR
-Add housenumbers from the US TIGER census database.
+Add housenumbers from the US TIGER census database
 
 .TP
 \fB\-\-use\-main\-api\fR
 Use OSM API instead of Overpass to download objects
 
+.TP
+\fB\-\-osm2pgsql\-cache\fR SIZE
+Size of cache to be used by osm2pgsql (in MB)
+
+.TP
+\fB\-\-socket\-timeout\fR \fI\,SOCKET_TIMEOUT\/\fR
+Set timeout for file downloads
+
 .SH OPTIONS 'nominatim index'
 usage: nominatim index [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
 [--boundaries-only] [--no-boundaries] [--minrank RANK]
 [--maxrank RANK]
 
 Reindex all new and modified data.
+.br
+
+.br
+Indexing is the process of computing the address and search terms for
+.br
+the places in the database. Every time data is added or changed, indexing
+.br
+needs to be run. Imports and replication updates automatically take care
+.br
+of indexing. For other cases, this function allows to run indexing manually.
+.br
 
 
 
@@ -341,8 +471,23 @@ usage: nominatim refresh [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
 [--enable-debug-statements]
 
 Recompute auxiliary data used by the indexing process.
+.br
 
-These functions must not be run in parallel with other update commands.
+.br
+This sub\-commands updates various static data and functions in the database.
+.br
+It usually needs to be run after changing various aspects of the
+.br
+configuration. The configuration documentation will mention the exact
+.br
+command to use in such case.
+.br
+
+.br
+Warning: the 'update' command must not be run in parallel with other update
+.br
+commands like 'replication' or 'add\-data'.
+.br
 
 
 
@@ -381,7 +526,7 @@ Update the PL/pgSQL functions in the database
 
 .TP
 \fB\-\-wiki\-data\fR
-Update Wikipedia/data importance numbers.
+Update Wikipedia/data importance numbers
 
 .TP
 \fB\-\-importance\fR
@@ -406,6 +551,7 @@ usage: nominatim admin [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
 [--osm-id OSM_ID | --place-id PLACE_ID]
 
 Analyse and maintain the database.
+.br
 
 
 
@@ -428,19 +574,19 @@ Number of parallel threads to use
 
 .TP
 \fB\-\-warm\fR
-Warm database caches for search and reverse queries.
+Warm database caches for search and reverse queries
 
 .TP
 \fB\-\-check\-database\fR
-Check that the database is complete and operational.
+Check that the database is complete and operational
 
 .TP
 \fB\-\-migrate\fR
-Migrate the database to a new software version.
+Migrate the database to a new software version
 
 .TP
 \fB\-\-analyse\-indexing\fR
-Print performance analysis of the indexing process.
+Print performance analysis of the indexing process
 
 .TP
 \fB\-\-search\-only\fR
@@ -468,6 +614,7 @@ usage: nominatim export [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
 [--restrict-to-osm-relation ID]
 
 Export addresses as CSV file from the database.
+.br
 
 
 
@@ -525,12 +672,19 @@ usage: nominatim serve [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
 [--server SERVER]
 
 Start a simple web server for serving the API.
+.br
 
+.br
 This command starts the built\-in PHP webserver to serve the website
+.br
 from the current project directory. This webserver is only suitable
-for testing and develop. Do not use it in production setups!
+.br
+for testing and development. Do not use it in production setups!
+.br
 
+.br
 By the default, the webserver can be accessed at: http://127.0.0.1:8088
+.br
 
 
 
@@ -568,7 +722,18 @@ usage: nominatim search [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
 [--exclude_place_ids ID,..] [--limit LIMIT]
 [--viewbox X1,Y1,X2,Y2] [--bounded] [--no-dedupe]
 
-Execute API search query.
+Execute a search query.
+.br
+
+.br
+This command works exactly the same as if calling the /search endpoint on
+.br
+the web API. See the online documentation for more details on the
+.br
+various parameters:
+.br
+https://nominatim.org/release\-docs/latest/api/Search/
+.br
 
 
 
@@ -623,15 +788,15 @@ Format of result
 
 .TP
 \fB\-\-addressdetails\fR
-Include a breakdown of the address into elements.
+Include a breakdown of the address into elements
 
 .TP
 \fB\-\-extratags\fR
-Include additional information if available (e.g. wikipedia link, opening hours).
+Include additional information if available (e.g. wikipedia link, opening hours)
 
 .TP
 \fB\-\-namedetails\fR
-Include a list of alternative names.
+Include a list of alternative names
 
 .TP
 \fB\-\-lang\fR LANGS, \fB\-\-accept\-language\fR LANGS
@@ -639,7 +804,7 @@ Preferred language order for presenting search results
 
 .TP
 \fB\-\-polygon\-output\fR {geojson,kml,svg,text}
-Output geometry of results as a GeoJSON, KML, SVG or WKT.
+Output geometry of results as a GeoJSON, KML, SVG or WKT
 
 .TP
 \fB\-\-polygon\-threshold\fR TOLERANCE
@@ -647,7 +812,7 @@ Simplify output geometry.Parameter is difference tolerance in degrees.
 
 .TP
 \fB\-\-countrycodes\fR CC,..
-Limit search results to one or more countries.
+Limit search results to one or more countries
 
 .TP
 \fB\-\-exclude_place_ids\fR ID,..
@@ -679,6 +844,17 @@ usage: nominatim reverse [-h] [-q] [-v] [--project-dir DIR] [-j NUM] --lat LAT
 [--polygon-threshold TOLERANCE]
 
 Execute API reverse query.
+.br
+
+.br
+This command works exactly the same as if calling the /reverse endpoint on
+.br
+the web API. See the online documentation for more details on the
+.br
+various parameters:
+.br
+https://nominatim.org/release\-docs/latest/api/Reverse/
+.br
 
 
 
@@ -717,15 +893,15 @@ Format of result
 
 .TP
 \fB\-\-addressdetails\fR
-Include a breakdown of the address into elements.
+Include a breakdown of the address into elements
 
 .TP
 \fB\-\-extratags\fR
-Include additional information if available (e.g. wikipedia link, opening hours).
+Include additional information if available (e.g. wikipedia link, opening hours)
 
 .TP
 \fB\-\-namedetails\fR
-Include a list of alternative names.
+Include a list of alternative names
 
 .TP
 \fB\-\-lang\fR LANGS, \fB\-\-accept\-language\fR LANGS
@@ -733,7 +909,7 @@ Preferred language order for presenting search results
 
 .TP
 \fB\-\-polygon\-output\fR {geojson,kml,svg,text}
-Output geometry of results as a GeoJSON, KML, SVG or WKT.
+Output geometry of results as a GeoJSON, KML, SVG or WKT
 
 .TP
 \fB\-\-polygon\-threshold\fR TOLERANCE
@@ -748,6 +924,17 @@ usage: nominatim lookup [-h] [-q] [-v] [--project-dir DIR] [-j NUM] --id OSMID
 [--polygon-threshold TOLERANCE]
 
 Execute API lookup query.
+.br
+
+.br
+This command works exactly the same as if calling the /lookup endpoint on
+.br
+the web API. See the online documentation for more details on the
+.br
+various parameters:
+.br
+https://nominatim.org/release\-docs/latest/api/Lookup/
+.br
 
 
 
@@ -778,15 +965,15 @@ Format of result
 
 .TP
 \fB\-\-addressdetails\fR
-Include a breakdown of the address into elements.
+Include a breakdown of the address into elements
 
 .TP
 \fB\-\-extratags\fR
-Include additional information if available (e.g. wikipedia link, opening hours).
+Include additional information if available (e.g. wikipedia link, opening hours)
 
 .TP
 \fB\-\-namedetails\fR
-Include a list of alternative names.
+Include a list of alternative names
 
 .TP
 \fB\-\-lang\fR LANGS, \fB\-\-accept\-language\fR LANGS
@@ -794,7 +981,7 @@ Preferred language order for presenting search results
 
 .TP
 \fB\-\-polygon\-output\fR {geojson,kml,svg,text}
-Output geometry of results as a GeoJSON, KML, SVG or WKT.
+Output geometry of results as a GeoJSON, KML, SVG or WKT
 
 .TP
 \fB\-\-polygon\-threshold\fR TOLERANCE
@@ -809,6 +996,17 @@ usage: nominatim details [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
 [--lang LANGS]
 
 Execute API details query.
+.br
+
+.br
+This command works exactly the same as if calling the /details endpoint on
+.br
+the web API. See the online documentation for more details on the
+.br
+various parameters:
+.br
+https://nominatim.org/release\-docs/latest/api/Details/
+.br
 
 
 
@@ -843,7 +1041,7 @@ Look up the OSM relation with the given ID.
 
 .TP
 \fB\-\-place_id\fR \fI\,PLACE_ID\/\fR, \fB\-p\fR \fI\,PLACE_ID\/\fR
-Database internal identifier of the OSM object to look up.
+Database internal identifier of the OSM object to look up
 
 .TP
 \fB\-\-class\fR \fI\,OBJECT_CLASS\/\fR
@@ -851,27 +1049,27 @@ Class type to disambiguated multiple entries of the same object.
 
 .TP
 \fB\-\-addressdetails\fR
-Include a breakdown of the address into elements.
+Include a breakdown of the address into elements
 
 .TP
 \fB\-\-keywords\fR
-Include a list of name keywords and address keywords.
+Include a list of name keywords and address keywords
 
 .TP
 \fB\-\-linkedplaces\fR
-Include a details of places that are linked with this one.
+Include a details of places that are linked with this one
 
 .TP
 \fB\-\-hierarchy\fR
-Include details of places lower in the address hierarchy.
+Include details of places lower in the address hierarchy
 
 .TP
 \fB\-\-group_hierarchy\fR
-Group the places by type.
+Group the places by type
 
 .TP
 \fB\-\-polygon_geojson\fR
-Include geometry of result.
+Include geometry of result
 
 .TP
 \fB\-\-lang\fR LANGS, \fB\-\-accept\-language\fR LANGS
@@ -882,6 +1080,17 @@ usage: nominatim status [-h] [-q] [-v] [--project-dir DIR] [-j NUM]
 [--format {text,json}]
 
 Execute API status query.
+.br
+
+.br
+This command works exactly the same as if calling the /status endpoint on
+.br
+the web API. See the online documentation for more details on the
+.br
+various parameters:
+.br
+https://nominatim.org/release\-docs/latest/api/Status/
+.br
 
 
 
@@ -906,6 +1115,9 @@ Number of parallel threads to use
 \fB\-\-format\fR {text,json}
 Format of result
 
+.SH AUTHORS
+.B Nominatim
+was written by the Nominatim developer community <info@nominatim.org>.
 .SH DISTRIBUTION
 The latest version of Nominatim may be downloaded from
 .UR https://nominatim.org
@@ -176,7 +176,7 @@ class AdminServe:
 
     This command starts the built-in PHP webserver to serve the website
     from the current project directory. This webserver is only suitable
-    for testing and develop. Do not use it in production setups!
+    for testing and development. Do not use it in production setups!
 
     By the default, the webserver can be accessed at: http://127.0.0.1:8088
     """
|||||||
@@ -3,6 +3,8 @@ Implementation of the 'add-data' subcommand.
 """
 import logging
 
+import psutil
+
 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
 # Using non-top-level imports to avoid eventually unused imports.
@@ -14,8 +16,17 @@ class UpdateAddData:
     """\
     Add additional data from a file or an online source.
 
-    Data is only imported, not indexed. You need to call `nominatim index`
-    to complete the process.
+    This command allows to add or update the search data in the database.
+    The data can come either from an OSM file or single OSM objects can
+    directly be downloaded from the OSM API. This function only loads the
+    data into the database. Afterwards it still needs to be integrated
+    in the search index. Use the `nominatim index` command for that.
+
+    The command can also be used to add external non-OSM data to the
+    database. At the moment the only supported format is TIGER housenumber
+    data. See the online documentation at
+    https://nominatim.org/release-docs/latest/admin/Import/#installing-tiger-housenumber-data-for-the-us
+    for more information.
     """
 
     @staticmethod
@@ -33,14 +44,14 @@ class UpdateAddData:
         group.add_argument('--relation', metavar='ID', type=int,
                            help='Import a single relation from the API')
         group.add_argument('--tiger-data', metavar='DIR',
-                           help='Add housenumbers from the US TIGER census database.')
+                           help='Add housenumbers from the US TIGER census database')
         group = parser.add_argument_group('Extra arguments')
         group.add_argument('--use-main-api', action='store_true',
                            help='Use OSM API instead of Overpass to download objects')
         group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
                            help='Size of cache to be used by osm2pgsql (in MB)')
         group.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
-                           help='Set timeout for file downloads.')
+                           help='Set timeout for file downloads')
 
     @staticmethod
     def run(args):
@@ -50,7 +61,8 @@ class UpdateAddData:
         if args.tiger_data:
             tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
             return tiger_data.add_tiger_data(args.tiger_data,
-                                             args.config, args.threads or 1,
+                                             args.config,
+                                             args.threads or psutil.cpu_count() or 1,
                                              tokenizer)
 
         osm2pgsql_params = args.osm2pgsql_options(default_cache=1000, default_threads=1)
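The new thread default deserves a note: both the --threads option and psutil.cpu_count() can be None, so the fallbacks chain with `or`. A minimal sketch of the idiom (the helper name is invented for illustration):

    import psutil

    def effective_threads(requested):
        # requested is None when --threads was not given on the command line;
        # psutil.cpu_count() may itself return None on some platforms,
        # so 1 is the final fallback.
        return requested or psutil.cpu_count() or 1

    assert effective_threads(4) == 4      # an explicit option always wins
    assert effective_threads(None) >= 1   # CPU count, or 1 as last resort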
@@ -23,13 +23,13 @@ class AdminFuncs:
         group = parser.add_argument_group('Admin tasks')
         objs = group.add_mutually_exclusive_group(required=True)
         objs.add_argument('--warm', action='store_true',
-                          help='Warm database caches for search and reverse queries.')
+                          help='Warm database caches for search and reverse queries')
         objs.add_argument('--check-database', action='store_true',
-                          help='Check that the database is complete and operational.')
+                          help='Check that the database is complete and operational')
         objs.add_argument('--migrate', action='store_true',
-                          help='Migrate the database to a new software version.')
+                          help='Migrate the database to a new software version')
         objs.add_argument('--analyse-indexing', action='store_true',
-                          help='Print performance analysis of the indexing process.')
+                          help='Print performance analysis of the indexing process')
         group = parser.add_argument_group('Arguments for cache warming')
         group.add_argument('--search-only', action='store_const', dest='target',
                            const='search',
@@ -4,6 +4,7 @@ Subcommand definitions for API calls from the command line.
 import logging
 
 from nominatim.tools.exec_utils import run_api_script
+from nominatim.errors import UsageError
 
 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
@@ -20,19 +21,19 @@ STRUCTURED_QUERY = (
 )
 
 EXTRADATA_PARAMS = (
-    ('addressdetails', 'Include a breakdown of the address into elements.'),
+    ('addressdetails', 'Include a breakdown of the address into elements'),
     ('extratags', ("Include additional information if available "
-                   "(e.g. wikipedia link, opening hours).")),
-    ('namedetails', 'Include a list of alternative names.')
+                   "(e.g. wikipedia link, opening hours)")),
+    ('namedetails', 'Include a list of alternative names')
 )
 
 DETAILS_SWITCHES = (
-    ('addressdetails', 'Include a breakdown of the address into elements.'),
-    ('keywords', 'Include a list of name keywords and address keywords.'),
-    ('linkedplaces', 'Include a details of places that are linked with this one.'),
-    ('hierarchy', 'Include details of places lower in the address hierarchy.'),
-    ('group_hierarchy', 'Group the places by type.'),
-    ('polygon_geojson', 'Include geometry of result.')
+    ('addressdetails', 'Include a breakdown of the address into elements'),
+    ('keywords', 'Include a list of name keywords and address keywords'),
+    ('linkedplaces', 'Include a details of places that are linked with this one'),
+    ('hierarchy', 'Include details of places lower in the address hierarchy'),
+    ('group_hierarchy', 'Group the places by type'),
+    ('polygon_geojson', 'Include geometry of result')
 )
 
 def _add_api_output_arguments(parser):
@@ -47,15 +48,32 @@ def _add_api_output_arguments(parser):
                        help='Preferred language order for presenting search results')
     group.add_argument('--polygon-output',
                        choices=['geojson', 'kml', 'svg', 'text'],
-                       help='Output geometry of results as a GeoJSON, KML, SVG or WKT.')
+                       help='Output geometry of results as a GeoJSON, KML, SVG or WKT')
     group.add_argument('--polygon-threshold', type=float, metavar='TOLERANCE',
                        help=("Simplify output geometry."
                              "Parameter is difference tolerance in degrees."))
 
 
+def _run_api(endpoint, args, params):
+    script_file = args.project_dir / 'website' / (endpoint + '.php')
+
+    if not script_file.exists():
+        LOG.error("Cannot find API script file.\n\n"
+                  "Make sure to run 'nominatim' from the project directory \n"
+                  "or use the option --project-dir.")
+        raise UsageError("API script not found.")
+
+    return run_api_script(endpoint, args.project_dir,
+                          phpcgi_bin=args.phpcgi_path, params=params)
+
+
 class APISearch:
     """\
-    Execute API search query.
+    Execute a search query.
 
+    This command works exactly the same as if calling the /search endpoint on
+    the web API. See the online documentation for more details on the
+    various parameters:
+    https://nominatim.org/release-docs/latest/api/Search/
     """
 
     @staticmethod
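The `_run_api` helper introduced above folds four copies of the same dispatch boilerplate into one function and fails early, with a usable error message, when the PHP endpoint script is missing from the project directory. A rough sketch of what a caller sees now (the empty project path and the stand-in namespace are invented for illustration):

    from pathlib import Path
    from types import SimpleNamespace

    args = SimpleNamespace(project_dir=Path('/tmp/empty-project'),
                           phpcgi_path=None)
    try:
        _run_api('status', args, {'format': 'json'})
    except UsageError:
        pass   # no website/status.php in the project directory yet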
@@ -70,7 +88,7 @@ class APISearch:
 
         group = parser.add_argument_group('Result limitation')
         group.add_argument('--countrycodes', metavar='CC,..',
-                           help='Limit search results to one or more countries.')
+                           help='Limit search results to one or more countries')
         group.add_argument('--exclude_place_ids', metavar='ID,..',
                            help='List of search object to be excluded')
         group.add_argument('--limit', type=int,
@@ -109,12 +127,16 @@ class APISearch:
         if not args.dedupe:
             params['dedupe'] = '0'
 
-        return run_api_script('search', args.project_dir,
-                              phpcgi_bin=args.phpcgi_path, params=params)
+        return _run_api('search', args, params)
 
 class APIReverse:
     """\
     Execute API reverse query.
+
+    This command works exactly the same as if calling the /reverse endpoint on
+    the web API. See the online documentation for more details on the
+    various parameters:
+    https://nominatim.org/release-docs/latest/api/Reverse/
     """
 
     @staticmethod
@@ -148,13 +170,17 @@ class APIReverse:
         if args.polygon_threshold:
             params['polygon_threshold'] = args.polygon_threshold
 
-        return run_api_script('reverse', args.project_dir,
-                              phpcgi_bin=args.phpcgi_path, params=params)
+        return _run_api('reverse', args, params)
 
 
 class APILookup:
     """\
     Execute API lookup query.
+
+    This command works exactly the same as if calling the /lookup endpoint on
+    the web API. See the online documentation for more details on the
+    various parameters:
+    https://nominatim.org/release-docs/latest/api/Lookup/
     """
 
     @staticmethod
@@ -183,13 +209,17 @@ class APILookup:
         if args.polygon_threshold:
             params['polygon_threshold'] = args.polygon_threshold
 
-        return run_api_script('lookup', args.project_dir,
-                              phpcgi_bin=args.phpcgi_path, params=params)
+        return _run_api('lookup', args, params)
 
 
 class APIDetails:
     """\
     Execute API details query.
+
+    This command works exactly the same as if calling the /details endpoint on
+    the web API. See the online documentation for more details on the
+    various parameters:
+    https://nominatim.org/release-docs/latest/api/Details/
     """
 
     @staticmethod
@@ -203,7 +233,7 @@ class APIDetails:
         objs.add_argument('--relation', '-r', type=int,
                           help="Look up the OSM relation with the given ID.")
         objs.add_argument('--place_id', '-p', type=int,
-                          help='Database internal identifier of the OSM object to look up.')
+                          help='Database internal identifier of the OSM object to look up')
         group.add_argument('--class', dest='object_class',
                            help=("Class type to disambiguated multiple entries "
                                  "of the same object."))
@@ -229,13 +259,17 @@ class APIDetails:
         for name, _ in DETAILS_SWITCHES:
             params[name] = '1' if getattr(args, name) else '0'
 
-        return run_api_script('details', args.project_dir,
-                              phpcgi_bin=args.phpcgi_path, params=params)
+        return _run_api('details', args, params)
 
 
 class APIStatus:
     """\
     Execute API status query.
+
+    This command works exactly the same as if calling the /status endpoint on
+    the web API. See the online documentation for more details on the
+    various parameters:
+    https://nominatim.org/release-docs/latest/api/Status/
     """
 
     @staticmethod
@@ -246,6 +280,4 @@ class APIStatus:
 
     @staticmethod
     def run(args):
-        return run_api_script('status', args.project_dir,
-                              phpcgi_bin=args.phpcgi_path,
-                              params=dict(format=args.format))
+        return _run_api('status', args, dict(format=args.format))
@@ -1,7 +1,12 @@
 """
 Provides custom functions over command-line arguments.
 """
+import logging
+from pathlib import Path
+
+from nominatim.errors import UsageError
+
+LOG = logging.getLogger()
 
 class NominatimArgs:
     """ Customized namespace class for the nominatim command line tool
@@ -18,10 +23,27 @@ class NominatimArgs:
                     osm2pgsql_style=self.config.get_import_style_file(),
                     threads=self.threads or default_threads,
                     dsn=self.config.get_libpq_dsn(),
-                    flatnode_file=self.config.FLATNODE_FILE,
+                    flatnode_file=str(self.config.get_path('FLATNODE_FILE')),
                     tablespaces=dict(slim_data=self.config.TABLESPACE_OSM_DATA,
                                      slim_index=self.config.TABLESPACE_OSM_INDEX,
                                      main_data=self.config.TABLESPACE_PLACE_DATA,
                                      main_index=self.config.TABLESPACE_PLACE_INDEX
                                     )
                    )
+
+
+    def get_osm_file_list(self):
+        """ Return the --osm-file argument as a list of Paths or None
+            if no argument was given. The function also checks if the files
+            exist and raises a UsageError if one cannot be found.
+        """
+        if not self.osm_file:
+            return None
+
+        files = [Path(f) for f in self.osm_file]
+        for fname in files:
+            if not fname.is_file():
+                LOG.fatal("OSM file '%s' does not exist.", fname)
+                raise UsageError('Cannot access file.')
+
+        return files
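`get_osm_file_list` moves the file-existence check out of `SetupAll.run` and returns ready-made Path objects, which also makes the repeatable --osm-file option easy to consume. A sketch of the behaviour, with a namespace object standing in for the argparse result:

    from pathlib import Path
    from types import SimpleNamespace

    args = SimpleNamespace(osm_file=['europe.osm.pbf', 'extra.osm.pbf'])
    files = NominatimArgs.get_osm_file_list(args)
    # -> [Path('europe.osm.pbf'), Path('extra.osm.pbf')] if both files exist;
    #    raises UsageError as soon as one is missing;
    #    returns None when --osm-file was not given at all.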
@@ -31,6 +31,6 @@ class SetupFreeze:
 
         with connect(args.config.get_libpq_dsn()) as conn:
             freeze.drop_update_tables(conn)
-        freeze.drop_flatnode_file(args.config.FLATNODE_FILE)
+        freeze.drop_flatnode_file(str(args.config.get_path('FLATNODE_FILE')))
 
         return 0
@@ -15,6 +15,11 @@ from nominatim.db.connection import connect
 class UpdateIndex:
     """\
     Reindex all new and modified data.
+
+    Indexing is the process of computing the address and search terms for
+    the places in the database. Every time data is added or changed, indexing
+    needs to be run. Imports and replication updates automatically take care
+    of indexing. For other cases, this function allows to run indexing manually.
     """
 
     @staticmethod
@@ -17,7 +17,13 @@ class UpdateRefresh:
     """\
     Recompute auxiliary data used by the indexing process.
 
-    These functions must not be run in parallel with other update commands.
+    This sub-commands updates various static data and functions in the database.
+    It usually needs to be run after changing various aspects of the
+    configuration. The configuration documentation will mention the exact
+    command to use in such case.
+
+    Warning: the 'update' command must not be run in parallel with other update
+    commands like 'replication' or 'add-data'.
     """
     def __init__(self):
         self.tokenizer = None
@@ -34,7 +40,7 @@ class UpdateRefresh:
         group.add_argument('--functions', action='store_true',
                            help='Update the PL/pgSQL functions in the database')
         group.add_argument('--wiki-data', action='store_true',
-                           help='Update Wikipedia/data importance numbers.')
+                           help='Update Wikipedia/data importance numbers')
         group.add_argument('--importance', action='store_true',
                            help='Recompute place importances (expensive!)')
         group.add_argument('--website', action='store_true',
@@ -65,14 +71,13 @@ class UpdateRefresh:
                          "Postcode updates on a frozen database is not possible.")
 
         if args.word_counts:
-            LOG.warning('Recompute frequency of full-word search terms')
-            refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir)
+            LOG.warning('Recompute word statistics')
+            self._get_tokenizer(args.config).update_statistics()
 
         if args.address_levels:
-            cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
-            LOG.warning('Updating address levels from %s', cfg)
+            LOG.warning('Updating address levels')
             with connect(args.config.get_libpq_dsn()) as conn:
-                refresh.load_address_levels_from_file(conn, cfg)
+                refresh.load_address_levels_from_config(conn, args.config)
 
         if args.functions:
             LOG.warning('Create functions')
@@ -20,6 +20,19 @@ LOG = logging.getLogger()
 class UpdateReplication:
     """\
     Update the database using an online replication service.
+
+    An OSM replication service is an online service that provides regular
+    updates (OSM diff files) for the planet or regions of it. The OSMF
+    provides the primary replication service for the full planet at
+    https://planet.osm.org/replication/ but there are other providers of
+    extracts of OSM data who provide such a service as well.
+
+    This sub-command allows to set up such a replication service and download
+    and import updates at regular intervals. You need to call '--init' once to
+    set up the process or whenever you change the replication configuration
+    parameters. Without any arguments, the sub-command will go into a loop and
+    continuously apply updates as they become available. Giving `--once` just
+    downloads and imports the next batch of updates.
     """
 
     @staticmethod
@@ -29,22 +42,25 @@ class UpdateReplication:
                            help='Initialise the update process')
         group.add_argument('--no-update-functions', dest='update_functions',
                            action='store_false',
-                           help=("Do not update the trigger function to "
-                                 "support differential updates."))
+                           help="Do not update the trigger function to "
+                                "support differential updates (EXPERT)")
         group = parser.add_argument_group('Arguments for updates')
         group.add_argument('--check-for-updates', action='store_true',
                            help='Check if new updates are available and exit')
         group.add_argument('--once', action='store_true',
-                           help=("Download and apply updates only once. When "
-                                 "not set, updates are continuously applied"))
+                           help="Download and apply updates only once. When "
+                                "not set, updates are continuously applied")
+        group.add_argument('--catch-up', action='store_true',
+                           help="Download and apply updates until no new "
+                                "data is available on the server")
         group.add_argument('--no-index', action='store_false', dest='do_index',
-                           help=("Do not index the new data. Only applicable "
+                           help=("Do not index the new data. Only usable "
                                  "together with --once"))
         group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
                            help='Size of cache to be used by osm2pgsql (in MB)')
         group = parser.add_argument_group('Download parameters')
         group.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
-                           help='Set timeout for file downloads.')
+                           help='Set timeout for file downloads')
 
     @staticmethod
     def _init_replication(args):
@@ -79,28 +95,40 @@ class UpdateReplication:
                     round_time(end - start_import),
                     round_time(end - batchdate))
 
+
+    @staticmethod
+    def _compute_update_interval(args):
+        if args.catch_up:
+            return 0
+
+        update_interval = args.config.get_int('REPLICATION_UPDATE_INTERVAL')
+        # Sanity check to not overwhelm the Geofabrik servers.
+        if 'download.geofabrik.de' in args.config.REPLICATION_URL\
+           and update_interval < 86400:
+            LOG.fatal("Update interval too low for download.geofabrik.de.\n"
+                      "Please check install documentation "
+                      "(https://nominatim.org/release-docs/latest/admin/Import-and-Update#"
+                      "setting-up-the-update-process).")
+            raise UsageError("Invalid replication update interval setting.")
+
+        return update_interval
+
+
     @staticmethod
     def _update(args):
         from ..tools import replication
         from ..indexer.indexer import Indexer
         from ..tokenizer import factory as tokenizer_factory
 
+        update_interval = UpdateReplication._compute_update_interval(args)
+
         params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
         params.update(base_url=args.config.REPLICATION_URL,
-                      update_interval=args.config.get_int('REPLICATION_UPDATE_INTERVAL'),
+                      update_interval=update_interval,
                       import_file=args.project_dir / 'osmosischange.osc',
                       max_diff_size=args.config.get_int('REPLICATION_MAX_DIFF'),
                       indexed_only=not args.once)
 
-        # Sanity check to not overwhelm the Geofabrik servers.
-        if 'download.geofabrik.de' in params['base_url']\
-           and params['update_interval'] < 86400:
-            LOG.fatal("Update interval too low for download.geofabrik.de.\n"
-                      "Please check install documentation "
-                      "(https://nominatim.org/release-docs/latest/admin/Import-and-Update#"
-                      "setting-up-the-update-process).")
-            raise UsageError("Invalid replication update interval setting.")
-
         if not args.once:
             if not args.do_index:
                 LOG.fatal("Indexing cannot be disabled when running updates continuously.")
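The extracted `_compute_update_interval` keeps the Geofabrik guard but adds a short circuit for catch-up mode. A stand-alone restatement of the logic, for illustration only:

    def compute_update_interval(url, interval, catch_up):
        # Mirror of the new helper's behaviour with plain arguments.
        if catch_up:
            return 0          # poll immediately until the server runs dry
        if 'download.geofabrik.de' in url and interval < 86400:
            # Geofabrik publishes daily diffs; 86400 s is 24 hours.
            raise ValueError('update interval too low for Geofabrik')
        return interval

    assert compute_update_interval('https://planet.osm.org/replication/minute',
                                   75, False) == 75
    assert compute_update_interval('https://example.com/updates',
                                   3600, True) == 0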
@@ -108,6 +136,7 @@ class UpdateReplication:
         recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL')
 
         tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+        indexer = Indexer(args.config.get_libpq_dsn(), tokenizer, args.threads or 1)
 
         while True:
             with connect(args.config.get_libpq_dsn()) as conn:
@@ -120,10 +149,7 @@ class UpdateReplication:
 
             if state is not replication.UpdateState.NO_CHANGES and args.do_index:
                 index_start = dt.datetime.now(dt.timezone.utc)
-                indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
-                                  args.threads or 1)
-                indexer.index_boundaries(0, 30)
-                indexer.index_by_rank(0, 30)
+                indexer.index_full(analyse=False)
 
                 with connect(args.config.get_libpq_dsn()) as conn:
                     status.set_indexed(conn, True)
@@ -132,10 +158,15 @@ class UpdateReplication:
             else:
                 index_start = None
 
+            if state is replication.UpdateState.NO_CHANGES and \
+               args.catch_up or update_interval > 40*60:
+                while indexer.has_pending():
+                    indexer.index_full(analyse=False)
+
             if LOG.isEnabledFor(logging.WARNING):
                 UpdateReplication._report_update(batchdate, start, index_start)
 
-            if args.once:
+            if args.once or (args.catch_up and state is replication.UpdateState.NO_CHANGES):
                 break
 
             if state is replication.UpdateState.NO_CHANGES:
@@ -9,7 +9,6 @@ import psutil
 from nominatim.db.connection import connect
 from nominatim.db import status, properties
 from nominatim.version import NOMINATIM_VERSION
-from nominatim.errors import UsageError
 
 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
@@ -21,14 +20,19 @@ LOG = logging.getLogger()
 class SetupAll:
     """\
     Create a new Nominatim database from an OSM file.
+
+    This sub-command sets up a new Nominatim database from scratch starting
+    with creating a new database in Postgresql. The user running this command
+    needs superuser rights on the database.
     """
 
     @staticmethod
     def add_args(parser):
         group_name = parser.add_argument_group('Required arguments')
         group = group_name.add_mutually_exclusive_group(required=True)
-        group.add_argument('--osm-file', metavar='FILE',
-                           help='OSM file to be imported.')
+        group.add_argument('--osm-file', metavar='FILE', action='append',
+                           help='OSM file to be imported'
+                                ' (repeat for importing multiple files)')
         group.add_argument('--continue', dest='continue_at',
                            choices=['load-data', 'indexing', 'db-postprocess'],
                            help='Continue an import that was interrupted')
@@ -47,46 +51,35 @@
         group.add_argument('--ignore-errors', action='store_true',
                            help='Continue import even when errors in SQL are present')
         group.add_argument('--index-noanalyse', action='store_true',
-                           help='Do not perform analyse operations during index')
+                           help='Do not perform analyse operations during index (expert only)')
 
 
     @staticmethod
-    def run(args): # pylint: disable=too-many-statements
-        from ..tools import database_import, refresh, postcodes, freeze
+    def run(args):
+        from ..tools import database_import, refresh, postcodes, freeze, country_info
         from ..indexer.indexer import Indexer
-        from ..tokenizer import factory as tokenizer_factory
 
-        if args.osm_file and not Path(args.osm_file).is_file():
-            LOG.fatal("OSM file '%s' does not exist.", args.osm_file)
-            raise UsageError('Cannot access file.')
+        country_info.setup_country_config(args.config)
 
         if args.continue_at is None:
+            files = args.get_osm_file_list()
+
+            LOG.warning('Creating database')
             database_import.setup_database_skeleton(args.config.get_libpq_dsn(),
-                                                    args.data_dir,
-                                                    args.no_partitions,
                                                     rouser=args.config.DATABASE_WEBUSER)
+
+            LOG.warning('Setting up country tables')
+            country_info.setup_country_tables(args.config.get_libpq_dsn(),
+                                              args.data_dir,
+                                              args.no_partitions)
+
             LOG.warning('Importing OSM data file')
-            database_import.import_osm_data(Path(args.osm_file),
+            database_import.import_osm_data(files,
                                             args.osm2pgsql_options(0, 1),
                                             drop=args.no_updates,
                                             ignore_errors=args.ignore_errors)
 
-            with connect(args.config.get_libpq_dsn()) as conn:
-                LOG.warning('Create functions (1st pass)')
-                refresh.create_functions(conn, args.config, False, False)
-                LOG.warning('Create tables')
-                database_import.create_tables(conn, args.config,
-                                              reverse_only=args.reverse_only)
-                refresh.load_address_levels_from_file(conn, Path(args.config.ADDRESS_LEVEL_CONFIG))
-                LOG.warning('Create functions (2nd pass)')
-                refresh.create_functions(conn, args.config, False, False)
-                LOG.warning('Create table triggers')
-                database_import.create_table_triggers(conn, args.config)
-                LOG.warning('Create partition tables')
-                database_import.create_partition_tables(conn, args.config)
-                LOG.warning('Create functions (3rd pass)')
-                refresh.create_functions(conn, args.config, False, False)
+            SetupAll._setup_tables(args.config, args.reverse_only)
 
             LOG.warning('Importing wikipedia importance data')
             data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir)
@@ -105,12 +98,7 @@
                                     args.threads or psutil.cpu_count() or 1)
 
         LOG.warning("Setting up tokenizer")
-        if args.continue_at is None or args.continue_at == 'load-data':
-            # (re)initialise the tokenizer data
-            tokenizer = tokenizer_factory.create_tokenizer(args.config)
-        else:
-            # just load the tokenizer
-            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+        tokenizer = SetupAll._get_tokenizer(args.continue_at, args.config)
 
         if args.continue_at is None or args.continue_at == 'load-data':
             LOG.warning('Calculate postcodes')
@@ -131,33 +119,60 @@
                 database_import.create_search_indices(conn, args.config,
                                                       drop=args.no_updates)
                 LOG.warning('Create search index for default country names.')
-                database_import.create_country_names(conn, tokenizer,
-                                                     args.config.LANGUAGES)
-                conn.commit()
+                country_info.create_country_names(conn, tokenizer,
+                                                  args.config.LANGUAGES)
             if args.no_updates:
                 freeze.drop_update_tables(conn)
         tokenizer.finalize_import(args.config)
 
+        LOG.warning('Recompute word counts')
+        tokenizer.update_statistics()
+
         webdir = args.project_dir / 'website'
         LOG.warning('Setup website at %s', webdir)
         with connect(args.config.get_libpq_dsn()) as conn:
             refresh.setup_website(webdir, args.config, conn)
 
-        with connect(args.config.get_libpq_dsn()) as conn:
-            try:
-                dbdate = status.compute_database_date(conn)
-                status.set_status(conn, dbdate)
-                LOG.info('Database is at %s.', dbdate)
-            except Exception as exc: # pylint: disable=broad-except
-                LOG.error('Cannot determine date of database: %s', exc)
-
-            properties.set_property(conn, 'database_version',
-                                    '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))
+        SetupAll._set_database_date(args.config.get_libpq_dsn())
 
         return 0
+
+
+    @staticmethod
+    def _setup_tables(config, reverse_only):
+        """ Set up the basic database layout: tables, indexes and functions.
+        """
+        from ..tools import database_import, refresh
+
+        with connect(config.get_libpq_dsn()) as conn:
+            LOG.warning('Create functions (1st pass)')
+            refresh.create_functions(conn, config, False, False)
+            LOG.warning('Create tables')
+            database_import.create_tables(conn, config, reverse_only=reverse_only)
+            refresh.load_address_levels_from_config(conn, config)
+            LOG.warning('Create functions (2nd pass)')
+            refresh.create_functions(conn, config, False, False)
+            LOG.warning('Create table triggers')
+            database_import.create_table_triggers(conn, config)
+            LOG.warning('Create partition tables')
+            database_import.create_partition_tables(conn, config)
+            LOG.warning('Create functions (3rd pass)')
+            refresh.create_functions(conn, config, False, False)
+
+
+    @staticmethod
+    def _get_tokenizer(continue_at, config):
+        """ Set up a new tokenizer or load an already initialised one.
+        """
+        from ..tokenizer import factory as tokenizer_factory
+
+        if continue_at is None or continue_at == 'load-data':
+            # (re)initialise the tokenizer data
+            return tokenizer_factory.create_tokenizer(config)
+
+        # just load the tokenizer
+        return tokenizer_factory.get_tokenizer_for_db(config)
+
     @staticmethod
     def _create_pending_index(conn, tablespace):
         """ Add a supporting index for finding places still to be indexed.
@@ -178,3 +193,19 @@
                            {} WHERE indexed_status > 0
                         """.format(tablespace))
         conn.commit()
+
+
+    @staticmethod
+    def _set_database_date(dsn):
+        """ Determine the database date and set the status accordingly.
+        """
+        with connect(dsn) as conn:
+            try:
+                dbdate = status.compute_database_date(conn)
+                status.set_status(conn, dbdate)
+                LOG.info('Database is at %s.', dbdate)
+            except Exception as exc: # pylint: disable=broad-except
+                LOG.error('Cannot determine date of database: %s', exc)
+
+            properties.set_property(conn, 'database_version',
+                                    '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))
@@ -19,16 +19,42 @@ LOG = logging.getLogger()
 class ImportSpecialPhrases:
     """\
     Import special phrases.
+
+    Special phrases are search terms that narrow down the type of object
+    that should be searched. For example, you might want to search for
+    'Hotels in Barcelona'. The OSM wiki has a selection of special phrases
+    in many languages, which can be imported with this command.
+
+    You can also provide your own phrases in a CSV file. The file needs to have
+    the following five columns:
+    * phrase - the term expected for searching
+    * class - the OSM tag key of the object type
+    * type - the OSM tag value of the object type
+    * operator - the kind of search to be done (one of: in, near, name, -)
+    * plural - whether the term is a plural or not (Y/N)
+
+    An example file can be found in the Nominatim sources at
+    'test/testdb/full_en_phrases_test.csv'.
+
+    The import can be further configured to ignore specific key/value pairs.
+    This is particularly useful when importing phrases from the wiki. The
+    default configuration excludes some very common tags like building=yes.
+    The configuration can be customized by putting a file `phrase-settings.json`
+    with custom rules into the project directory or by using the `--config`
+    option to point to another configuration file.
     """
     @staticmethod
     def add_args(parser):
         group = parser.add_argument_group('Input arguments')
         group.add_argument('--import-from-wiki', action='store_true',
-                           help='Import special phrases from the OSM wiki to the database.')
+                           help='Import special phrases from the OSM wiki to the database')
         group.add_argument('--import-from-csv', metavar='FILE',
-                           help='Import special phrases from a CSV file.')
+                           help='Import special phrases from a CSV file')
         group.add_argument('--no-replace', action='store_true',
-                           help='Keep the old phrases and only add the new ones.')
+                           help='Keep the old phrases and only add the new ones')
+        group.add_argument('--config', action='store',
+                           help='Configuration file for black/white listing '
+                                '(default: phrase-settings.json)')
 
     @staticmethod
     def run(args):
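Rendered as an actual file, the five-column layout described in the docstring might look like this (a hypothetical sample, not the contents of the test file mentioned above):

    phrase,class,type,operator,plural
    Hotel,tourism,hotel,-,N
    Hotels,tourism,hotel,-,Y
    Hotels in,tourism,hotel,in,Y
    Pharmacy near,amenity,pharmacy,near,N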
@@ -56,5 +82,5 @@ class ImportSpecialPhrases:
         should_replace = not args.no_replace
         with connect(args.config.get_libpq_dsn()) as db_connection:
             SPImporter(
-                args.config, args.phplib_dir, db_connection, loader
+                args.config, db_connection, loader
             ).import_phrases(tokenizer, should_replace)
@@ -4,6 +4,8 @@ Nominatim configuration accessor.
 import logging
 import os
 from pathlib import Path
+import json
+import yaml
 
 from dotenv import dotenv_values
 
@@ -11,6 +13,27 @@ from nominatim.errors import UsageError
 
 LOG = logging.getLogger()
 
+
+def flatten_config_list(content, section=''):
+    """ Flatten YAML configuration lists that contain include sections
+        which are lists themselves.
+    """
+    if not content:
+        return []
+
+    if not isinstance(content, list):
+        raise UsageError(f"List expected in section '{section}'.")
+
+    output = []
+    for ele in content:
+        if isinstance(ele, list):
+            output.extend(flatten_config_list(ele, section))
+        else:
+            output.append(ele)
+
+    return output
+
+
 class Configuration:
     """ Load and manage the project configuration.
 
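A minimal sketch of what `flatten_config_list` normalises: YAML '!include' directives splice whole files in as nested lists, which are flattened back into one flat sequence:

    nested = [{'country': 'de'}, [{'country': 'fr'}, [{'country': 'be'}]]]
    assert flatten_config_list(nested) == [
        {'country': 'de'}, {'country': 'fr'}, {'country': 'be'}]

    flatten_config_list(None)                # returns []
    flatten_config_list({'not': 'a list'})   # raises UsageError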
@@ -33,12 +56,6 @@ class Configuration:
         if project_dir is not None and (project_dir / '.env').is_file():
             self._config.update(dotenv_values(str((project_dir / '.env').resolve())))
 
-        # Add defaults for variables that are left empty to set the default.
-        # They may still be overwritten by environment variables.
-        if not self._config['NOMINATIM_ADDRESS_LEVEL_CONFIG']:
-            self._config['NOMINATIM_ADDRESS_LEVEL_CONFIG'] = \
-                str(config_dir / 'address-levels.json')
-
         class _LibDirs:
             pass
 
@@ -53,7 +70,10 @@ class Configuration:
     def __getattr__(self, name):
         name = 'NOMINATIM_' + name
 
-        return self.environ.get(name) or self._config[name]
+        if name in self.environ:
+            return self.environ[name]
+
+        return self._config[name]
 
     def get_bool(self, name):
         """ Return the given configuration parameter as a boolean.
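The rewritten `__getattr__` changes behaviour for variables that are present in the environment but empty: the old `or` chain treated an empty string as unset and fell through to the file-based configuration, while the membership test lets the environment always win. A sketch of the difference:

    environ = {'NOMINATIM_FLATNODE_FILE': ''}
    config = {'NOMINATIM_FLATNODE_FILE': '/srv/flatnodes'}
    name = 'NOMINATIM_FLATNODE_FILE'

    old = environ.get(name) or config[name]                   # '/srv/flatnodes'
    new = environ[name] if name in environ else config[name]  # '' - env wins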
@@ -73,6 +93,23 @@ class Configuration:
             raise UsageError("Configuration error.") from exp
 
 
+    def get_path(self, name):
+        """ Return the given configuration parameter as a Path.
+            If a relative path is configured, then the function converts this
+            into an absolute path with the project directory as root path.
+            If the configuration is unset, a falsy value is returned.
+        """
+        value = self.__getattr__(name)
+        if value:
+            value = Path(value)
+
+            if not value.is_absolute():
+                value = self.project_dir / value
+
+            value = value.resolve()
+
+        return value
+
     def get_libpq_dsn(self):
         """ Get configured database DSN converted into the key/value format
             understood by libpq and psycopg.
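A sketch of the `get_path` semantics, assuming a project directory of /srv/project:

    # NOMINATIM_FLATNODE_FILE unset          -> falsy value (feature disabled)
    # NOMINATIM_FLATNODE_FILE=flatnodes      -> Path('/srv/project/flatnodes')
    # NOMINATIM_FLATNODE_FILE=/var/fn.bin    -> Path('/var/fn.bin')
    flatnode_file = config.get_path('FLATNODE_FILE')
    if flatnode_file:
        print('flat node file at', flatnode_file)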
@@ -103,7 +140,7 @@ class Configuration:
         if style in ('admin', 'street', 'address', 'full', 'extratags'):
             return self.config_dir / 'import-{}.style'.format(style)
 
-        return Path(style)
+        return self.find_config_file('', 'IMPORT_STYLE')
 
 
     def get_os_env(self):
@@ -114,3 +151,98 @@ class Configuration:
         env.update(self.environ)
 
         return env
+
+
+    def load_sub_configuration(self, filename, config=None):
+        """ Load additional configuration from a file. `filename` is the name
+            of the configuration file. The file is first searched in the
+            project directory and then in the global settings dirctory.
+
+            If `config` is set, then the name of the configuration file can
+            be additionally given through a .env configuration option. When
+            the option is set, then the file will be exclusively loaded as set:
+            if the name is an absolute path, the file name is taken as is,
+            if the name is relative, it is taken to be relative to the
+            project directory.
+
+            The format of the file is determined from the filename suffix.
+            Currently only files with extension '.yaml' are supported.
+
+            YAML files support a special '!include' construct. When the
+            directive is given, the value is taken to be a filename, the file
+            is loaded using this function and added at the position in the
+            configuration tree.
+        """
+        configfile = self.find_config_file(filename, config)
+
+        if configfile.suffix in ('.yaml', '.yml'):
+            return self._load_from_yaml(configfile)
+
+        if configfile.suffix == '.json':
+            with configfile.open('r') as cfg:
+                return json.load(cfg)
+
+        raise UsageError(f"Config file '{configfile}' has unknown format.")
+
+
+    def find_config_file(self, filename, config=None):
+        """ Resolve the location of a configuration file given a filename and
+            an optional configuration option with the file name.
+            Raises a UsageError when the file cannot be found or is not
+            a regular file.
+        """
+        if config is not None:
+            cfg_filename = self.__getattr__(config)
+            if cfg_filename:
+                cfg_filename = Path(cfg_filename)
+
+                if cfg_filename.is_absolute():
+                    cfg_filename = cfg_filename.resolve()
+
+                    if not cfg_filename.is_file():
+                        LOG.fatal("Cannot find config file '%s'.", cfg_filename)
+                        raise UsageError("Config file not found.")
+
+                    return cfg_filename
+
+                filename = cfg_filename
+
+
+        search_paths = [self.project_dir, self.config_dir]
+        for path in search_paths:
+            if path is not None and (path / filename).is_file():
+                return path / filename
+
+        LOG.fatal("Configuration file '%s' not found.\nDirectories searched: %s",
+                  filename, search_paths)
+        raise UsageError("Config file not found.")
+
+
+    def _load_from_yaml(self, cfgfile):
+        """ Load a YAML configuration file. This installs a special handler that
+            allows to include other YAML files using the '!include' operator.
+        """
+        yaml.add_constructor('!include', self._yaml_include_representer,
+                             Loader=yaml.SafeLoader)
+        return yaml.safe_load(cfgfile.read_text(encoding='utf-8'))
+
+
+    def _yaml_include_representer(self, loader, node):
+        """ Handler for the '!include' operator in YAML files.
+
+            When the filename is relative, then the file is first searched in the
+            project directory and then in the global settings dirctory.
+        """
+        fname = loader.construct_scalar(node)
+
+        if Path(fname).is_absolute():
+            configfile = Path(fname)
+        else:
+            configfile = self.find_config_file(loader.construct_scalar(node))
+
+        if configfile.suffix != '.yaml':
+            LOG.fatal("Format error while reading '%s': only YAML format supported.",
+                      configfile)
+            raise UsageError("Cannot handle config file format.")
+
+        return yaml.safe_load(configfile.read_text(encoding='utf-8'))
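A hypothetical pair of files illustrating the '!include' construct the loader now understands (both file names are invented):

    # country-settings.yaml
    languages: !include language-defaults.yaml

    # language-defaults.yaml - looked up in the project directory first,
    # then in the global settings directory
    - en
    - de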
@@ -36,7 +36,7 @@ def _setup_tablespace_sql(config):
         tspace = getattr(config, 'TABLESPACE_{}_{}'.format(subset, kind))
         if tspace:
             tspace = 'TABLESPACE "{}"'.format(tspace)
-            out['{}_{}'.format(subset.lower, kind.lower())] = tspace
+            out['{}_{}'.format(subset.lower(), kind.lower())] = tspace
 
     return out
 
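The one-character fix above is easy to miss: `subset.lower` without parentheses is a bound-method object, so the old code built a key containing the method's repr instead of the lower-cased name. A sketch of the failure mode:

    subset, kind = 'OSM', 'DATA'
    bad = '{}_{}'.format(subset.lower, kind.lower())
    # -> "<built-in method lower of str object at 0x...>_data"
    good = '{}_{}'.format(subset.lower(), kind.lower())
    # -> "osm_data"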
@@ -46,8 +46,10 @@ def _setup_postgresql_features(conn):
         depend on the database version.
     """
     pg_version = conn.server_version_tuple()
+    postgis_version = conn.postgis_version_tuple()
     return {
-        'has_index_non_key_column': pg_version >= (11, 0, 0)
+        'has_index_non_key_column': pg_version >= (11, 0, 0),
+        'spgist_geom' : 'SPGIST' if postgis_version >= (3, 0) else 'GIST'
     }
 
 class SQLPreprocessor:
@@ -91,6 +91,17 @@ class Indexer:
         self.num_threads = num_threads
 
 
+    def has_pending(self):
+        """ Check if any data still needs indexing.
+            This function must only be used after the import has finished.
+            Otherwise it will be very expensive.
+        """
+        with connect(self.dsn) as conn:
+            with conn.cursor() as cur:
+                cur.execute("SELECT 'a' FROM placex WHERE indexed_status > 0 LIMIT 1")
+                return cur.rowcount > 0
+
+
     def index_full(self, analyse=True):
         """ Index the complete database. This will first index boundaries
             followed by all other objects. When `analyse` is True, then the
nominatim/indexer/place_info.py (new file, 68 lines)
@@ -0,0 +1,68 @@
+"""
+Wrapper around place information the indexer gets from the database and hands to
+the tokenizer.
+"""
+
+import psycopg2.extras
+
+class PlaceInfo:
+    """ Data class containing all information the tokenizer gets about a
+        place it should process the names for.
+    """
+
+    def __init__(self, info):
+        self._info = info
+
+
+    def analyze(self, analyzer):
+        """ Process this place with the given tokenizer and return the
+            result in psycopg2-compatible Json.
+        """
+        return psycopg2.extras.Json(analyzer.process_place(self))
+
+
+    @property
+    def name(self):
+        """ A dictionary with the names of the place or None if the place
+            has no names.
+        """
+        return self._info.get('name')
+
+
+    @property
+    def address(self):
+        """ A dictionary with the address elements of the place
+            or None if no address information is available.
+        """
+        return self._info.get('address')
+
+
+    @property
+    def country_code(self):
+        """ The country code of the country the place is in. Guaranteed
+            to be a two-letter lower-case string or None, if no country
+            could be found.
+        """
+        return self._info.get('country_code')
+
+
+    @property
+    def rank_address(self):
+        """ The computed rank address before rank correction.
+        """
+        return self._info.get('rank_address')
+
+
+    def is_a(self, key, value):
+        """ Check if the place's primary tag corresponds to the given
+            key and value.
+        """
+        return self._info.get('class') == key and self._info.get('type') == value
+
+
+    def is_country(self):
+        """ Check if the place is a valid country boundary.
+        """
+        return self.rank_address == 4 \
+               and self.is_a('boundary', 'administrative') \
+               and self.country_code is not None
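A sketch of how the new wrapper travels between indexer and tokenizer; the dictionary keys mirror what the prepare function returns, and the analyzer below is a stand-in:

    info = PlaceInfo({'name': {'name': 'Monaco'},
                      'class': 'boundary', 'type': 'administrative',
                      'rank_address': 4, 'country_code': 'mc'})

    assert info.is_country()
    assert info.name == {'name': 'Monaco'}
    assert info.address is None            # absent key -> None

    class EchoAnalyzer:                    # stand-in for a tokenizer analyzer
        def process_place(self, place):
            return {'names': place.name}

    payload = info.analyze(EchoAnalyzer())  # psycopg2-compatible Json wrapper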
nominatim/indexer/runners.py

```diff
@@ -4,18 +4,21 @@ tasks.
 """
 import functools
 
-import psycopg2.extras
 from psycopg2 import sql as pysql
 
+from nominatim.indexer.place_info import PlaceInfo
+
 # pylint: disable=C0111
 
 def _mk_valuelist(template, num):
     return pysql.SQL(',').join([pysql.SQL(template)] * num)
 
 
 class AbstractPlacexRunner:
     """ Returns SQL commands for indexing of the placex table.
     """
     SELECT_SQL = pysql.SQL('SELECT place_id FROM placex ')
+    UPDATE_LINE = "(%s, %s::hstore, %s::hstore, %s::int, %s::jsonb)"
 
     def __init__(self, rank, analyzer):
         self.rank = rank
@@ -27,15 +30,16 @@ class AbstractPlacexRunner:
     def _index_sql(num_places):
         return pysql.SQL(
             """ UPDATE placex
-                SET indexed_status = 0, address = v.addr, token_info = v.ti
-                FROM (VALUES {}) as v(id, addr, ti)
+                SET indexed_status = 0, address = v.addr, token_info = v.ti,
+                    name = v.name, linked_place_id = v.linked_place_id
+                FROM (VALUES {}) as v(id, name, addr, linked_place_id, ti)
                 WHERE place_id = v.id
-            """).format(_mk_valuelist("(%s, %s::hstore, %s::jsonb)", num_places))
+            """).format(_mk_valuelist(AbstractPlacexRunner.UPDATE_LINE, num_places))
 
 
     @staticmethod
     def get_place_details(worker, ids):
-        worker.perform("""SELECT place_id, (placex_prepare_update(placex)).*
+        worker.perform("""SELECT place_id, (placex_indexing_prepare(placex)).*
                           FROM placex WHERE place_id IN %s""",
                        (tuple((p[0] for p in ids)), ))
 
@@ -43,8 +47,9 @@ class AbstractPlacexRunner:
     def index_places(self, worker, places):
         values = []
         for place in places:
-            values.extend((place[x] for x in ('place_id', 'address')))
-            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+            for field in ('place_id', 'name', 'address', 'linked_place_id'):
+                values.append(place[field])
+            values.append(PlaceInfo(place).analyze(self.analyzer))
 
         worker.perform(self._index_sql(len(places)), values)
 
@@ -138,7 +143,7 @@ class InterpolationRunner:
         values = []
         for place in places:
             values.extend((place[x] for x in ('place_id', 'address')))
-            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+            values.append(PlaceInfo(place).analyze(self.analyzer))
 
         worker.perform(self._index_sql(len(places)), values)
```
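The reworked `index_places` appends four columns plus the analyzer JSON per place because `_index_sql` repeats `UPDATE_LINE` once per place, and the flat `values` list has to line up with the repeated placeholders. A small standalone sketch of that correspondence (needs only psycopg2):

```python
from psycopg2 import sql as pysql

UPDATE_LINE = "(%s, %s::hstore, %s::hstore, %s::int, %s::jsonb)"

def _mk_valuelist(template, num):
    # One copy of the placeholder template per place, joined with commas.
    return pysql.SQL(',').join([pysql.SQL(template)] * num)

values_clause = _mk_valuelist(UPDATE_LINE, 3)
# values_clause now represents "(...),(...),(...)" with 3 x 5 = 15 %s
# placeholders, so the parameter list must hold 15 entries in the fixed
# order (place_id, name, address, linked_place_id, token_info) per place.
```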
nominatim/tokenizer/base.py (new file, 232 lines)

```diff
@@ -0,0 +1,232 @@
+"""
+Abstract class definitions for tokenizers. These base classes are here
+mainly for documentation purposes.
+"""
+from abc import ABC, abstractmethod
+from typing import List, Tuple, Dict, Any
+
+from nominatim.config import Configuration
+from nominatim.indexer.place_info import PlaceInfo
+
+# pylint: disable=unnecessary-pass
+
+class AbstractAnalyzer(ABC):
+    """ The analyzer provides the functions for analysing names and building
+        the token database.
+
+        Analyzers are instantiated on a per-thread base. Access to global data
+        structures must be synchronised accordingly.
+    """
+
+    def __enter__(self) -> 'AbstractAnalyzer':
+        return self
+
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self.close()
+
+
+    @abstractmethod
+    def close(self) -> None:
+        """ Free all resources used by the analyzer.
+        """
+        pass
+
+
+    @abstractmethod
+    def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
+        """ Return token information for the given list of words.
+
+            The function is used for testing and debugging only
+            and does not need to be particularly efficient.
+
+            Arguments:
+                words: A list of words to look up the tokens for.
+                       If a word starts with # it is assumed to be a full name,
+                       otherwise a partial term.
+
+            Returns:
+                The function returns the list of all tuples that could be
+                found for the given words. Each list entry is a tuple of
+                (original word, word token, word id).
+        """
+        pass
+
+
+    @abstractmethod
+    def normalize_postcode(self, postcode: str) -> str:
+        """ Convert the postcode to its standardized form.
+
+            This function must yield exactly the same result as the SQL function
+            `token_normalized_postcode()`.
+
+            Arguments:
+                postcode: The postcode to be normalized.
+
+            Returns:
+                The given postcode after normalization.
+        """
+        pass
+
+
+    @abstractmethod
+    def update_postcodes_from_db(self) -> None:
+        """ Update the tokenizer's postcode tokens from the current content
+            of the `location_postcode` table.
+        """
+        pass
+
+
+    @abstractmethod
+    def update_special_phrases(self, phrases: List[Tuple[str, str, str, str]],
+                               should_replace: bool) -> None:
+        """ Update the tokenizer's special phrase tokens from the given
+            list of special phrases.
+
+            Arguments:
+                phrases: The new list of special phrases. Each entry is
+                         a tuple of (phrase, class, type, operator).
+                should_replace: If true, replace the current list of phrases.
+                                When false, just add the given phrases to the
+                                ones that already exist.
+        """
+        pass
+
+
+    @abstractmethod
+    def add_country_names(self, country_code: str, names: Dict[str, str]):
+        """ Add the given names to the tokenizer's list of country tokens.
+
+            Arguments:
+                country_code: two-letter country code for the country the names
+                              refer to.
+                names: Dictionary of name type to name.
+        """
+        pass
+
+
+    @abstractmethod
+    def process_place(self, place: PlaceInfo) -> Any:
+        """ Extract tokens for the given place and compute the
+            information to be handed to the PL/pgSQL processor for building
+            the search index.
+
+            Arguments:
+                place: Place information retrieved from the database.
+
+            Returns:
+                A JSON-serialisable structure that will be handed into
+                the database via the `token_info` field.
+        """
+
+
+
+class AbstractTokenizer(ABC):
+    """ The tokenizer instance is the central instance of the tokenizer in
+        the system. There will only be a single instance of the tokenizer
+        active at any time.
+    """
+
+    @abstractmethod
+    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
+        """ Set up a new tokenizer for the database.
+
+            The function should copy all necessary data into the project
+            directory or save it in the property table to make sure that
+            the tokenizer remains stable over updates.
+
+            Arguments:
+                config: Read-only object with configuration options.
+
+                init_db: When set to False, then initialisation of database
+                         tables should be skipped. This option is only required
+                         for migration purposes and can be safely ignored by
+                         custom tokenizers.
+
+            TODO: can we move the init_db parameter somewhere else?
+        """
+        pass
+
+
+    @abstractmethod
+    def init_from_project(self, config: Configuration) -> None:
+        """ Initialise the tokenizer from an existing database setup.
+
+            The function should load all previously saved configuration from
+            the project directory and/or the property table.
+
+            Arguments:
+                config: Read-only object with configuration options.
+        """
+        pass
+
+
+    @abstractmethod
+    def finalize_import(self, config: Configuration) -> None:
+        """ This function is called at the very end of an import when all
+            data has been imported and indexed. The tokenizer may create
+            at this point any additional indexes and data structures needed
+            during query time.
+
+            Arguments:
+                config: Read-only object with configuration options.
+        """
+        pass
+
+
+    @abstractmethod
+    def update_sql_functions(self, config: Configuration) -> None:
+        """ Update the SQL part of the tokenizer. This function is called
+            automatically on migrations or may be called explicitly by the
+            user through the `nominatim refresh --functions` command.
+
+            The tokenizer must only update the code of the tokenizer. The
+            data structures or data itself must not be changed by this function.
+
+            Arguments:
+                config: Read-only object with configuration options.
+        """
+        pass
+
+
+    @abstractmethod
+    def check_database(self, config: Configuration) -> str:
+        """ Check that the database is set up correctly and ready for being
+            queried.
+
+            Arguments:
+                config: Read-only object with configuration options.
+
+            Returns:
+                If an issue was found, return an error message with the
+                description of the issue as well as hints for the user on
+                how to resolve the issue. If everything is okay, return `None`.
+        """
+        pass
+
+
+    @abstractmethod
+    def update_statistics(self) -> None:
+        """ Recompute any tokenizer statistics necessary for efficient lookup.
+            This function is meant to be called from time to time by the user
+            to improve performance. However, the tokenizer must not depend on
+            it to be called in order to work.
+        """
+        pass
+
+
+    @abstractmethod
+    def name_analyzer(self) -> AbstractAnalyzer:
+        """ Create a new analyzer for tokenizing names and queries
+            using this tokenizer. Analyzers are context managers and should
+            be used accordingly:
+
+            ```
+            with tokenizer.name_analyzer() as analyzer:
+                analyzer.tokenize()
+            ```
+
+            When used outside the with construct, the caller must ensure to
+            call the close() function before destructing the analyzer.
+        """
+        pass
```
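Because `AbstractAnalyzer` ships the `__enter__`/`__exit__` plumbing, a concrete analyzer only has to fill in the abstract methods. A minimal toy subclass to show the shape (the trivial bodies are placeholders for illustration, not Nominatim code; assumes the nominatim package is importable):

```python
from nominatim.tokenizer.base import AbstractAnalyzer

class EchoAnalyzer(AbstractAnalyzer):
    """ Toy analyzer that performs no real normalisation. """

    def close(self):
        pass  # nothing to free in this toy

    def get_word_token_info(self, words):
        return [(w, w.lower(), i) for i, w in enumerate(words)]

    def normalize_postcode(self, postcode):
        return postcode.strip().upper()

    def update_postcodes_from_db(self):
        pass

    def update_special_phrases(self, phrases, should_replace):
        pass

    def add_country_names(self, country_code, names):
        pass

    def process_place(self, place):
        return {'names': place.name or {}}


# The base class turns every analyzer into a context manager,
# so close() is guaranteed to run:
with EchoAnalyzer() as analyzer:
    print(analyzer.normalize_postcode(' 10117 '))
```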
nominatim/tokenizer/factory.py

```diff
@@ -85,6 +85,6 @@ def get_tokenizer_for_db(config):
     tokenizer_module = _import_tokenizer(name)
 
     tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
-    tokenizer.init_from_project()
+    tokenizer.init_from_project(config)
 
     return tokenizer
```
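The only behavioural change here is the new `config` argument: restoring a tokenizer from an existing project now also gets access to the configuration, matching the `AbstractTokenizer.init_from_project(config)` signature introduced above. Third-party tokenizers have to follow suit; roughly (the class is a hypothetical custom tokenizer, not shipped code):

```python
class MyTokenizer:
    def init_from_project(self, config):   # previously: def init_from_project(self)
        # Configuration is now available while restoring saved state.
        self.term_normalization = config.TERM_NORMALIZATION
```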
nominatim/tokenizer/icu_name_processor.py (deleted, 146 lines)

```diff
@@ -1,146 +0,0 @@
-"""
-Processor for names that are imported into the database based on the
-ICU library.
-"""
-from collections import defaultdict
-import itertools
-
-from icu import Transliterator
-import datrie
-
-from nominatim.db.properties import set_property, get_property
-from nominatim.tokenizer import icu_variants as variants
-
-DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
-DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
-DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
-DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
-
-
-class ICUNameProcessorRules:
-    """ Data object that saves the rules needed for the name processor.
-
-        The rules can either be initialised through an ICURuleLoader or
-        be loaded from a database when a connection is given.
-    """
-    def __init__(self, loader=None, conn=None):
-        if loader is not None:
-            self.norm_rules = loader.get_normalization_rules()
-            self.trans_rules = loader.get_transliteration_rules()
-            self.replacements = loader.get_replacement_pairs()
-            self.search_rules = loader.get_search_rules()
-        elif conn is not None:
-            self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
-            self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
-            self.replacements = \
-                variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
-            self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
-        else:
-            assert False, "Parameter loader or conn required."
-
-
-    def save_rules(self, conn):
-        """ Save the rules in the property table of the given database.
-            The rules can be loaded again by handing in a connection into
-            the constructor of the class.
-        """
-        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
-        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
-        set_property(conn, DBCFG_IMPORT_REPLACEMENTS,
-                     variants.pickle_variant_set(self.replacements))
-        set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
-
-
-class ICUNameProcessor:
-    """ Collects the different transformation rules for normalisation of names
-        and provides the functions to apply the transformations.
-    """
-
-    def __init__(self, rules):
-        self.normalizer = Transliterator.createFromRules("icu_normalization",
-                                                         rules.norm_rules)
-        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
-                                                       rules.trans_rules +
-                                                       ";[:Space:]+ > ' '")
-        self.search = Transliterator.createFromRules("icu_search",
-                                                     rules.search_rules)
-
-        # Intermediate reorder by source. Also compute required character set.
-        immediate = defaultdict(list)
-        chars = set()
-        for variant in rules.replacements:
-            if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
-                replstr = variant.replacement[:-1]
-            else:
-                replstr = variant.replacement
-            immediate[variant.source].append(replstr)
-            chars.update(variant.source)
-        # Then copy to datrie
-        self.replacements = datrie.Trie(''.join(chars))
-        for src, repllist in immediate.items():
-            self.replacements[src] = repllist
-
-
-    def get_normalized(self, name):
-        """ Normalize the given name, i.e. remove all elements not relevant
-            for search.
-        """
-        return self.normalizer.transliterate(name).strip()
-
-    def get_variants_ascii(self, norm_name):
-        """ Compute the spelling variants for the given normalized name
-            and transliterate the result.
-        """
-        baseform = '^ ' + norm_name + ' ^'
-        partials = ['']
-
-        startpos = 0
-        pos = 0
-        force_space = False
-        while pos < len(baseform):
-            full, repl = self.replacements.longest_prefix_item(baseform[pos:],
-                                                               (None, None))
-            if full is not None:
-                done = baseform[startpos:pos]
-                partials = [v + done + r
-                            for v, r in itertools.product(partials, repl)
-                            if not force_space or r.startswith(' ')]
-                if len(partials) > 128:
-                    # If too many variants are produced, they are unlikely
-                    # to be helpful. Only use the original term.
-                    startpos = 0
-                    break
-                startpos = pos + len(full)
-                if full[-1] == ' ':
-                    startpos -= 1
-                    force_space = True
-                pos = startpos
-            else:
-                pos += 1
-                force_space = False
-
-        # No variants detected? Fast return.
-        if startpos == 0:
-            trans_name = self.to_ascii.transliterate(norm_name).strip()
-            return [trans_name] if trans_name else []
-
-        return self._compute_result_set(partials, baseform[startpos:])
-
-
-    def _compute_result_set(self, partials, prefix):
-        results = set()
-
-        for variant in partials:
-            vname = variant + prefix
-            trans_name = self.to_ascii.transliterate(vname[1:-1]).strip()
-            if trans_name:
-                results.add(trans_name)
-
-        return list(results)
-
-
-    def get_search_normalized(self, name):
-        """ Return the normalized version of the name (including transliteration)
-            to be applied at search time.
-        """
-        return self.search.transliterate(' ' + name + ' ').strip()
```
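For reference, the heart of the deleted `get_variants_ascii` is a longest-prefix scan over a `datrie.Trie` that forks the partial-variant list at every replacement hit. The idea in isolation (a simplified toy, not the removed code; requires the datrie package):

```python
import datrie

# One abbreviation rule, stored the way the removed ICUNameProcessor did:
# the trie maps a source string to the list of allowed replacements.
trie = datrie.Trie('abcdefghijklmnopqrstuvwxyz ')
trie['strasse '] = ['strasse ', 'str ']

baseform = 'hauptstrasse '
pos = 0
partials = ['']
while pos < len(baseform):
    match, repls = trie.longest_prefix_item(baseform[pos:], (None, None))
    if match is None:
        partials = [p + baseform[pos] for p in partials]
        pos += 1
    else:
        # Fork every partial variant once per replacement.
        partials = [p + r for p in partials for r in repls]
        pos += len(match)

print(partials)  # ['hauptstrasse ', 'hauptstr ']
```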
nominatim/tokenizer/icu_rule_loader.py

```diff
@@ -1,61 +1,86 @@
 """
 Helper class to create ICU rules from a configuration file.
 """
+import importlib
 import io
+import json
 import logging
-import itertools
-from pathlib import Path
-import re
-
-import yaml
-from icu import Transliterator
 
+from nominatim.config import flatten_config_list
+from nominatim.db.properties import set_property, get_property
 from nominatim.errors import UsageError
-import nominatim.tokenizer.icu_variants as variants
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
+import nominatim.tools.country_info
 
 LOG = logging.getLogger()
 
-def _flatten_yaml_list(content):
-    if not content:
-        return []
-
-    if not isinstance(content, list):
-        raise UsageError("List expected in ICU yaml configuration.")
-
-    output = []
-    for ele in content:
-        if isinstance(ele, list):
-            output.extend(_flatten_yaml_list(ele))
-        else:
-            output.append(ele)
-
-    return output
-
-
-class VariantRule:
-    """ Saves a single variant expansion.
-
-        An expansion consists of the normalized replacement term and
-        a dictionary of properties that describe when the expansion applies.
-    """
-
-    def __init__(self, replacement, properties):
-        self.replacement = replacement
-        self.properties = properties or {}
+DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
+DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
+DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
+
+
+def _get_section(rules, section):
+    """ Get the section named 'section' from the rules. If the section does
+        not exist, raise a usage error with a meaningful message.
+    """
+    if section not in rules:
+        LOG.fatal("Section '%s' not found in tokenizer config.", section)
+        raise UsageError("Syntax error in tokenizer configuration file.")
+
+    return rules[section]
 
 
 class ICURuleLoader:
     """ Compiler for ICU rules from a tokenizer configuration file.
     """
 
-    def __init__(self, configfile):
-        self.configfile = configfile
-        self.variants = set()
-
-        if configfile.suffix == '.yaml':
-            self._load_from_yaml()
-        else:
-            raise UsageError("Unknown format of tokenizer configuration.")
+    def __init__(self, config):
+        rules = config.load_sub_configuration('icu_tokenizer.yaml',
+                                              config='TOKENIZER_CONFIG')
+
+        # Make sure country information is available to analyzers and sanitizers.
+        nominatim.tools.country_info.setup_country_config(config)
+
+        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
+        self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
+        self.analysis_rules = _get_section(rules, 'token-analysis')
+        self._setup_analysis()
+
+        # Load optional sanitizer rule set.
+        self.sanitizer_rules = rules.get('sanitizers', [])
+
+
+    def load_config_from_db(self, conn):
+        """ Get previously saved parts of the configuration from the
+            database.
+        """
+        self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
+        self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
+        self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
+        self._setup_analysis()
+
+
+    def save_config_to_db(self, conn):
+        """ Save the part of the configuration that cannot be changed into
+            the database.
+        """
+        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
+        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
+        set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
+
+
+    def make_sanitizer(self):
+        """ Create a place sanitizer from the configured rules.
+        """
+        return PlaceSanitizer(self.sanitizer_rules)
+
+
+    def make_token_analysis(self):
+        """ Create a token analyser from the previously loaded rules.
+        """
+        return ICUTokenAnalysis(self.normalization_rules,
+                                self.transliteration_rules, self.analysis)
 
 
     def get_search_rules(self):
@@ -70,177 +95,66 @@ class ICURuleLoader:
         rules.write(self.transliteration_rules)
         return rules.getvalue()
 
 
     def get_normalization_rules(self):
         """ Return rules for normalisation of a term.
         """
         return self.normalization_rules
 
 
     def get_transliteration_rules(self):
         """ Return the rules for converting a string into its ascii representation.
         """
         return self.transliteration_rules
 
-    def get_replacement_pairs(self):
-        """ Return the list of possible compound decompositions with
-            application of abbreviations included.
-            The result is a list of pairs: the first item is the sequence to
-            replace, the second is a list of replacements.
-        """
-        return self.variants
-
-    def _yaml_include_representer(self, loader, node):
-        value = loader.construct_scalar(node)
-
-        if Path(value).is_absolute():
-            content = Path(value).read_text()
-        else:
-            content = (self.configfile.parent / value).read_text()
-
-        return yaml.safe_load(content)
-
-
-    def _load_from_yaml(self):
-        yaml.add_constructor('!include', self._yaml_include_representer,
-                             Loader=yaml.SafeLoader)
-        rules = yaml.safe_load(self.configfile.read_text())
-
-        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
-        self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
-        self._parse_variant_list(self._get_section(rules, 'variants'))
-
-
-    def _get_section(self, rules, section):
-        """ Get the section named 'section' from the rules. If the section does
-            not exist, raise a usage error with a meaningful message.
-        """
-        if section not in rules:
-            LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
-                      section, str(self.configfile))
-            raise UsageError("Syntax error in tokenizer configuration file.")
-
-        return rules[section]
-
-
-    def _cfg_to_icu_rules(self, rules, section):
+
+    def _setup_analysis(self):
+        """ Process the rules used for creating the various token analyzers.
+        """
+        self.analysis = {}
+
+        if not isinstance(self.analysis_rules, list):
+            raise UsageError("Configuration section 'token-analysis' must be a list.")
+
+        for section in self.analysis_rules:
+            name = section.get('id', None)
+            if name in self.analysis:
+                if name is None:
+                    LOG.fatal("ICU tokenizer configuration has two default token analyzers.")
+                else:
+                    LOG.fatal("ICU tokenizer configuration has two token "
+                              "analyzers with id '%s'.", name)
+                raise UsageError("Syntax error in ICU tokenizer config.")
+            self.analysis[name] = TokenAnalyzerRule(section, self.normalization_rules)
+
+
+    @staticmethod
+    def _cfg_to_icu_rules(rules, section):
         """ Load an ICU ruleset from the given section. If the section is a
             simple string, it is interpreted as a file name and the rules are
             loaded verbatim from the given file. The filename is expected to be
             relative to the tokenizer rule file. If the section is a list then
             each line is assumed to be a rule. All rules are concatenated and returned.
         """
-        content = self._get_section(rules, section)
+        content = _get_section(rules, section)
 
         if content is None:
             return ''
 
-        return ';'.join(_flatten_yaml_list(content)) + ';'
-
-
-    def _parse_variant_list(self, rules):
-        self.variants.clear()
-
-        if not rules:
-            return
-
-        rules = _flatten_yaml_list(rules)
-
-        vmaker = _VariantMaker(self.normalization_rules)
-
-        properties = []
-        for section in rules:
-            # Create the property field and deduplicate against existing
-            # instances.
-            props = variants.ICUVariantProperties.from_rules(section)
-            for existing in properties:
-                if existing == props:
-                    props = existing
-                    break
-            else:
-                properties.append(props)
-
-            for rule in (section.get('words') or []):
-                self.variants.update(vmaker.compute(rule, props))
-
-
-class _VariantMaker:
-    """ Generator for all necessary ICUVariants from a single variant rule.
-
-        All text in rules is normalized to make sure the variants match later.
-    """
-
-    def __init__(self, norm_rules):
-        self.norm = Transliterator.createFromRules("rule_loader_normalization",
-                                                   norm_rules)
-
-
-    def compute(self, rule, props):
-        """ Generator for all ICUVariant tuples from a single variant rule.
-        """
-        parts = re.split(r'(\|)?([=-])>', rule)
-        if len(parts) != 4:
-            raise UsageError("Syntax error in variant rule: " + rule)
-
-        decompose = parts[1] is None
-        src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
-        repl_terms = (self.norm.transliterate(t.strip()) for t in parts[3].split(','))
-
-        # If the source should be kept, add a 1:1 replacement
-        if parts[2] == '-':
-            for src in src_terms:
-                if src:
-                    for froms, tos in _create_variants(*src, src[0], decompose):
-                        yield variants.ICUVariant(froms, tos, props)
-
-        for src, repl in itertools.product(src_terms, repl_terms):
-            if src and repl:
-                for froms, tos in _create_variants(*src, repl, decompose):
-                    yield variants.ICUVariant(froms, tos, props)
-
-
-    def _parse_variant_word(self, name):
-        name = name.strip()
-        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
-        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
-            raise UsageError("Invalid variant word descriptor '{}'".format(name))
-        norm_name = self.norm.transliterate(match.group(2))
-        if not norm_name:
-            return None
-
-        return norm_name, match.group(1), match.group(3)
-
-
-_FLAG_MATCH = {'^': '^ ',
-               '$': ' ^',
-               '': ' '}
-
-
-def _create_variants(src, preflag, postflag, repl, decompose):
-    if preflag == '~':
-        postfix = _FLAG_MATCH[postflag]
-        # suffix decomposition
-        src = src + postfix
-        repl = repl + postfix
-
-        yield src, repl
-        yield ' ' + src, ' ' + repl
-
-        if decompose:
-            yield src, ' ' + repl
-            yield ' ' + src, repl
-    elif postflag == '~':
-        # prefix decomposition
-        prefix = _FLAG_MATCH[preflag]
-        src = prefix + src
-        repl = prefix + repl
-
-        yield src, repl
-        yield src + ' ', repl + ' '
-
-        if decompose:
-            yield src, repl + ' '
-            yield src + ' ', repl
-    else:
-        prefix = _FLAG_MATCH[preflag]
-        postfix = _FLAG_MATCH[postflag]
-
-        yield prefix + src + postfix, prefix + repl + postfix
+        return ';'.join(flatten_config_list(content, section)) + ';'
+
+
+class TokenAnalyzerRule:
+    """ Factory for a single analysis module. The class saves the configuration
+        and creates a new token analyzer on request.
+    """
+
+    def __init__(self, rules, normalization_rules):
+        # Find the analysis module
+        module_name = 'nominatim.tokenizer.token_analysis.' \
+                      + _get_section(rules, 'analyzer').replace('-', '_')
+        analysis_mod = importlib.import_module(module_name)
+        self.create = analysis_mod.create
+
+        # Load the configuration.
+        self.config = analysis_mod.configure(rules, normalization_rules)
```
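`TokenAnalyzerRule` resolves `analyzer: foo-bar` to the module `nominatim.tokenizer.token_analysis.foo_bar` and expects it to export `configure()` and `create()`. A skeleton of that contract (the module and class here are hypothetical; only the two entry points and the `get_variants_ascii` hook used later by the analyzer are taken from this change set):

```python
""" Hypothetical nominatim/tokenizer/token_analysis/my_analyzer.py,
    selected via `analyzer: my-analyzer` in the token-analysis section.
"""

def configure(rules, normalization_rules):
    """ Run once while loading the rules; the return value is stored
        in TokenAnalyzerRule.config and handed back to create().
    """
    return {'variants': rules.get('variants') or []}


def create(transliterator, config):
    """ Run once per analyzer instance (see ICUTokenAnalysis below) with
        the ready-made to-ascii transliterator and the configured data.
    """
    return _Analyzer(transliterator, config)


class _Analyzer:
    def __init__(self, to_ascii, config):
        self.to_ascii = to_ascii
        self.config = config

    def get_variants_ascii(self, norm_name):
        # A real module would expand spelling variants here; the
        # fallback is simply the transliterated name itself.
        trans = self.to_ascii.transliterate(norm_name).strip()
        return [trans] if trans else []
```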
nominatim/tokenizer/icu_token_analysis.py (new file, 23 lines)

```diff
@@ -0,0 +1,23 @@
+"""
+Container class collecting all components required to transform an OSM name
+into a Nominatim token.
+"""
+
+from icu import Transliterator
+
+class ICUTokenAnalysis:
+    """ Container class collecting the transliterators and token analysis
+        modules for a single NameAnalyser instance.
+    """
+
+    def __init__(self, norm_rules, trans_rules, analysis_rules):
+        self.normalizer = Transliterator.createFromRules("icu_normalization",
+                                                         norm_rules)
+        trans_rules += ";[:Space:]+ > ' '"
+        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
+                                                       trans_rules)
+        self.search = Transliterator.createFromRules("icu_search",
+                                                     norm_rules + trans_rules)
+
+        self.analysis = {name: arules.create(self.to_ascii, arules.config)
+                         for name, arules in analysis_rules.items()}
```
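The three transliterators are plain ICU rule transforms. A tiny standalone sketch of the same construction (requires PyICU; the rule strings are deliberately trivial stand-ins for the real rule sets):

```python
from icu import Transliterator

norm_rules = ':: lower ();'    # toy normalization: just lower-case
trans_rules = ':: Latin ();'   # toy transliteration: convert to Latin script

normalizer = Transliterator.createFromRules("icu_normalization", norm_rules)
to_ascii = Transliterator.createFromRules("icu_to_ascii",
                                          trans_rules + ";[:Space:]+ > ' '")
search = Transliterator.createFromRules("icu_search",
                                        norm_rules + trans_rules)

print(normalizer.transliterate('Unter den Linden'))  # 'unter den linden'
```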
nominatim/tokenizer/icu_tokenizer.py

```diff
@@ -2,22 +2,19 @@
 Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
-from collections import Counter
 import itertools
 import json
 import logging
 import re
 from textwrap import dedent
-from pathlib import Path
 
 from nominatim.db.connection import connect
-from nominatim.db.properties import set_property, get_property
 from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
+from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
-DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
 
 LOG = logging.getLogger()
@@ -28,7 +25,7 @@ def create(dsn, data_dir):
     return LegacyICUTokenizer(dsn, data_dir)
 
 
-class LegacyICUTokenizer:
+class LegacyICUTokenizer(AbstractTokenizer):
     """ This tokenizer uses libICU to convert names and queries to ASCII.
         Otherwise it uses the same algorithms and data structures as the
         normalization routines in Nominatim 3.
@@ -37,9 +34,7 @@ class LegacyICUTokenizer:
     def __init__(self, dsn, data_dir):
         self.dsn = dsn
         self.data_dir = data_dir
-        self.naming_rules = None
-        self.term_normalization = None
-        self.max_word_frequency = None
+        self.loader = None
 
 
     def init_new_db(self, config, init_db=True):
@@ -48,58 +43,67 @@ class LegacyICUTokenizer:
             This copies all necessary data in the project directory to make
             sure the tokenizer remains stable even over updates.
         """
-        if config.TOKENIZER_CONFIG:
-            cfgfile = Path(config.TOKENIZER_CONFIG)
-        else:
-            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
-
-        loader = ICURuleLoader(cfgfile)
-        self.naming_rules = ICUNameProcessorRules(loader=loader)
-        self.term_normalization = config.TERM_NORMALIZATION
-        self.max_word_frequency = config.MAX_WORD_FREQUENCY
+        self.loader = ICURuleLoader(config)
 
         self._install_php(config.lib_dir.php)
-        self._save_config(config)
+        self._save_config()
 
         if init_db:
             self.update_sql_functions(config)
             self._init_db_tables(config)
 
 
-    def init_from_project(self):
+    def init_from_project(self, config):
         """ Initialise the tokenizer from the project directory.
         """
+        self.loader = ICURuleLoader(config)
+
         with connect(self.dsn) as conn:
-            self.naming_rules = ICUNameProcessorRules(conn=conn)
-            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
-            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
+            self.loader.load_config_from_db(conn)
 
 
-    def finalize_import(self, _):
+    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
+        with connect(self.dsn) as conn:
+            sqlp = SQLPreprocessor(conn, config)
+            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
 
 
     def update_sql_functions(self, config):
         """ Reimport the SQL functions for this tokenizer.
         """
         with connect(self.dsn) as conn:
-            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
-                              max_word_freq=max_word_freq)
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
 
 
-    def check_database(self):
+    def check_database(self, config):
         """ Check that the tokenizer is set up correctly.
         """
-        self.init_from_project()
+        # Will throw an error if there is an issue.
+        self.init_from_project(config)
 
-        if self.naming_rules is None:
-            return "Configuration for tokenizer 'legacy_icu' are missing."
 
-        return None
+    def update_statistics(self):
+        """ Recompute frequencies for all name words.
+        """
+        with connect(self.dsn) as conn:
+            if conn.table_exists('search_name'):
+                with conn.cursor() as cur:
+                    cur.drop_table("word_frequencies")
+                    LOG.info("Computing word frequencies")
+                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                     SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute("CREATE INDEX ON word_frequencies(id)")
+                    LOG.info("Update word table with recomputed frequencies")
+                    cur.execute("""UPDATE word
+                                   SET info = info || jsonb_build_object('count', count)
+                                   FROM word_frequencies WHERE word_id = id""")
+                    cur.drop_table("word_frequencies")
+            conn.commit()
 
 
     def name_analyzer(self):
```
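`update_statistics` folds fresh per-token counts into the word table's JSONB `info` column in a single UPDATE. The same logic with plain psycopg2, for reference (a sketch; `table_exists()` and `drop_table()` above are Nominatim's own connection and cursor helpers, replaced here by raw SQL):

```python
import psycopg2

def update_word_counts(dsn):
    """ Standalone version of the frequency refresh (illustrative only). """
    with psycopg2.connect(dsn) as conn:
        with conn.cursor() as cur:
            cur.execute("DROP TABLE IF EXISTS word_frequencies")
            cur.execute("""CREATE TEMP TABLE word_frequencies AS
                             SELECT unnest(name_vector) as id, count(*)
                             FROM search_name GROUP BY id""")
            cur.execute("CREATE INDEX ON word_frequencies(id)")
            cur.execute("""UPDATE word
                           SET info = info || jsonb_build_object('count', count)
                           FROM word_frequencies WHERE word_id = id""")
            cur.execute("DROP TABLE word_frequencies")
        conn.commit()
```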
```diff
@@ -117,7 +121,8 @@ class LegacyICUTokenizer:
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
-        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
+                                     self.loader.make_token_analysis())
 
 
     def _install_php(self, phpdir):
@@ -126,21 +131,18 @@ class LegacyICUTokenizer:
         php_file = self.data_dir / "tokenizer.php"
         php_file.write_text(dedent(f"""\
             <?php
-            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
-            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
-            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
-            require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))
+            @define('CONST_Max_Word_Frequency', 10000000);
+            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
+            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
+            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 
 
-    def _save_config(self, config):
+    def _save_config(self):
         """ Save the configuration that needs to remain stable for the given
             database as database properties.
         """
         with connect(self.dsn) as conn:
-            self.naming_rules.save_rules(conn)
-
-            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
-            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
+            self.loader.save_config_to_db(conn)
 
 
     def _init_db_tables(self, config):
@@ -152,69 +154,23 @@ class LegacyICUTokenizer:
             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
             conn.commit()
 
-        LOG.warning("Precomputing word tokens")
-
-        # get partial words and their frequencies
-        words = self._count_partial_terms(conn)
-
-        # copy them back into the word table
-        with CopyBuffer() as copystr:
-            for term, cnt in words.items():
-                copystr.add('w', term, json.dumps({'count': cnt}))
-
-            with conn.cursor() as cur:
-                copystr.copy_out(cur, 'word',
-                                 columns=['type', 'word_token', 'info'])
-                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                               WHERE word_id is null and type = 'w'""")
-
-            conn.commit()
-
-    def _count_partial_terms(self, conn):
-        """ Count the partial terms from the names in the place table.
-        """
-        words = Counter()
-        name_proc = ICUNameProcessor(self.naming_rules)
-
-        with conn.cursor(name="words") as cur:
-            cur.execute(""" SELECT v, count(*) FROM
-                              (SELECT svals(name) as v FROM place)x
-                            WHERE length(v) < 75 GROUP BY v""")
-
-            for name, cnt in cur:
-                terms = set()
-                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                    if ' ' in word:
-                        terms.update(word.split())
-                for term in terms:
-                    words[term] += cnt
-
-        return words
-
-
-class LegacyICUNameAnalyzer:
+
+class LegacyICUNameAnalyzer(AbstractAnalyzer):
     """ The legacy analyzer uses the ICU library for splitting names.
 
         Each instance opens a connection to the database to request the
         normalization.
     """
 
-    def __init__(self, dsn, name_proc):
+    def __init__(self, dsn, sanitizer, token_analysis):
         self.conn = connect(dsn).connection
         self.conn.autocommit = True
-        self.name_processor = name_proc
+        self.sanitizer = sanitizer
+        self.token_analysis = token_analysis
 
         self._cache = _TokenCache()
 
 
-    def __enter__(self):
-        return self
-
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.close()
-
-
     def close(self):
         """ Free all resources used by the analyzer.
         """
@@ -223,6 +179,19 @@ class LegacyICUNameAnalyzer:
         self.conn = None
 
 
+    def _search_normalized(self, name):
+        """ Return the search token transliteration of the given name.
+        """
+        return self.token_analysis.search.transliterate(name).strip()
+
+
+    def _normalized(self, name):
+        """ Return the normalized version of the given name with all
+            non-relevant information removed.
+        """
+        return self.token_analysis.normalizer.transliterate(name).strip()
+
+
     def get_word_token_info(self, words):
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
@@ -238,9 +207,9 @@ class LegacyICUNameAnalyzer:
         partial_tokens = {}
         for word in words:
             if word.startswith('#'):
-                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
+                full_tokens[word] = self._search_normalized(word[1:])
             else:
-                partial_tokens[word] = self.name_processor.get_search_normalized(word)
+                partial_tokens[word] = self._search_normalized(word)
 
         with self.conn.cursor() as cur:
             cur.execute("""SELECT word_token, word_id
@@ -271,7 +240,7 @@ class LegacyICUNameAnalyzer:
 
             This function takes minor shortcuts on transliteration.
         """
-        return self.name_processor.get_search_normalized(hnr)
+        return self._search_normalized(hnr)
 
     def update_postcodes_from_db(self):
         """ Update postcode tokens in the word table from the location_postcode
@@ -294,7 +263,7 @@ class LegacyICUNameAnalyzer:
                     if postcode is None:
                         to_delete.append(word)
                     else:
-                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                        copystr.add(self._search_normalized(postcode),
                                     'P', postcode)
 
             if to_delete:
@@ -312,7 +281,7 @@ class LegacyICUNameAnalyzer:
             completely replaced. Otherwise the phrases are added to the
             already existing ones.
         """
-        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
 
         with self.conn.cursor() as cur:
@@ -342,7 +311,7 @@ class LegacyICUNameAnalyzer:
         added = 0
         with CopyBuffer() as copystr:
             for word, cls, typ, oper in to_add:
-                term = self.name_processor.get_search_normalized(word)
+                term = self._search_normalized(word)
                 if term:
                     copystr.add(term, 'S', word,
                                 json.dumps({'class': cls, 'type': typ,
```
```diff
@@ -376,9 +345,21 @@ class LegacyICUNameAnalyzer:
     def add_country_names(self, country_code, names):
         """ Add names for the given country to the search index.
         """
+        # Make sure any name preprocessing for country names applies.
+        info = PlaceInfo({'name': names, 'country_code': country_code,
+                          'rank_address': 4, 'class': 'boundary',
+                          'type': 'administrative'})
+        self._add_country_full_names(country_code,
+                                     self.sanitizer.process_names(info)[0])
+
+
+    def _add_country_full_names(self, country_code, names):
+        """ Add names for the given country from an already sanitized
+            name list.
+        """
         word_tokens = set()
-        for name in self._compute_full_names(names):
-            norm_name = self.name_processor.get_search_normalized(name)
+        for name in names:
+            norm_name = self._search_normalized(name.name)
             if norm_name:
                 word_tokens.add(norm_name)
 
@@ -404,23 +385,21 @@ class LegacyICUNameAnalyzer:
     def process_place(self, place):
         """ Determine tokenizer information about the given place.
 
-            Returns a JSON-serialisable structure that will be handed into
+            Returns a JSON-serializable structure that will be handed into
             the database via the token_info field.
         """
         token_info = _TokenInfo(self._cache)
 
-        names = place.get('name')
+        names, address = self.sanitizer.process_names(place)
 
         if names:
             fulls, partials = self._compute_name_tokens(names)
 
             token_info.add_names(fulls, partials)
 
-            country_feature = place.get('country_feature')
-            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self.add_country_names(country_feature.lower(), names)
+            if place.is_country():
+                self._add_country_full_names(place.country_code, names)
 
-        address = place.get('address')
         if address:
             self._process_place_address(token_info, address)
 
@@ -430,18 +409,18 @@ class LegacyICUNameAnalyzer:
     def _process_place_address(self, token_info, address):
         hnrs = []
         addr_terms = []
-        for key, value in address.items():
-            if key == 'postcode':
-                self._add_postcode(value)
-            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                hnrs.append(value)
-            elif key == 'street':
-                token_info.add_street(*self._compute_name_tokens({'name': value}))
-            elif key == 'place':
-                token_info.add_place(*self._compute_name_tokens({'name': value}))
-            elif not key.startswith('_') and \
-                 key not in ('country', 'full'):
-                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+        for item in address:
+            if item.kind == 'postcode':
+                self._add_postcode(item.name)
+            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(item.name)
+            elif item.kind == 'street':
+                token_info.add_street(self._compute_partial_tokens(item.name))
+            elif item.kind == 'place':
+                token_info.add_place(self._compute_partial_tokens(item.name))
+            elif not item.kind.startswith('_') and \
+                 item.kind not in ('country', 'full'):
+                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 
         if hnrs:
             hnrs = self._split_housenumbers(hnrs)
```
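The analyzer now iterates over sanitized name objects instead of raw key/value pairs: each item carries `name`, `kind`, and per-name attributes queried via `get_attr()`. A rough stand-in to illustrate the shape the code above relies on (the class is a hypothetical sketch; the real item type lives in the sanitizer module):

```python
class PlaceName:
    """ Hypothetical stand-in for the sanitizer's name item. """
    def __init__(self, name, kind, attr=None):
        self.name = name      # the name or address value itself
        self.kind = kind      # e.g. 'street', 'housenumber', 'postcode'
        self._attr = attr or {}

    def get_attr(self, key):
        return self._attr.get(key)

# _process_place_address() dispatches purely on item.kind:
address = [PlaceName('10117', 'postcode'),
           PlaceName('5', 'housenumber'),
           PlaceName('Unter den Linden', 'street')]
```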
@@ -451,28 +430,61 @@ class LegacyICUNameAnalyzer:
|
|||||||
token_info.add_address_terms(addr_terms)
|
token_info.add_address_terms(addr_terms)
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_partial_tokens(self, name):
|
||||||
|
""" Normalize the given term, split it into partial words and return
|
||||||
|
then token list for them.
|
||||||
|
"""
|
||||||
|
norm_name = self._search_normalized(name)
|
||||||
|
|
||||||
|
tokens = []
|
||||||
|
need_lookup = []
|
||||||
|
for partial in norm_name.split():
|
||||||
|
token = self._cache.partials.get(partial)
|
||||||
|
if token:
|
||||||
|
tokens.append(token)
|
||||||
|
else:
|
||||||
|
need_lookup.append(partial)
|
||||||
|
|
||||||
|
if need_lookup:
|
||||||
|
with self.conn.cursor() as cur:
|
||||||
|
cur.execute("""SELECT word, getorcreate_partial_word(word)
|
||||||
|
FROM unnest(%s) word""",
|
||||||
|
(need_lookup, ))
|
||||||
|
|
||||||
|
for partial, token in cur:
|
||||||
|
tokens.append(token)
|
||||||
|
self._cache.partials[partial] = token
|
||||||
|
|
||||||
|
return tokens
|
||||||
|
|
||||||
|
|
||||||
def _compute_name_tokens(self, names):
|
def _compute_name_tokens(self, names):
|
||||||
""" Computes the full name and partial name tokens for the given
|
""" Computes the full name and partial name tokens for the given
|
||||||
dictionary of names.
|
dictionary of names.
|
||||||
"""
|
"""
|
||||||
full_names = self._compute_full_names(names)
|
|
||||||
full_tokens = set()
|
full_tokens = set()
|
||||||
partial_tokens = set()
|
partial_tokens = set()
|
||||||
|
|
||||||
for name in full_names:
|
for name in names:
|
||||||
norm_name = self.name_processor.get_normalized(name)
|
analyzer_id = name.get_attr('analyzer')
|
||||||
full, part = self._cache.names.get(norm_name, (None, None))
|
norm_name = self._normalized(name.name)
|
||||||
|
if analyzer_id is None:
|
||||||
|
token_id = norm_name
|
||||||
|
else:
|
||||||
|
token_id = f'{norm_name}@{analyzer_id}'
|
||||||
|
|
||||||
|
full, part = self._cache.names.get(token_id, (None, None))
|
||||||
if full is None:
|
if full is None:
|
||||||
variants = self.name_processor.get_variants_ascii(norm_name)
|
variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
|
||||||
if not variants:
|
if not variants:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
|
cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
|
||||||
(norm_name, variants))
|
(token_id, variants))
|
||||||
full, part = cur.fetchone()
|
full, part = cur.fetchone()
|
||||||
|
|
||||||
self._cache.names[norm_name] = (full, part)
|
self._cache.names[token_id] = (full, part)
|
||||||
|
|
||||||
full_tokens.add(full)
|
full_tokens.add(full)
|
||||||
partial_tokens.update(part)
|
partial_tokens.update(part)
|
||||||
@@ -480,23 +492,6 @@ class LegacyICUNameAnalyzer:
         return full_tokens, partial_tokens


-    @staticmethod
-    def _compute_full_names(names):
-        """ Return the set of all full name word ids to be used with the
-            given dictionary of names.
-        """
-        full_names = set()
-        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
-            if name:
-                full_names.add(name)
-
-                brace_idx = name.find('(')
-                if brace_idx >= 0:
-                    full_names.add(name[:brace_idx].strip())
-
-        return full_names
-
-
     def _add_postcode(self, postcode):
         """ Make sure the normalized postcode is present in the word table.
         """
@@ -504,7 +499,7 @@ class LegacyICUNameAnalyzer:
             postcode = self.normalize_postcode(postcode)

             if postcode not in self._cache.postcodes:
-                term = self.name_processor.get_search_normalized(postcode)
+                term = self._search_normalized(postcode)
                 if not term:
                     return

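For context on the removal above: the deleted helper expanded multi-value name tags on ';' and ',' and additionally indexed the part of a name before an opening brace; that pre-processing appears to move out of the analyzer into the new sanitizer steps (see place_sanitizer.py further down in this diff). A standalone copy, with a sample input chosen purely for illustration:

import re

def compute_full_names(names):
    # Standalone copy of the removed helper, for experimentation.
    full_names = set()
    for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
        if name:
            full_names.add(name)
            brace_idx = name.find('(')
            if brace_idx >= 0:
                full_names.add(name[:brace_idx].strip())
    return full_names

assert compute_full_names({'name': 'Hauptstraße;Main Street (old)'}) \
        == {'Hauptstraße', 'Main Street (old)', 'Main Street'}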
@@ -563,30 +558,25 @@ class _TokenInfo:
         self.data['hnr'] = ';'.join(hnrs)


-    def add_street(self, fulls, _):
+    def add_street(self, tokens):
         """ Add addr:street match terms.
         """
-        if fulls:
-            self.data['street'] = self._mk_array(fulls)
+        if tokens:
+            self.data['street'] = self._mk_array(tokens)


-    def add_place(self, fulls, partials):
+    def add_place(self, tokens):
         """ Add addr:place search and match terms.
         """
-        if fulls:
-            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
-            self.data['place_match'] = self._mk_array(fulls)
+        if tokens:
+            self.data['place'] = self._mk_array(tokens)


     def add_address_terms(self, terms):
         """ Add additional address terms.
         """
-        tokens = {}
-        for key, fulls, partials in terms:
-            if fulls:
-                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
-                               self._mk_array(fulls)]
+        tokens = {key: self._mk_array(partials)
+                  for key, partials in terms if partials}

         if tokens:
             self.data['addr'] = tokens
@@ -600,6 +590,7 @@ class _TokenCache:
     """
     def __init__(self):
         self.names = {}
+        self.partials = {}
         self.postcodes = set()
         self.housenumbers = {}

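The reworked add_address_terms drops the former [search, match] pair per address key and stores a single array of partial-term tokens. Assuming _mk_array renders a PostgreSQL array literal (the shape of the _TokenInfo helper of that name is an assumption here, not shown in this diff), the new payload looks like:

def _mk_array(tokens):
    # Assumed shape of _TokenInfo._mk_array: a PostgreSQL array literal.
    return '{%s}' % ','.join(str(s) for s in tokens)

terms = [('city', [101, 102]), ('suburb', [])]

# The dict comprehension from the hunk above: empty partial lists drop out.
tokens = {key: _mk_array(partials) for key, partials in terms if partials}
assert tokens == {'city': '{101,102}'}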
@@ -1,57 +0,0 @@
-"""
-Data structures for saving variant expansions for ICU tokenizer.
-"""
-from collections import namedtuple
-import json
-
-_ICU_VARIANT_PORPERTY_FIELDS = ['lang']
-
-
-class ICUVariantProperties(namedtuple('_ICUVariantProperties', _ICU_VARIANT_PORPERTY_FIELDS)):
-    """ Data container for saving properties that describe when a variant
-        should be applied.
-
-        Property instances are hashable.
-    """
-    @classmethod
-    def from_rules(cls, _):
-        """ Create a new property type from a generic dictionary.
-
-            The function only takes into account the properties that are
-            understood presently and ignores all others.
-        """
-        return cls(lang=None)
-
-
-ICUVariant = namedtuple('ICUVariant', ['source', 'replacement', 'properties'])
-
-
-def pickle_variant_set(variants):
-    """ Serializes an iterable of variant rules to a string.
-    """
-    # Create a list of property sets so they don't need to be duplicated.
-    properties = {}
-    pid = 1
-    for variant in variants:
-        if variant.properties not in properties:
-            properties[variant.properties] = pid
-            pid += 1
-
-    # Convert the variants into a simple list.
-    variants = [(v.source, v.replacement, properties[v.properties]) for v in variants]
-
-    # Convert everything to json.
-    return json.dumps({'properties': {v: k._asdict() for k, v in properties.items()},
-                       'variants': variants})
-
-
-def unpickle_variant_set(variant_string):
-    """ Deserializes a variant string that was previously created with
-        pickle_variant_set() into a set of ICUVariants.
-    """
-    data = json.loads(variant_string)
-
-    properties = {int(k): ICUVariantProperties.from_rules(v)
-                  for k, v in data['properties'].items()}
-
-    return set((ICUVariant(src, repl, properties[pid]) for src, repl, pid in data['variants']))
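The deleted module serialized ICU variant rule sets to JSON, de-duplicating the property records by a numeric id. Run against the definitions removed above, a round-trip would have looked like this (the sample rule is invented for illustration):

props = ICUVariantProperties.from_rules({})      # only 'lang' was understood
rules = {ICUVariant('strasse', 'str', props)}

blob = pickle_variant_set(rules)
# blob is a JSON document along the lines of
# {"properties": {"1": {"lang": null}}, "variants": [["strasse", "str", 1]]}

assert unpickle_variant_set(blob) == rules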
@@ -16,6 +16,7 @@ from nominatim.db import properties
 from nominatim.db import utils as db_utils
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.errors import UsageError
+from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

 DBCFG_NORMALIZATION = "tokenizer_normalization"
 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
@@ -76,7 +77,7 @@ def _check_module(module_dir, conn):
         raise UsageError("Database module cannot be accessed.") from err


-class LegacyTokenizer:
+class LegacyTokenizer(AbstractTokenizer):
     """ The legacy tokenizer uses a special PostgreSQL module to normalize
         names and queries. The tokenizer thus implements normalization through
         calls to the database.
@@ -112,7 +113,7 @@ class LegacyTokenizer:
         self._init_db_tables(config)


-    def init_from_project(self):
+    def init_from_project(self, _):
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
@@ -141,7 +142,7 @@ class LegacyTokenizer:
                               modulepath=modulepath)


-    def check_database(self):
+    def check_database(self, _):
         """ Check that the tokenizer is set up correctly.
         """
         hint = """\
@@ -185,6 +186,25 @@ class LegacyTokenizer:
             self._save_config(conn, config)


+    def update_statistics(self):
+        """ Recompute the frequency of full words.
+        """
+        with connect(self.dsn) as conn:
+            if conn.table_exists('search_name'):
+                with conn.cursor() as cur:
+                    cur.drop_table("word_frequencies")
+                    LOG.info("Computing word frequencies")
+                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
+                                     SELECT unnest(name_vector) as id, count(*)
+                                     FROM search_name GROUP BY id""")
+                    cur.execute("CREATE INDEX ON word_frequencies(id)")
+                    LOG.info("Update word table with recomputed frequencies")
+                    cur.execute("""UPDATE word SET search_name_count = count
+                                   FROM word_frequencies
+                                   WHERE word_token like ' %' and word_id = id""")
+                    cur.drop_table("word_frequencies")
+            conn.commit()
+
     def name_analyzer(self):
         """ Create a new analyzer for tokenizing names and queries
             using this tokenizer. Analyzers are context managers and should
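table_exists() and drop_table() in the new update_statistics() are conveniences from Nominatim's connection wrapper. For readers without those wrappers, a rough equivalent of the same statistics refresh in plain psycopg2 (DSN and table setup assumed) would be:

import psycopg2

def update_word_frequencies(dsn):
    # Standalone sketch of the update_statistics() logic above. Assumes
    # the standard 'word' and 'search_name' tables already exist.
    with psycopg2.connect(dsn) as conn:
        with conn.cursor() as cur:
            cur.execute("DROP TABLE IF EXISTS word_frequencies")
            cur.execute("""CREATE TEMP TABLE word_frequencies AS
                             SELECT unnest(name_vector) AS id, count(*)
                             FROM search_name GROUP BY id""")
            cur.execute("CREATE INDEX ON word_frequencies(id)")
            # Full-word tokens start with a space in the legacy word table.
            cur.execute("""UPDATE word SET search_name_count = count
                           FROM word_frequencies
                           WHERE word_token LIKE ' %' AND word_id = id""")
            cur.execute("DROP TABLE word_frequencies")
        conn.commit()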
@@ -238,7 +258,7 @@ class LegacyTokenizer:
             properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)


-class LegacyNameAnalyzer:
+class LegacyNameAnalyzer(AbstractAnalyzer):
     """ The legacy analyzer uses the special Postgresql module for
         splitting names.

@@ -255,14 +275,6 @@ class LegacyNameAnalyzer:
         self._cache = _TokenCache(self.conn)


-    def __enter__(self):
-        return self
-
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.close()
-
-
     def close(self):
         """ Free all resources used by the analyzer.
         """
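The __enter__/__exit__ pair removed here presumably moves up into the new AbstractAnalyzer base class, so every analyzer shares the same context-manager behaviour. A sketch of what the base likely provides — the real definitions live in nominatim/tokenizer/base.py, which also declares further abstract methods not shown here:

from abc import ABC, abstractmethod

class AbstractAnalyzer(ABC):
    """Common context-manager scaffolding shared by all name analyzers."""

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    @abstractmethod
    def close(self):
        """ Free all resources used by the analyzer. """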
@@ -412,16 +424,15 @@ class LegacyNameAnalyzer:
         """
         token_info = _TokenInfo(self._cache)

-        names = place.get('name')
+        names = place.name

         if names:
             token_info.add_names(self.conn, names)

-        country_feature = place.get('country_feature')
-        if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-            self.add_country_names(country_feature.lower(), names)
+        if place.is_country():
+            self.add_country_names(place.country_code, names)

-        address = place.get('address')
+        address = place.address
         if address:
             self._process_place_address(token_info, address)

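process_place() now receives an object with name, address and country_code accessors instead of a raw dict. A minimal stand-in with the same surface, for experimenting with the hunk above — the real class lives elsewhere in the Nominatim sources, and is_country() here merely mimics the two-letter country_feature check that the removed lines performed:

import re

class PlaceStub:
    """Illustrative stand-in for the place object used by process_place()."""

    def __init__(self, info):
        self._info = info

    @property
    def name(self):
        return self._info.get('name')

    @property
    def address(self):
        return self._info.get('address')

    @property
    def country_code(self):
        cc = self._info.get('country_feature', '')
        return cc.lower() if cc else None

    def is_country(self):
        # Mimics the removed check: a place counts as a country when it
        # carries a two-letter 'country_feature' tag.
        cc = self._info.get('country_feature', '')
        return bool(re.fullmatch('[A-Za-z][A-Za-z]', cc))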
nominatim/tokenizer/place_sanitizer.py (new file, 127 lines)
@@ -0,0 +1,127 @@
+"""
+Handler for cleaning name and address tags in place information before it
+is handed to the token analysis.
+"""
+import importlib
+
+from nominatim.errors import UsageError
+
+
+class PlaceName:
+    """ A searchable name for a place together with properties.
+        Every name object saves the name proper and two basic properties:
+        * 'kind' describes the name of the OSM key used without any suffixes
+          (i.e. the part after the colon removed)
+        * 'suffix' contains the suffix of the OSM tag, if any. The suffix
+          is the part of the key after the first colon.
+        In addition to that, the name may have arbitrary additional attributes.
+        Which attributes are used depends on the token analyser.
+    """
+
+    def __init__(self, name, kind, suffix):
+        self.name = name
+        self.kind = kind
+        self.suffix = suffix
+        self.attr = {}
+
+
+    def __repr__(self):
+        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
+
+
+    def clone(self, name=None, kind=None, suffix=None, attr=None):
+        """ Create a deep copy of the place name, optionally with the
+            given parameters replaced. In the attribute list only the given
+            keys are updated. The list is not replaced completely.
+            In particular, the function cannot be used to remove an
+            attribute from a place name.
+        """
+        newobj = PlaceName(name or self.name,
+                           kind or self.kind,
+                           suffix or self.suffix)
+
+        newobj.attr.update(self.attr)
+        if attr:
+            newobj.attr.update(attr)
+
+        return newobj
+
+
+    def set_attr(self, key, value):
+        """ Add the given property to the name. If the property was already
+            set, then the value is overwritten.
+        """
+        self.attr[key] = value
+
+
+    def get_attr(self, key, default=None):
+        """ Return the given property or the value of 'default' if it
+            is not set.
+        """
+        return self.attr.get(key, default)
+
+
+    def has_attr(self, key):
+        """ Check if the given attribute is set.
+        """
+        return key in self.attr
+
+
+class _ProcessInfo:
+    """ Container class for information handed to the handler functions.
+        The 'names' and 'address' members are mutable. A handler must change
+        them by either modifying the lists in place or replacing the old
+        content with a new list.
+    """
+
+    def __init__(self, place):
+        self.place = place
+        self.names = self._convert_name_dict(place.name)
+        self.address = self._convert_name_dict(place.address)
+
+
+    @staticmethod
+    def _convert_name_dict(names):
+        """ Convert a dictionary of names into a list of PlaceNames.
+            The dictionary key is split into the primary part of the key
+            and the suffix (the part after an optional colon).
+        """
+        out = []
+
+        if names:
+            for key, value in names.items():
+                parts = key.split(':', 1)
+                out.append(PlaceName(value.strip(),
+                                     parts[0].strip(),
+                                     parts[1].strip() if len(parts) > 1 else None))
+
+        return out
+
+
+class PlaceSanitizer:
+    """ Controller class which applies sanitizer functions on the place
+        names and address before they are used by the token analysers.
+    """
+
+    def __init__(self, rules):
+        self.handlers = []
+
+        if rules:
+            for func in rules:
+                if 'step' not in func:
+                    raise UsageError("Sanitizer rule is missing the 'step' attribute.")
+                module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
+                handler_module = importlib.import_module(module_name)
+                self.handlers.append(handler_module.create(func))
+
+
+    def process_names(self, place):
+        """ Extract a sanitized list of names and address parts from the
+            given place. The function returns a tuple
+            (list of names, list of address names)
+        """
+        obj = _ProcessInfo(place)
+
+        for func in self.handlers:
+            func(obj)
+
+        return obj.names, obj.address
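A sanitizer step, as loaded by PlaceSanitizer above, is any module under nominatim.tokenizer.sanitizers that exposes create(config) returning a callable over the _ProcessInfo object. As a hypothetical example (module path and step name invented for illustration), a step that re-creates the brace handling removed from _compute_full_names earlier in this diff:

# nominatim/tokenizer/sanitizers/strip_brace_terms.py (hypothetical path)

def create(_config):
    """Create a handler that adds the name part before a '(' as a variant."""
    def _process(obj):
        extra = []
        for name in obj.names:
            brace_idx = name.name.find('(')
            if brace_idx >= 0:
                stripped = name.name[:brace_idx].strip()
                if stripped:
                    extra.append(name.clone(name=stripped))
        obj.names.extend(extra)
    return _process

Such a step would be enabled through a rule list like [{'step': 'strip-brace-terms'}]; PlaceSanitizer maps the step name to the module name by replacing '-' with '_'.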
Some files were not shown because too many files have changed in this diff.