Mirror of https://github.com/osm-search/Nominatim.git (synced 2026-02-16 15:47:58 +00:00)

Merge pull request #2770 from lonvia/typed-python

Type annotations for Python code
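This PR converts the Python sources to fully annotated code and adds a `mypy --strict` check to CI. As a quick orientation before the file-by-file diff, here is a minimal sketch of the annotation style that strict mode enforces (illustrative only, not taken from the patch):

```python
# Illustrative sketch: every function is fully annotated, including the
# return type, so that it passes `mypy --strict`.
from typing import List, Optional


def parse_limit(value: Optional[str], default: int = 10) -> int:
    """Parse an optional numeric CLI argument, falling back to a default."""
    return default if value is None else int(value)


def split_ids(raw: str) -> List[str]:
    """Split a comma-separated ID list, dropping empty entries."""
    return [part for part in raw.split(',') if part]
```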
.github/workflows/ci-tests.yml (vendored, 9 changed lines)
@@ -98,8 +98,8 @@ jobs:
         run: sudo apt-get install -y -qq python3-pytest
         if: matrix.ubuntu == 22
 
-      - name: Install latest pylint
-        run: pip3 install pylint
+      - name: Install latest pylint/mypy
+        run: pip3 install -U pylint mypy types-PyYAML types-jinja2 types-psycopg2 types-psutil typing-extensions
 
       - name: PHP linting
         run: phpcs --report-width=120 .
@@ -109,6 +109,11 @@ jobs:
         run: pylint nominatim
         working-directory: Nominatim
 
+      - name: Python static typechecking
+        run: mypy --strict nominatim
+        working-directory: Nominatim
+
+
       - name: PHP unit tests
         run: phpunit ./
         working-directory: Nominatim/test/php
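The second hunk wires mypy into CI. Strict mode refuses, among other things, any function definition without complete annotations; a tiny illustration of the kind of error the new step catches (not from the repository):

```python
# Illustrative only: `mypy --strict` rejects the first definition.
def area(width, height):  # error: function is missing type annotations
    return width * height


def area_typed(width: float, height: float) -> float:
    return width * height
```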

.mypy.ini (new file, 13 lines)
@@ -0,0 +1,13 @@
+[mypy]
+
+[mypy-icu.*]
+ignore_missing_imports = True
+
+[mypy-osmium.*]
+ignore_missing_imports = True
+
+[mypy-datrie.*]
+ignore_missing_imports = True
+
+[mypy-dotenv.*]
+ignore_missing_imports = True
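The overrides keep strict checking workable: the four listed dependencies (PyICU, pyosmium, datrie and python-dotenv) ship no type stubs and would otherwise abort the run with missing-import errors. With `ignore_missing_imports`, anything imported from them is treated as `Any`. Roughly (illustrative; assumes PyICU is installed):

```python
# Illustrative only: effect of ignore_missing_imports on an un-stubbed module.
# Without the override, mypy reports:
#   error: Cannot find implementation or library stub for module named "icu"
import icu

# With the override, `icu` is typed as Any, so attribute access is unchecked:
trans = icu.Transliterator.createInstance('Any-Latin')
print(trans.transliterate('Αθήνα'))
```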

(pylint configuration)
@@ -11,6 +11,8 @@ ignored-modules=icu,datrie
 # 'with' statements.
 ignored-classes=NominatimArgs,closing
 # 'too-many-ancestors' is triggered already by deriving from UserDict
-disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use
+# 'not-context-manager' disabled because it causes false positives once
+# typed Python is enabled. See also https://github.com/PyCQA/pylint/issues/5273
+disable=too-few-public-methods,duplicate-code,too-many-ancestors,bad-option-value,no-self-use,not-context-manager
 
 good-names=i,x,y,fd,db,cc

(development documentation)
@@ -33,6 +33,8 @@ It has the following additional requirements:
 * [phpunit](https://phpunit.de) (9.5 is known to work)
 * [PHP CodeSniffer](https://github.com/squizlabs/PHP_CodeSniffer)
 * [Pylint](https://pylint.org/) (CI always runs the latest version from pip)
+* [mypy](http://mypy-lang.org/) (plus typing information for external libs)
+* [Python Typing Extensions](https://github.com/python/typing_extensions) (for Python < 3.9)
 * [pytest](https://pytest.org)
 
 The documentation is built with mkdocs:
@@ -50,9 +52,10 @@ To install all necessary packages run:
 
 ```sh
 sudo apt install php-cgi phpunit php-codesniffer \
-                 python3-pip python3-setuptools python3-dev pylint
+                 python3-pip python3-setuptools python3-dev
 
-pip3 install --user behave mkdocs mkdocstrings pytest
+pip3 install --user behave mkdocs mkdocstrings pytest \
+             pylint mypy types-PyYAML types-jinja2 types-psycopg2
 ```
 
 The `mkdocs` executable will be located in `.local/bin`. You may have to add

(SQL table definitions)
@@ -45,7 +45,7 @@ GRANT SELECT ON TABLE country_name TO "{{config.DATABASE_WEBUSER}}";
 
 DROP TABLE IF EXISTS nominatim_properties;
 CREATE TABLE nominatim_properties (
-    property TEXT,
+    property TEXT NOT NULL,
     value TEXT
 );
 GRANT SELECT ON TABLE nominatim_properties TO "{{config.DATABASE_WEBUSER}}";

nominatim/cli.py
@@ -8,6 +8,7 @@
 Command-line interface to the Nominatim functions for import, update,
 database administration and querying.
 """
+from typing import Optional, Any, List, Union
 import logging
 import os
 import sys
@@ -19,16 +20,15 @@ from nominatim.tools.exec_utils import run_legacy_script, run_php_server
 from nominatim.errors import UsageError
 from nominatim import clicmd
 from nominatim import version
-from nominatim.clicmd.args import NominatimArgs
+from nominatim.clicmd.args import NominatimArgs, Subcommand
 
 LOG = logging.getLogger()
 
 
 class CommandlineParser:
     """ Wraps some of the common functions for parsing the command line
         and setting up subcommands.
     """
-    def __init__(self, prog, description):
+    def __init__(self, prog: str, description: Optional[str]):
         self.parser = argparse.ArgumentParser(
             prog=prog,
             description=description,
@@ -56,8 +56,8 @@ class CommandlineParser:
         group.add_argument('-j', '--threads', metavar='NUM', type=int,
                            help='Number of parallel threads to use')
 
-    @staticmethod
-    def nominatim_version_text():
+
+    def nominatim_version_text(self) -> str:
         """ Program name and version number as string
         """
         text = f'Nominatim version {version.version_str()}'
@@ -65,11 +65,14 @@ class CommandlineParser:
             text += f' ({version.GIT_COMMIT_HASH})'
         return text
 
-    def add_subcommand(self, name, cmd):
+
+    def add_subcommand(self, name: str, cmd: Subcommand) -> None:
         """ Add a subcommand to the parser. The subcommand must be a class
             with a function add_args() that adds the parameters for the
             subcommand and a run() function that executes the command.
         """
+        assert cmd.__doc__ is not None
+
         parser = self.subs.add_parser(name, parents=[self.default_args],
                                       help=cmd.__doc__.split('\n', 1)[0],
                                       description=cmd.__doc__,
@@ -78,7 +81,8 @@ class CommandlineParser:
         parser.set_defaults(command=cmd)
         cmd.add_args(parser)
 
-    def run(self, **kwargs):
+
+    def run(self, **kwargs: Any) -> int:
         """ Parse the command line arguments of the program and execute the
             appropriate subcommand.
         """
@@ -89,7 +93,7 @@ class CommandlineParser:
             return 1
 
         if args.version:
-            print(CommandlineParser.nominatim_version_text())
+            print(self.nominatim_version_text())
             return 0
 
         if args.subcommand is None:
@@ -145,8 +149,7 @@ class QueryExport:
     Export addresses as CSV file from the database.
     """
 
-    @staticmethod
-    def add_args(parser):
+    def add_args(self, parser: argparse.ArgumentParser) -> None:
         group = parser.add_argument_group('Output arguments')
         group.add_argument('--output-type', default='street',
                            choices=('continent', 'country', 'state', 'county',
@@ -175,11 +178,10 @@ class QueryExport:
                            help='Export only children of this OSM relation')
 
 
-    @staticmethod
-    def run(args):
-        params = ['export.php',
-                  '--output-type', args.output_type,
-                  '--output-format', args.output_format]
+    def run(self, args: NominatimArgs) -> int:
+        params: List[Union[int, str]] = [
+                  '--output-type', args.output_type,
+                  '--output-format', args.output_format]
 
         if args.output_all_postcodes:
             params.append('--output-all-postcodes')
         if args.language:
@@ -193,7 +195,7 @@ class QueryExport:
         if args.restrict_to_osm_relation:
             params.extend(('--restrict-to-osm-relation', args.restrict_to_osm_relation))
 
-        return run_legacy_script(*params, nominatim_env=args)
+        return run_legacy_script('export.php', *params, nominatim_env=args)
 
 
 class AdminServe:
@@ -207,51 +209,52 @@ class AdminServe:
     By the default, the webserver can be accessed at: http://127.0.0.1:8088
     """
 
-    @staticmethod
-    def add_args(parser):
+    def add_args(self, parser: argparse.ArgumentParser) -> None:
         group = parser.add_argument_group('Server arguments')
         group.add_argument('--server', default='127.0.0.1:8088',
                            help='The address the server will listen to.')
 
-    @staticmethod
-    def run(args):
-        run_php_server(args.server, args.project_dir / 'website')
+
+    def run(self, args: NominatimArgs) -> int:
+        run_php_server(args.server, args.project_dir / 'website')
+        return 0
 
 
-def get_set_parser(**kwargs):
+def get_set_parser(**kwargs: Any) -> CommandlineParser:
     """\
     Initializes the parser and adds various subcommands for
    nominatim cli.
    """
     parser = CommandlineParser('nominatim', nominatim.__doc__)
 
-    parser.add_subcommand('import', clicmd.SetupAll)
-    parser.add_subcommand('freeze', clicmd.SetupFreeze)
-    parser.add_subcommand('replication', clicmd.UpdateReplication)
+    parser.add_subcommand('import', clicmd.SetupAll())
+    parser.add_subcommand('freeze', clicmd.SetupFreeze())
+    parser.add_subcommand('replication', clicmd.UpdateReplication())
 
-    parser.add_subcommand('special-phrases', clicmd.ImportSpecialPhrases)
+    parser.add_subcommand('special-phrases', clicmd.ImportSpecialPhrases())
 
-    parser.add_subcommand('add-data', clicmd.UpdateAddData)
-    parser.add_subcommand('index', clicmd.UpdateIndex)
+    parser.add_subcommand('add-data', clicmd.UpdateAddData())
+    parser.add_subcommand('index', clicmd.UpdateIndex())
     parser.add_subcommand('refresh', clicmd.UpdateRefresh())
 
-    parser.add_subcommand('admin', clicmd.AdminFuncs)
+    parser.add_subcommand('admin', clicmd.AdminFuncs())
 
-    parser.add_subcommand('export', QueryExport)
-    parser.add_subcommand('serve', AdminServe)
+    parser.add_subcommand('export', QueryExport())
+    parser.add_subcommand('serve', AdminServe())
 
     if kwargs.get('phpcgi_path'):
-        parser.add_subcommand('search', clicmd.APISearch)
-        parser.add_subcommand('reverse', clicmd.APIReverse)
-        parser.add_subcommand('lookup', clicmd.APILookup)
-        parser.add_subcommand('details', clicmd.APIDetails)
-        parser.add_subcommand('status', clicmd.APIStatus)
+        parser.add_subcommand('search', clicmd.APISearch())
+        parser.add_subcommand('reverse', clicmd.APIReverse())
+        parser.add_subcommand('lookup', clicmd.APILookup())
+        parser.add_subcommand('details', clicmd.APIDetails())
+        parser.add_subcommand('status', clicmd.APIStatus())
     else:
         parser.parser.epilog = 'php-cgi not found. Query commands not available.'
 
     return parser
 
 
-def nominatim(**kwargs):
+def nominatim(**kwargs: Any) -> int:
     """\
     Command-line tools for importing, updating, administrating and
     querying the Nominatim database.
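One detail worth noting in `add_subcommand()` above: `cmd.__doc__` is typed `Optional[str]`, so the `split()` calls below it would not pass strict checking; the new `assert` both documents the invariant that every subcommand carries a docstring and narrows the type for mypy. The same narrowing pattern in isolation (illustrative):

```python
# Illustrative only: assert-based narrowing of an Optional value.
from typing import Optional


def first_line(doc: Optional[str]) -> str:
    assert doc is not None  # narrows doc from Optional[str] to str
    return doc.split('\n', 1)[0]


print(first_line("Summary line.\nLonger description."))  # -> Summary line.
```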

nominatim/clicmd/__init__.py
@@ -7,13 +7,20 @@
 """
 Subcommand definitions for the command-line tool.
 """
-from nominatim.clicmd.setup import SetupAll
-from nominatim.clicmd.replication import UpdateReplication
-from nominatim.clicmd.api import APISearch, APIReverse, APILookup, APIDetails, APIStatus
-from nominatim.clicmd.index import UpdateIndex
-from nominatim.clicmd.refresh import UpdateRefresh
-from nominatim.clicmd.add_data import UpdateAddData
-from nominatim.clicmd.admin import AdminFuncs
-from nominatim.clicmd.freeze import SetupFreeze
-from nominatim.clicmd.special_phrases import ImportSpecialPhrases
+# mypy and pylint disagree about the style of explicit exports,
+# see https://github.com/PyCQA/pylint/issues/6006.
+# pylint: disable=useless-import-alias
+
+from nominatim.clicmd.setup import SetupAll as SetupAll
+from nominatim.clicmd.replication import UpdateReplication as UpdateReplication
+from nominatim.clicmd.api import (APISearch as APISearch,
+                                  APIReverse as APIReverse,
+                                  APILookup as APILookup,
+                                  APIDetails as APIDetails,
+                                  APIStatus as APIStatus)
+from nominatim.clicmd.index import UpdateIndex as UpdateIndex
+from nominatim.clicmd.refresh import UpdateRefresh as UpdateRefresh
+from nominatim.clicmd.add_data import UpdateAddData as UpdateAddData
+from nominatim.clicmd.admin import AdminFuncs as AdminFuncs
+from nominatim.clicmd.freeze import SetupFreeze as SetupFreeze
+from nominatim.clicmd.special_phrases import ImportSpecialPhrases as ImportSpecialPhrases
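The aliasing here looks redundant but is deliberate: `mypy --strict` enables `--no-implicit-reexport`, under which a name merely imported into `__init__.py` is not part of the package's public interface; importing it `as` itself marks an explicit re-export. Pylint flags the idiom as `useless-import-alias`, hence the disable above. In isolation:

```python
# Illustrative only: under mypy's --no-implicit-reexport (part of --strict),
# only the aliased form re-exports the name to users of this module.
from os.path import join             # private to this module
from os.path import split as split   # explicitly re-exported
```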

nominatim/clicmd/add_data.py
@@ -7,10 +7,14 @@
 """
 Implementation of the 'add-data' subcommand.
 """
+from typing import cast
+import argparse
 import logging
 
 import psutil
 
+from nominatim.clicmd.args import NominatimArgs
+
 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
 # Using non-top-level imports to avoid eventually unused imports.
@@ -35,32 +39,31 @@ class UpdateAddData:
     for more information.
     """
 
-    @staticmethod
-    def add_args(parser):
+    def add_args(self, parser: argparse.ArgumentParser) -> None:
         group_name = parser.add_argument_group('Source')
-        group = group_name.add_mutually_exclusive_group(required=True)
-        group.add_argument('--file', metavar='FILE',
+        group1 = group_name.add_mutually_exclusive_group(required=True)
+        group1.add_argument('--file', metavar='FILE',
                            help='Import data from an OSM file or diff file')
-        group.add_argument('--diff', metavar='FILE',
+        group1.add_argument('--diff', metavar='FILE',
                            help='Import data from an OSM diff file (deprecated: use --file)')
-        group.add_argument('--node', metavar='ID', type=int,
+        group1.add_argument('--node', metavar='ID', type=int,
                            help='Import a single node from the API')
-        group.add_argument('--way', metavar='ID', type=int,
+        group1.add_argument('--way', metavar='ID', type=int,
                            help='Import a single way from the API')
-        group.add_argument('--relation', metavar='ID', type=int,
+        group1.add_argument('--relation', metavar='ID', type=int,
                            help='Import a single relation from the API')
-        group.add_argument('--tiger-data', metavar='DIR',
+        group1.add_argument('--tiger-data', metavar='DIR',
                            help='Add housenumbers from the US TIGER census database')
-        group = parser.add_argument_group('Extra arguments')
-        group.add_argument('--use-main-api', action='store_true',
+        group2 = parser.add_argument_group('Extra arguments')
+        group2.add_argument('--use-main-api', action='store_true',
                            help='Use OSM API instead of Overpass to download objects')
-        group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
+        group2.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
                            help='Size of cache to be used by osm2pgsql (in MB)')
-        group.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
+        group2.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
                            help='Set timeout for file downloads')
 
-    @staticmethod
-    def run(args):
+
+    def run(self, args: NominatimArgs) -> int:
         from nominatim.tokenizer import factory as tokenizer_factory
         from nominatim.tools import tiger_data, add_osm_data
 
@@ -73,7 +76,7 @@ class UpdateAddData:
 
         osm2pgsql_params = args.osm2pgsql_options(default_cache=1000, default_threads=1)
         if args.file or args.diff:
-            return add_osm_data.add_data_from_file(args.file or args.diff,
+            return add_osm_data.add_data_from_file(cast(str, args.file or args.diff),
                                                    osm2pgsql_params)
 
         if args.node:
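The `cast()` is needed because argparse leaves both `args.file` and `args.diff` as `Optional[str]`; the `if args.file or args.diff:` guard guarantees a value at runtime, but mypy cannot carry that fact into the combined expression, so the code asserts the type to the checker. `cast` has no runtime cost. Reduced to its essentials (hypothetical function):

```python
# Illustrative only: typing.cast changes the static type, not the value.
from typing import Optional, cast


def pick(a: Optional[str], b: Optional[str]) -> str:
    # The caller guarantees at least one argument is set, but mypy cannot
    # prove that `a or b` is not None here; cast() asserts it for the checker.
    assert a or b
    return cast(str, a or b)


print(pick(None, 'diff.osm.gz'))  # -> diff.osm.gz
```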

nominatim/clicmd/admin.py
@@ -8,8 +8,10 @@
 Implementation of the 'admin' subcommand.
 """
 import logging
+import argparse
 
 from nominatim.tools.exec_utils import run_legacy_script
+from nominatim.clicmd.args import NominatimArgs
 
 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
@@ -23,8 +25,7 @@ class AdminFuncs:
     Analyse and maintain the database.
     """
 
-    @staticmethod
-    def add_args(parser):
+    def add_args(self, parser: argparse.ArgumentParser) -> None:
         group = parser.add_argument_group('Admin tasks')
         objs = group.add_mutually_exclusive_group(required=True)
         objs.add_argument('--warm', action='store_true',
@@ -49,10 +50,9 @@ class AdminFuncs:
         mgroup.add_argument('--place-id', type=int,
                             help='Analyse indexing of the given Nominatim object')
 
-    @staticmethod
-    def run(args):
+    def run(self, args: NominatimArgs) -> int:
         if args.warm:
-            return AdminFuncs._warm(args)
+            return self._warm(args)
 
         if args.check_database:
             LOG.warning('Checking database')
@@ -73,8 +73,7 @@ class AdminFuncs:
             return 1
 
 
-    @staticmethod
-    def _warm(args):
+    def _warm(self, args: NominatimArgs) -> int:
         LOG.warning('Warming database caches')
         params = ['warm.php']
         if args.target == 'reverse':

nominatim/clicmd/api.py
@@ -7,10 +7,13 @@
 """
 Subcommand definitions for API calls from the command line.
 """
+from typing import Mapping, Dict
+import argparse
 import logging
 
 from nominatim.tools.exec_utils import run_api_script
 from nominatim.errors import UsageError
+from nominatim.clicmd.args import NominatimArgs
 
 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
@@ -42,7 +45,7 @@ DETAILS_SWITCHES = (
     ('polygon_geojson', 'Include geometry of result')
 )
 
-def _add_api_output_arguments(parser):
+def _add_api_output_arguments(parser: argparse.ArgumentParser) -> None:
     group = parser.add_argument_group('Output arguments')
     group.add_argument('--format', default='jsonv2',
                        choices=['xml', 'json', 'jsonv2', 'geojson', 'geocodejson'],
@@ -60,7 +63,7 @@ def _add_api_output_arguments(parser):
                              "Parameter is difference tolerance in degrees."))
 
 
-def _run_api(endpoint, args, params):
+def _run_api(endpoint: str, args: NominatimArgs, params: Mapping[str, object]) -> int:
     script_file = args.project_dir / 'website' / (endpoint + '.php')
 
     if not script_file.exists():
@@ -82,8 +85,7 @@ class APISearch:
     https://nominatim.org/release-docs/latest/api/Search/
     """
 
-    @staticmethod
-    def add_args(parser):
+    def add_args(self, parser: argparse.ArgumentParser) -> None:
         group = parser.add_argument_group('Query arguments')
         group.add_argument('--query',
                            help='Free-form query string')
@@ -109,8 +111,8 @@ class APISearch:
                            help='Do not remove duplicates from the result list')
 
 
-    @staticmethod
-    def run(args):
+    def run(self, args: NominatimArgs) -> int:
+        params: Dict[str, object]
         if args.query:
             params = dict(q=args.query)
         else:
@@ -145,8 +147,7 @@ class APIReverse:
     https://nominatim.org/release-docs/latest/api/Reverse/
     """
 
-    @staticmethod
-    def add_args(parser):
+    def add_args(self, parser: argparse.ArgumentParser) -> None:
         group = parser.add_argument_group('Query arguments')
         group.add_argument('--lat', type=float, required=True,
                            help='Latitude of coordinate to look up (in WGS84)')
@@ -158,8 +159,7 @@ class APIReverse:
         _add_api_output_arguments(parser)
 
 
-    @staticmethod
-    def run(args):
+    def run(self, args: NominatimArgs) -> int:
         params = dict(lat=args.lat, lon=args.lon, format=args.format)
         if args.zoom is not None:
             params['zoom'] = args.zoom
@@ -187,8 +187,7 @@ class APILookup:
     https://nominatim.org/release-docs/latest/api/Lookup/
     """
 
-    @staticmethod
-    def add_args(parser):
+    def add_args(self, parser: argparse.ArgumentParser) -> None:
         group = parser.add_argument_group('Query arguments')
         group.add_argument('--id', metavar='OSMID',
                            action='append', required=True, dest='ids',
@@ -197,9 +196,8 @@ class APILookup:
         _add_api_output_arguments(parser)
 
 
-    @staticmethod
-    def run(args):
-        params = dict(osm_ids=','.join(args.ids), format=args.format)
+    def run(self, args: NominatimArgs) -> int:
+        params: Dict[str, object] = dict(osm_ids=','.join(args.ids), format=args.format)
 
         for param, _ in EXTRADATA_PARAMS:
             if getattr(args, param):
@@ -224,8 +222,7 @@ class APIDetails:
     https://nominatim.org/release-docs/latest/api/Details/
     """
 
-    @staticmethod
-    def add_args(parser):
+    def add_args(self, parser: argparse.ArgumentParser) -> None:
         group = parser.add_argument_group('Query arguments')
         objs = group.add_mutually_exclusive_group(required=True)
         objs.add_argument('--node', '-n', type=int,
@@ -246,8 +243,8 @@ class APIDetails:
         group.add_argument('--lang', '--accept-language', metavar='LANGS',
                            help='Preferred language order for presenting search results')
 
-    @staticmethod
-    def run(args):
+
+    def run(self, args: NominatimArgs) -> int:
         if args.node:
             params = dict(osmtype='N', osmid=args.node)
         elif args.way:
@@ -276,12 +273,11 @@ class APIStatus:
     https://nominatim.org/release-docs/latest/api/Status/
     """
 
-    @staticmethod
-    def add_args(parser):
+    def add_args(self, parser: argparse.ArgumentParser) -> None:
         group = parser.add_argument_group('API parameters')
         group.add_argument('--format', default='text', choices=['text', 'json'],
                            help='Format of result')
 
-    @staticmethod
-    def run(args):
+
+    def run(self, args: NominatimArgs) -> int:
         return _run_api('status', args, dict(format=args.format))

nominatim/clicmd/args.py
@@ -7,19 +7,174 @@
 """
 Provides custom functions over command-line arguments.
 """
+from typing import Optional, List, Dict, Any, Sequence, Tuple
+import argparse
 import logging
 from pathlib import Path
 
 from nominatim.errors import UsageError
+from nominatim.config import Configuration
+from nominatim.typing import Protocol
 
 LOG = logging.getLogger()
 
+
+class Subcommand(Protocol):
+    """
+    Interface to be implemented by classes implementing a CLI subcommand.
+    """
+
+    def add_args(self, parser: argparse.ArgumentParser) -> None:
+        """
+        Fill the given parser for the subcommand with the appropriate
+        parameters.
+        """
+
+    def run(self, args: 'NominatimArgs') -> int:
+        """
+        Run the subcommand with the given parsed arguments.
+        """
+
+
 class NominatimArgs:
     """ Customized namespace class for the nominatim command line tool
         to receive the command-line arguments.
     """
-
-    def osm2pgsql_options(self, default_cache, default_threads):
+    # Basic environment set by root program.
+    config: Configuration
+    project_dir: Path
+    module_dir: Path
+    osm2pgsql_path: Path
+    phplib_dir: Path
+    sqllib_dir: Path
+    data_dir: Path
+    config_dir: Path
+    phpcgi_path: Path
+
+    # Global switches
+    version: bool
+    subcommand: Optional[str]
+    command: Subcommand
+
+    # Shared parameters
+    osm2pgsql_cache: Optional[int]
+    socket_timeout: int
+
+    # Arguments added to all subcommands.
+    verbose: int
+    threads: Optional[int]
+
+    # Arguments to 'add-data'
+    file: Optional[str]
+    diff: Optional[str]
+    node: Optional[int]
+    way: Optional[int]
+    relation: Optional[int]
+    tiger_data: Optional[str]
+    use_main_api: bool
+
+    # Arguments to 'admin'
+    warm: bool
+    check_database: bool
+    migrate: bool
+    analyse_indexing: bool
+    target: Optional[str]
+    osm_id: Optional[str]
+    place_id: Optional[int]
+
+    # Arguments to 'import'
+    osm_file: List[str]
+    continue_at: Optional[str]
+    reverse_only: bool
+    no_partitions: bool
+    no_updates: bool
+    offline: bool
+    ignore_errors: bool
+    index_noanalyse: bool
+
+    # Arguments to 'index'
+    boundaries_only: bool
+    no_boundaries: bool
+    minrank: int
+    maxrank: int
+
+    # Arguments to 'export'
+    output_type: str
+    output_format: str
+    output_all_postcodes: bool
+    language: Optional[str]
+    restrict_to_country: Optional[str]
+    restrict_to_osm_node: Optional[int]
+    restrict_to_osm_way: Optional[int]
+    restrict_to_osm_relation: Optional[int]
+
+    # Arguments to 'refresh'
+    postcodes: bool
+    word_tokens: bool
+    word_counts: bool
+    address_levels: bool
+    functions: bool
+    wiki_data: bool
+    importance: bool
+    website: bool
+    diffs: bool
+    enable_debug_statements: bool
+    data_object: Sequence[Tuple[str, int]]
+    data_area: Sequence[Tuple[str, int]]
+
+    # Arguments to 'replication'
+    init: bool
+    update_functions: bool
+    check_for_updates: bool
+    once: bool
+    catch_up: bool
+    do_index: bool
+
+    # Arguments to 'serve'
+    server: str
+
+    # Arguments to 'special-phrases
+    import_from_wiki: bool
+    import_from_csv: Optional[str]
+    no_replace: bool
+
+    # Arguments to all query functions
+    format: str
+    addressdetails: bool
+    extratags: bool
+    namedetails: bool
+    lang: Optional[str]
+    polygon_output: Optional[str]
+    polygon_threshold: Optional[float]
+
+    # Arguments to 'search'
+    query: Optional[str]
+    street: Optional[str]
+    city: Optional[str]
+    county: Optional[str]
+    state: Optional[str]
+    country: Optional[str]
+    postalcode: Optional[str]
+    countrycodes: Optional[str]
+    exclude_place_ids: Optional[str]
+    limit: Optional[int]
+    viewbox: Optional[str]
+    bounded: bool
+    dedupe: bool
+
+    # Arguments to 'reverse'
+    lat: float
+    lon: float
+    zoom: Optional[int]
+
+    # Arguments to 'lookup'
+    ids: Sequence[str]
+
+    # Arguments to 'details'
+    object_class: Optional[str]
+
+
+    def osm2pgsql_options(self, default_cache: int,
+                          default_threads: int) -> Dict[str, Any]:
         """ Return the standard osm2pgsql options that can be derived
             from the command line arguments. The resulting dict can be
             further customized and then used in `run_osm2pgsql()`.
@@ -29,7 +184,7 @@ class NominatimArgs:
                     osm2pgsql_style=self.config.get_import_style_file(),
                     threads=self.threads or default_threads,
                     dsn=self.config.get_libpq_dsn(),
-                    flatnode_file=str(self.config.get_path('FLATNODE_FILE')),
+                    flatnode_file=str(self.config.get_path('FLATNODE_FILE') or ''),
                     tablespaces=dict(slim_data=self.config.TABLESPACE_OSM_DATA,
                                      slim_index=self.config.TABLESPACE_OSM_INDEX,
                                      main_data=self.config.TABLESPACE_PLACE_DATA,
@@ -38,7 +193,7 @@ class NominatimArgs:
                    )
 
 
-    def get_osm_file_list(self):
+    def get_osm_file_list(self) -> Optional[List[Path]]:
         """ Return the --osm-file argument as a list of Paths or None
             if no argument was given. The function also checks if the files
             exist and raises a UsageError if one cannot be found.
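The `Subcommand` protocol is the heart of the typing change: subcommands are checked structurally, so any object whose `add_args()` and `run()` match the signatures qualifies, without a common base class. Because protocol members are instance methods, the patch drops `@staticmethod` throughout, and `get_set_parser()` now registers instances such as `clicmd.SetupAll()` instead of bare classes. Likewise, the attribute declarations on `NominatimArgs` let mypy check every `args.<name>` access. A self-contained sketch of the same pattern, with hypothetical names (`typing.Protocol` is standard since Python 3.8; the patch imports it through a `nominatim.typing` compatibility shim):

```python
# Minimal sketch of the pattern used above; names are hypothetical.
import argparse
from typing import Optional, Protocol


class Command(Protocol):
    """Structural interface: any object with these methods qualifies."""
    def add_args(self, parser: argparse.ArgumentParser) -> None: ...
    def run(self, args: 'Args') -> int: ...


class Args(argparse.Namespace):
    """Typed namespace: declared attributes let mypy check args.<name>."""
    name: str
    repeat: Optional[int]
    command: Command


class Greet:
    """Satisfies Command without inheriting from it."""
    def add_args(self, parser: argparse.ArgumentParser) -> None:
        parser.add_argument('--name', required=True)
        parser.add_argument('--repeat', type=int)

    def run(self, args: Args) -> int:
        for _ in range(args.repeat or 1):
            print(f'Hello, {args.name}!')
        return 0


parser = argparse.ArgumentParser()
cmd: Command = Greet()          # accepted structurally, like add_subcommand()
cmd.add_args(parser)
parser.set_defaults(command=cmd)
args = parser.parse_args(['--name', 'Nominatim'], namespace=Args())
raise SystemExit(args.command.run(args))
```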

nominatim/clicmd/freeze.py
@@ -7,8 +7,10 @@
 """
 Implementation of the 'freeze' subcommand.
 """
+import argparse
+
 from nominatim.db.connection import connect
+from nominatim.clicmd.args import NominatimArgs
 
 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
@@ -27,16 +29,15 @@ class SetupFreeze:
     This command has the same effect as the `--no-updates` option for imports.
     """
 
-    @staticmethod
-    def add_args(parser):
+    def add_args(self, parser: argparse.ArgumentParser) -> None:
         pass # No options
 
-    @staticmethod
-    def run(args):
+
+    def run(self, args: NominatimArgs) -> int:
         from ..tools import freeze
 
         with connect(args.config.get_libpq_dsn()) as conn:
             freeze.drop_update_tables(conn)
-        freeze.drop_flatnode_file(str(args.config.get_path('FLATNODE_FILE')))
+        freeze.drop_flatnode_file(args.config.get_path('FLATNODE_FILE'))
 
         return 0

nominatim/clicmd/index.py
@@ -7,10 +7,13 @@
 """
 Implementation of the 'index' subcommand.
 """
+import argparse
+
 import psutil
 
 from nominatim.db import status
 from nominatim.db.connection import connect
+from nominatim.clicmd.args import NominatimArgs
 
 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
@@ -28,8 +31,7 @@ class UpdateIndex:
     of indexing. For other cases, this function allows to run indexing manually.
     """
 
-    @staticmethod
-    def add_args(parser):
+    def add_args(self, parser: argparse.ArgumentParser) -> None:
         group = parser.add_argument_group('Filter arguments')
         group.add_argument('--boundaries-only', action='store_true',
                            help="""Index only administrative boundaries.""")
@@ -40,8 +42,8 @@ class UpdateIndex:
         group.add_argument('--maxrank', '-R', type=int, metavar='RANK', default=30,
                            help='Maximum/finishing rank')
 
-    @staticmethod
-    def run(args):
+
+    def run(self, args: NominatimArgs) -> int:
         from ..indexer.indexer import Indexer
         from ..tokenizer import factory as tokenizer_factory
 

nominatim/clicmd/refresh.py
@@ -7,11 +7,15 @@
 """
 Implementation of 'refresh' subcommand.
 """
-from argparse import ArgumentTypeError
+from typing import Tuple, Optional
+import argparse
 import logging
 from pathlib import Path
 
+from nominatim.config import Configuration
 from nominatim.db.connection import connect
+from nominatim.tokenizer.base import AbstractTokenizer
+from nominatim.clicmd.args import NominatimArgs
 
 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
@@ -20,12 +24,12 @@ from nominatim.db.connection import connect
 
 LOG = logging.getLogger()
 
-def _parse_osm_object(obj):
+def _parse_osm_object(obj: str) -> Tuple[str, int]:
     """ Parse the given argument into a tuple of OSM type and ID.
         Raises an ArgumentError if the format is not recognized.
     """
     if len(obj) < 2 or obj[0].lower() not in 'nrw' or not obj[1:].isdigit():
-        raise ArgumentTypeError("Cannot parse OSM ID. Expect format: [N|W|R]<id>.")
+        raise argparse.ArgumentTypeError("Cannot parse OSM ID. Expect format: [N|W|R]<id>.")
 
     return (obj[0].upper(), int(obj[1:]))
 
@@ -42,11 +46,10 @@ class UpdateRefresh:
     Warning: the 'update' command must not be run in parallel with other update
     commands like 'replication' or 'add-data'.
     """
-    def __init__(self):
-        self.tokenizer = None
+    def __init__(self) -> None:
+        self.tokenizer: Optional[AbstractTokenizer] = None
 
-    @staticmethod
-    def add_args(parser):
+    def add_args(self, parser: argparse.ArgumentParser) -> None:
         group = parser.add_argument_group('Data arguments')
         group.add_argument('--postcodes', action='store_true',
                            help='Update postcode centroid table')
@@ -80,7 +83,7 @@ class UpdateRefresh:
                            help='Enable debug warning statements in functions')
 
 
-    def run(self, args): #pylint: disable=too-many-branches
+    def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches
         from ..tools import refresh, postcodes
         from ..indexer.indexer import Indexer
 
@@ -155,7 +158,7 @@ class UpdateRefresh:
         return 0
 
 
-    def _get_tokenizer(self, config):
+    def _get_tokenizer(self, config: Configuration) -> AbstractTokenizer:
         if self.tokenizer is None:
             from ..tokenizer import factory as tokenizer_factory
 
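`_parse_osm_object` is now typed as `(str) -> Tuple[str, int]` and raises `argparse.ArgumentTypeError`; converter functions of this shape plug into `add_argument(type=...)`, where argparse turns the exception into a clean usage error. A runnable sketch of that wiring (the converter body mirrors the function above; the option name is illustrative):

```python
# Minimal sketch: using the converter as an argparse `type=` callback.
import argparse
from typing import Tuple


def parse_osm_object(obj: str) -> Tuple[str, int]:
    if len(obj) < 2 or obj[0].lower() not in 'nrw' or not obj[1:].isdigit():
        raise argparse.ArgumentTypeError("Cannot parse OSM ID. Expect format: [N|W|R]<id>.")
    return (obj[0].upper(), int(obj[1:]))


parser = argparse.ArgumentParser()
parser.add_argument('--data-object', type=parse_osm_object, action='append')
print(parser.parse_args(['--data-object', 'N123']).data_object)  # [('N', 123)]
```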

nominatim/clicmd/replication.py
@@ -7,6 +7,8 @@
 """
 Implementation of the 'replication' sub-command.
 """
+from typing import Optional
+import argparse
 import datetime as dt
 import logging
 import socket
@@ -15,6 +17,7 @@ import time
 from nominatim.db import status
 from nominatim.db.connection import connect
 from nominatim.errors import UsageError
+from nominatim.clicmd.args import NominatimArgs
 
 LOG = logging.getLogger()
 
@@ -41,8 +44,7 @@ class UpdateReplication:
     downloads and imports the next batch of updates.
     """
 
-    @staticmethod
-    def add_args(parser):
+    def add_args(self, parser: argparse.ArgumentParser) -> None:
         group = parser.add_argument_group('Arguments for initialisation')
         group.add_argument('--init', action='store_true',
                            help='Initialise the update process')
@@ -68,8 +70,8 @@ class UpdateReplication:
         group.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
                            help='Set timeout for file downloads')
 
-    @staticmethod
-    def _init_replication(args):
+
+    def _init_replication(self, args: NominatimArgs) -> int:
         from ..tools import replication, refresh
 
         LOG.warning("Initialising replication updates")
@@ -81,16 +83,17 @@ class UpdateReplication:
         return 0
 
 
-    @staticmethod
-    def _check_for_updates(args):
+    def _check_for_updates(self, args: NominatimArgs) -> int:
         from ..tools import replication
 
         with connect(args.config.get_libpq_dsn()) as conn:
             return replication.check_for_updates(conn, base_url=args.config.REPLICATION_URL)
 
-    @staticmethod
-    def _report_update(batchdate, start_import, start_index):
-        def round_time(delta):
+
+    def _report_update(self, batchdate: dt.datetime,
+                       start_import: dt.datetime,
+                       start_index: Optional[dt.datetime]) -> None:
+        def round_time(delta: dt.timedelta) -> dt.timedelta:
             return dt.timedelta(seconds=int(delta.total_seconds()))
 
         end = dt.datetime.now(dt.timezone.utc)
@@ -101,8 +104,7 @@ class UpdateReplication:
                     round_time(end - batchdate))
 
 
-    @staticmethod
-    def _compute_update_interval(args):
+    def _compute_update_interval(self, args: NominatimArgs) -> int:
         if args.catch_up:
             return 0
 
@@ -119,13 +121,13 @@ class UpdateReplication:
         return update_interval
 
 
-    @staticmethod
-    def _update(args):
+    def _update(self, args: NominatimArgs) -> None:
+        # pylint: disable=too-many-locals
         from ..tools import replication
         from ..indexer.indexer import Indexer
         from ..tokenizer import factory as tokenizer_factory
 
-        update_interval = UpdateReplication._compute_update_interval(args)
+        update_interval = self._compute_update_interval(args)
 
         params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
         params.update(base_url=args.config.REPLICATION_URL,
@@ -169,7 +171,8 @@ class UpdateReplication:
                 indexer.index_full(analyse=False)
 
             if LOG.isEnabledFor(logging.WARNING):
-                UpdateReplication._report_update(batchdate, start, index_start)
+                assert batchdate is not None
+                self._report_update(batchdate, start, index_start)
 
             if args.once or (args.catch_up and state is replication.UpdateState.NO_CHANGES):
                 break
@@ -179,15 +182,14 @@ class UpdateReplication:
                 time.sleep(recheck_interval)
 
 
-    @staticmethod
-    def run(args):
+    def run(self, args: NominatimArgs) -> int:
         socket.setdefaulttimeout(args.socket_timeout)
 
         if args.init:
-            return UpdateReplication._init_replication(args)
+            return self._init_replication(args)
 
         if args.check_for_updates:
-            return UpdateReplication._check_for_updates(args)
+            return self._check_for_updates(args)
 
-        UpdateReplication._update(args)
+        self._update(args)
         return 0

nominatim/clicmd/setup.py
@@ -7,14 +7,20 @@
 """
 Implementation of the 'import' subcommand.
 """
+from typing import Optional
+import argparse
 import logging
 from pathlib import Path
 
 import psutil
 
-from nominatim.db.connection import connect
+from nominatim.config import Configuration
+from nominatim.db.connection import connect, Connection
 from nominatim.db import status, properties
+from nominatim.tokenizer.base import AbstractTokenizer
 from nominatim.version import version_str
+from nominatim.clicmd.args import NominatimArgs
+from nominatim.errors import UsageError
 
 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
@@ -32,38 +38,36 @@ class SetupAll:
     needs superuser rights on the database.
     """
 
-    @staticmethod
-    def add_args(parser):
+    def add_args(self, parser: argparse.ArgumentParser) -> None:
         group_name = parser.add_argument_group('Required arguments')
-        group = group_name.add_mutually_exclusive_group(required=True)
-        group.add_argument('--osm-file', metavar='FILE', action='append',
+        group1 = group_name.add_mutually_exclusive_group(required=True)
+        group1.add_argument('--osm-file', metavar='FILE', action='append',
                            help='OSM file to be imported'
                                 ' (repeat for importing multiple files)')
-        group.add_argument('--continue', dest='continue_at',
+        group1.add_argument('--continue', dest='continue_at',
                            choices=['load-data', 'indexing', 'db-postprocess'],
                            help='Continue an import that was interrupted')
-        group = parser.add_argument_group('Optional arguments')
-        group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
+        group2 = parser.add_argument_group('Optional arguments')
+        group2.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
                            help='Size of cache to be used by osm2pgsql (in MB)')
-        group.add_argument('--reverse-only', action='store_true',
+        group2.add_argument('--reverse-only', action='store_true',
                            help='Do not create tables and indexes for searching')
-        group.add_argument('--no-partitions', action='store_true',
+        group2.add_argument('--no-partitions', action='store_true',
                            help=("Do not partition search indices "
                                  "(speeds up import of single country extracts)"))
-        group.add_argument('--no-updates', action='store_true',
+        group2.add_argument('--no-updates', action='store_true',
                            help="Do not keep tables that are only needed for "
                                 "updating the database later")
-        group.add_argument('--offline', action='store_true',
+        group2.add_argument('--offline', action='store_true',
                            help="Do not attempt to load any additional data from the internet")
-        group = parser.add_argument_group('Expert options')
-        group.add_argument('--ignore-errors', action='store_true',
+        group3 = parser.add_argument_group('Expert options')
+        group3.add_argument('--ignore-errors', action='store_true',
                            help='Continue import even when errors in SQL are present')
-        group.add_argument('--index-noanalyse', action='store_true',
+        group3.add_argument('--index-noanalyse', action='store_true',
                            help='Do not perform analyse operations during index (expert only)')
 
 
-    @staticmethod
-    def run(args): # pylint: disable=too-many-statements
+    def run(self, args: NominatimArgs) -> int: # pylint: disable=too-many-statements
         from ..data import country_info
         from ..tools import database_import, refresh, postcodes, freeze
         from ..indexer.indexer import Indexer
@@ -72,6 +76,8 @@ class SetupAll:
 
         if args.continue_at is None:
             files = args.get_osm_file_list()
+            if not files:
+                raise UsageError("No input files (use --osm-file).")
 
             LOG.warning('Creating database')
             database_import.setup_database_skeleton(args.config.get_libpq_dsn(),
@@ -88,7 +94,7 @@ class SetupAll:
                                             drop=args.no_updates,
                                             ignore_errors=args.ignore_errors)
 
-            SetupAll._setup_tables(args.config, args.reverse_only)
+            self._setup_tables(args.config, args.reverse_only)
 
             LOG.warning('Importing wikipedia importance data')
             data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir)
@@ -107,7 +113,7 @@ class SetupAll:
                               args.threads or psutil.cpu_count() or 1)
 
         LOG.warning("Setting up tokenizer")
-        tokenizer = SetupAll._get_tokenizer(args.continue_at, args.config)
+        tokenizer = self._get_tokenizer(args.continue_at, args.config)
 
         if args.continue_at is None or args.continue_at == 'load-data':
             LOG.warning('Calculate postcodes')
@@ -117,7 +123,7 @@ class SetupAll:
         if args.continue_at is None or args.continue_at in ('load-data', 'indexing'):
             if args.continue_at is not None and args.continue_at != 'load-data':
                 with connect(args.config.get_libpq_dsn()) as conn:
-                    SetupAll._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX)
+                    self._create_pending_index(conn, args.config.TABLESPACE_ADDRESS_INDEX)
             LOG.warning('Indexing places')
             indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                               args.threads or psutil.cpu_count() or 1)
@@ -142,13 +148,12 @@ class SetupAll:
         with connect(args.config.get_libpq_dsn()) as conn:
             refresh.setup_website(webdir, args.config, conn)
 
-        SetupAll._finalize_database(args.config.get_libpq_dsn(), args.offline)
+        self._finalize_database(args.config.get_libpq_dsn(), args.offline)
 
         return 0
 
 
-    @staticmethod
-    def _setup_tables(config, reverse_only):
+    def _setup_tables(self, config: Configuration, reverse_only: bool) -> None:
        """ Set up the basic database layout: tables, indexes and functions.
        """
        from ..tools import database_import, refresh
@@ -169,8 +174,8 @@ class SetupAll:
             refresh.create_functions(conn, config, False, False)
 
 
-    @staticmethod
-    def _get_tokenizer(continue_at, config):
+    def _get_tokenizer(self, continue_at: Optional[str],
+                       config: Configuration) -> AbstractTokenizer:
         """ Set up a new tokenizer or load an already initialised one.
         """
         from ..tokenizer import factory as tokenizer_factory
|
||||||
@@ -182,8 +187,8 @@ class SetupAll:
|
|||||||
# just load the tokenizer
|
# just load the tokenizer
|
||||||
return tokenizer_factory.get_tokenizer_for_db(config)
|
return tokenizer_factory.get_tokenizer_for_db(config)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _create_pending_index(conn, tablespace):
|
def _create_pending_index(self, conn: Connection, tablespace: str) -> None:
|
||||||
""" Add a supporting index for finding places still to be indexed.
|
""" Add a supporting index for finding places still to be indexed.
|
||||||
|
|
||||||
This index is normally created at the end of the import process
|
This index is normally created at the end of the import process
|
||||||
@@ -204,8 +209,7 @@ class SetupAll:
|
|||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
def _finalize_database(self, dsn: str, offline: bool) -> None:
|
||||||
def _finalize_database(dsn, offline):
|
|
||||||
""" Determine the database date and set the status accordingly.
|
""" Determine the database date and set the status accordingly.
|
||||||
"""
|
"""
|
||||||
with connect(dsn) as conn:
|
with connect(dsn) as conn:
|
||||||
|
|||||||
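Throughout the CLI commands the `@staticmethod` wrappers give way to plain instance methods: once every subcommand exposes the same `self`-based `add_args`/`run` signature, the whole command table can be verified against one structural type. A minimal sketch of that idea follows; the `Subcommand` protocol below is an illustration, not the interface the PR actually defines (`Protocol` needs Python 3.8+, or `typing_extensions` before that):

```python
import argparse
from typing import Protocol  # typing_extensions.Protocol on older Pythons


class Subcommand(Protocol):
    """Structural interface every CLI subcommand must satisfy."""

    def add_args(self, parser: argparse.ArgumentParser) -> None: ...

    def run(self, args: argparse.Namespace) -> int: ...


class ExampleCommand:
    """Checked against Subcommand structurally; no inheritance needed."""

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        parser.add_argument('--dry-run', action='store_true')

    def run(self, args: argparse.Namespace) -> int:
        return 0


def dispatch(cmd: Subcommand, args: argparse.Namespace) -> int:
    # any object satisfying Subcommand works here
    return cmd.run(args)


print(dispatch(ExampleCommand(), argparse.Namespace(dry_run=False)))  # -> 0
```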
@@ -7,13 +7,16 @@
 """
 Implementation of the 'special-phrases' command.
 """
+import argparse
 import logging
 from pathlib import Path

 from nominatim.errors import UsageError
 from nominatim.db.connection import connect
-from nominatim.tools.special_phrases.sp_importer import SPImporter
+from nominatim.tools.special_phrases.sp_importer import SPImporter, SpecialPhraseLoader
 from nominatim.tools.special_phrases.sp_wiki_loader import SPWikiLoader
 from nominatim.tools.special_phrases.sp_csv_loader import SPCsvLoader
+from nominatim.clicmd.args import NominatimArgs

 LOG = logging.getLogger()

@@ -49,8 +52,8 @@ class ImportSpecialPhrases:
        with custom rules into the project directory or by using the `--config`
        option to point to another configuration file.
    """
-   @staticmethod
-   def add_args(parser):
+   def add_args(self, parser: argparse.ArgumentParser) -> None:
        group = parser.add_argument_group('Input arguments')
        group.add_argument('--import-from-wiki', action='store_true',
                           help='Import special phrases from the OSM wiki to the database')
@@ -58,26 +61,24 @@ class ImportSpecialPhrases:
                           help='Import special phrases from a CSV file')
        group.add_argument('--no-replace', action='store_true',
                           help='Keep the old phrases and only add the new ones')
-       group.add_argument('--config', action='store',
-                          help='Configuration file for black/white listing '
-                               '(default: phrase-settings.json)')

-   @staticmethod
-   def run(args):
+   def run(self, args: NominatimArgs) -> int:
        if args.import_from_wiki:
-           ImportSpecialPhrases.start_import(args, SPWikiLoader(args.config))
+           self.start_import(args, SPWikiLoader(args.config))

        if args.import_from_csv:
            if not Path(args.import_from_csv).is_file():
                LOG.fatal("CSV file '%s' does not exist.", args.import_from_csv)
                raise UsageError('Cannot access file.')

-           ImportSpecialPhrases.start_import(args, SPCsvLoader(args.import_from_csv))
+           self.start_import(args, SPCsvLoader(args.import_from_csv))

        return 0

-   @staticmethod
-   def start_import(args, loader):
+   def start_import(self, args: NominatimArgs, loader: SpecialPhraseLoader) -> None:
        """
            Create the SPImporter object containing the right
            sp loader and then start the import of special phrases.
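`start_import()` now names a common `SpecialPhraseLoader` supertype so that `SPWikiLoader` and `SPCsvLoader` are interchangeable to the type checker. A miniature of how such a shared loader type works; the abstract interface and the phrase record shape below are assumptions for illustration only, the real definitions live in `nominatim.tools.special_phrases`:

```python
from abc import ABC, abstractmethod
from typing import Iterable, NamedTuple


class SpecialPhrase(NamedTuple):
    # assumed shape of a phrase record, for this example only
    p_label: str
    p_class: str
    p_type: str
    p_operator: str


class SpecialPhraseLoader(ABC):
    """Common supertype that all concrete loaders implement."""

    @abstractmethod
    def generate_phrases(self) -> Iterable[SpecialPhrase]: ...


class CsvLoader(SpecialPhraseLoader):
    def __init__(self, csv_path: str) -> None:
        self.csv_path = csv_path

    def generate_phrases(self) -> Iterable[SpecialPhrase]:
        yield SpecialPhrase('Bars', 'amenity', 'bar', '-')


def start_import(loader: SpecialPhraseLoader) -> int:
    # works identically for a wiki loader, a CSV loader, or a test stub
    return sum(1 for _ in loader.generate_phrases())


print(start_import(CsvLoader('phrases.csv')))  # -> 1
```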
@@ -7,6 +7,7 @@
 """
 Nominatim configuration accessor.
 """
+from typing import Dict, Any, List, Mapping, Optional
 import logging
 import os
 from pathlib import Path
@@ -15,12 +16,13 @@ import yaml

 from dotenv import dotenv_values

+from nominatim.typing import StrPath
 from nominatim.errors import UsageError

 LOG = logging.getLogger()
-CONFIG_CACHE = {}
+CONFIG_CACHE : Dict[str, Any] = {}

-def flatten_config_list(content, section=''):
+def flatten_config_list(content: Any, section: str = '') -> List[Any]:
    """ Flatten YAML configuration lists that contain include sections
        which are lists themselves.
    """
@@ -54,7 +56,8 @@ class Configuration:
        avoid conflicts with other environment variables.
    """

-   def __init__(self, project_dir, config_dir, environ=None):
+   def __init__(self, project_dir: Path, config_dir: Path,
+                environ: Optional[Mapping[str, str]] = None) -> None:
        self.environ = environ or os.environ
        self.project_dir = project_dir
        self.config_dir = config_dir
@@ -63,25 +66,32 @@ class Configuration:
            self._config.update(dotenv_values(str((project_dir / '.env').resolve())))

        class _LibDirs:
-           pass
+           module: Path
+           osm2pgsql: Path
+           php: Path
+           sql: Path
+           data: Path

        self.lib_dir = _LibDirs()

-   def set_libdirs(self, **kwargs):
+
+   def set_libdirs(self, **kwargs: StrPath) -> None:
        """ Set paths to library functions and data.
        """
        for key, value in kwargs.items():
            setattr(self.lib_dir, key, Path(value).resolve())

-   def __getattr__(self, name):
+
+   def __getattr__(self, name: str) -> str:
        name = 'NOMINATIM_' + name

        if name in self.environ:
            return self.environ[name]

-       return self._config[name]
+       return self._config[name] or ''

-   def get_bool(self, name):
+
+   def get_bool(self, name: str) -> bool:
        """ Return the given configuration parameter as a boolean.
            Values of '1', 'yes' and 'true' are accepted as truthy values,
            everything else is interpreted as false.
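`_LibDirs` trades its `pass` body for bare class-level annotations. Its attributes are only ever assigned through `setattr()` in `set_libdirs()`, which mypy cannot follow, so declaring them up front is what makes later accesses such as `config.lib_dir.sql` type-check. A stand-alone sketch of the pattern:

```python
from pathlib import Path


class LibDirs:
    # bare annotations: nothing is assigned here, mypy just records the types
    sql: Path
    data: Path


def set_libdirs(target: LibDirs, **kwargs: str) -> None:
    for key, value in kwargs.items():
        # setattr() on its own is invisible to the type checker ...
        setattr(target, key, Path(value).resolve())


dirs = LibDirs()
set_libdirs(dirs, sql='lib-sql', data='data')
# ... but thanks to the annotations this attribute access checks as Path
print(dirs.sql.name)  # -> 'lib-sql'
```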
@@ -89,7 +99,7 @@ class Configuration:
        return getattr(self, name).lower() in ('1', 'yes', 'true')


-   def get_int(self, name):
+   def get_int(self, name: str) -> int:
        """ Return the given configuration parameter as an int.
        """
        try:
@@ -99,7 +109,7 @@ class Configuration:
            raise UsageError("Configuration error.") from exp


-   def get_str_list(self, name):
+   def get_str_list(self, name: str) -> Optional[List[str]]:
        """ Return the given configuration parameter as a list of strings.
            The values are assumed to be given as a comma-separated list and
            will be stripped before returning them. On empty values None
@@ -110,30 +120,31 @@ class Configuration:
        return [v.strip() for v in raw.split(',')] if raw else None


-   def get_path(self, name):
+   def get_path(self, name: str) -> Optional[Path]:
        """ Return the given configuration parameter as a Path.
            If a relative path is configured, then the function converts this
            into an absolute path with the project directory as root path.
-           If the configuration is unset, a falsy value is returned.
+           If the configuration is unset, None is returned.
        """
        value = getattr(self, name)
-       if value:
-           value = Path(value)
+       if not value:
+           return None

-           if not value.is_absolute():
-               value = self.project_dir / value
+       cfgpath = Path(value)

-           value = value.resolve()
+       if not cfgpath.is_absolute():
+           cfgpath = self.project_dir / cfgpath

-       return value
+       return cfgpath.resolve()

-   def get_libpq_dsn(self):
+
+   def get_libpq_dsn(self) -> str:
        """ Get configured database DSN converted into the key/value format
            understood by libpq and psycopg.
        """
        dsn = self.DATABASE_DSN

-       def quote_param(param):
+       def quote_param(param: str) -> str:
            key, val = param.split('=')
            val = val.replace('\\', '\\\\').replace("'", "\\'")
            if ' ' in val:
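The `get_path()` rewrite is a typical mypy-driven refactor: the old code rebound `value` from `str` to `Path`, which `mypy --strict` rejects, and it returned a falsy string for the unset case. The new shape uses a fresh name for the converted value and an explicit early `return None`. The same pattern in isolation:

```python
from pathlib import Path
from typing import Optional


def get_path(value: str, project_dir: Path) -> Optional[Path]:
    if not value:
        return None           # unset configuration -> None, stated explicitly

    cfgpath = Path(value)     # new name, so `value` keeps its str type

    if not cfgpath.is_absolute():
        cfgpath = project_dir / cfgpath

    return cfgpath.resolve()


print(get_path('styles/import.style', Path('/srv/nominatim')))
print(get_path('', Path('/srv/nominatim')))  # -> None
```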
@@ -147,7 +158,7 @@ class Configuration:
        return dsn


-   def get_import_style_file(self):
+   def get_import_style_file(self) -> Path:
        """ Return the import style file as a path object. Translates the
            name of the standard styles automatically into a file in the
            config style.
@@ -160,7 +171,7 @@ class Configuration:
        return self.find_config_file('', 'IMPORT_STYLE')


-   def get_os_env(self):
+   def get_os_env(self) -> Dict[str, Optional[str]]:
        """ Return a copy of the OS environment with the Nominatim configuration
            merged in.
        """
@@ -170,7 +181,8 @@ class Configuration:
        return env


-   def load_sub_configuration(self, filename, config=None):
+   def load_sub_configuration(self, filename: StrPath,
+                              config: Optional[str] = None) -> Any:
        """ Load additional configuration from a file. `filename` is the name
            of the configuration file. The file is first searched in the
            project directory and then in the global settings directory.
@@ -207,16 +219,17 @@ class Configuration:
        return result


-   def find_config_file(self, filename, config=None):
+   def find_config_file(self, filename: StrPath,
+                        config: Optional[str] = None) -> Path:
        """ Resolve the location of a configuration file given a filename and
            an optional configuration option with the file name.
            Raises a UsageError when the file cannot be found or is not
            a regular file.
        """
        if config is not None:
-           cfg_filename = getattr(self, config)
-           if cfg_filename:
-               cfg_filename = Path(cfg_filename)
+           cfg_value = getattr(self, config)
+           if cfg_value:
+               cfg_filename = Path(cfg_value)

                if cfg_filename.is_absolute():
                    cfg_filename = cfg_filename.resolve()
@@ -240,7 +253,7 @@ class Configuration:
        raise UsageError("Config file not found.")


-   def _load_from_yaml(self, cfgfile):
+   def _load_from_yaml(self, cfgfile: Path) -> Any:
        """ Load a YAML configuration file. This installs a special handler that
            allows other YAML files to be included using the '!include' operator.
        """
@@ -249,7 +262,7 @@ class Configuration:
        return yaml.safe_load(cfgfile.read_text(encoding='utf-8'))


-   def _yaml_include_representer(self, loader, node):
+   def _yaml_include_representer(self, loader: Any, node: yaml.Node) -> Any:
        """ Handler for the '!include' operator in YAML files.

            When the filename is relative, then the file is first searched in the
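`set_libdirs()` and the file-lookup helpers take `StrPath`, imported from the new `nominatim.typing` module, which is not part of this excerpt. A plausible definition of such an alias, meaning "anything that can name a filesystem path", might look like this sketch (the exact definition is an assumption):

```python
import os
from pathlib import Path
from typing import Union

# hypothetical definition; checkers would usually prefer os.PathLike[str]
StrPath = Union[str, os.PathLike]


def to_abs(path: StrPath) -> Path:
    """Accept either a plain string or any path-like object."""
    return Path(path).resolve()


print(to_abs('phrase-settings.json'))
print(to_abs(Path('.')))
```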
@@ -7,13 +7,17 @@
 """
 Functions for importing and managing static country information.
 """
+from typing import Dict, Any, Iterable, Tuple, Optional, Container, overload
+from pathlib import Path
 import psycopg2.extras

 from nominatim.db import utils as db_utils
-from nominatim.db.connection import connect
+from nominatim.db.connection import connect, Connection
 from nominatim.errors import UsageError
+from nominatim.config import Configuration
+from nominatim.tokenizer.base import AbstractTokenizer

-def _flatten_name_list(names):
+def _flatten_name_list(names: Any) -> Dict[str, str]:
    if names is None:
        return {}

@@ -41,11 +45,11 @@ class _CountryInfo:
    """ Caches country-specific properties from the configuration file.
    """

-   def __init__(self):
-       self._info = {}
+   def __init__(self) -> None:
+       self._info: Dict[str, Dict[str, Any]] = {}


-   def load(self, config):
+   def load(self, config: Configuration) -> None:
        """ Load the country properties from the configuration files,
            if they are not loaded yet.
        """
@@ -61,12 +65,12 @@ class _CountryInfo:
            prop['names'] = _flatten_name_list(prop.get('names'))


-   def items(self):
+   def items(self) -> Iterable[Tuple[str, Dict[str, Any]]]:
        """ Return tuples of (country_code, property dict) as iterable.
        """
        return self._info.items()

-   def get(self, country_code):
+   def get(self, country_code: str) -> Dict[str, Any]:
        """ Get country information for the country with the given country code.
        """
        return self._info.get(country_code, {})
@@ -76,15 +80,22 @@ class _CountryInfo:
 _COUNTRY_INFO = _CountryInfo()


-def setup_country_config(config):
+def setup_country_config(config: Configuration) -> None:
    """ Load country properties from the configuration file.
        Needs to be called before using any other functions in this
        file.
    """
    _COUNTRY_INFO.load(config)


-def iterate(prop=None):
+@overload
+def iterate() -> Iterable[Tuple[str, Dict[str, Any]]]:
+    ...
+
+@overload
+def iterate(prop: str) -> Iterable[Tuple[str, Any]]:
+    ...
+
+def iterate(prop: Optional[str] = None) -> Iterable[Tuple[str, Dict[str, Any]]]:
    """ Iterate over country code and properties.

        When `prop` is None, all countries are returned with their complete
@@ -100,7 +111,7 @@ def iterate(prop=None):
    return ((c, p[prop]) for c, p in _COUNTRY_INFO.items() if prop in p)


-def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
+def setup_country_tables(dsn: str, sql_dir: Path, ignore_partitions: bool = False) -> None:
    """ Create and populate the tables with basic static data that provides
        the background for geocoding. Data is assumed to not yet exist.
    """
@@ -112,7 +123,7 @@ def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
        if ignore_partitions:
            partition = 0
        else:
-           partition = props.get('partition')
+           partition = props.get('partition', 0)
        lang = props['languages'][0] if len(
            props['languages']) == 1 else None

@@ -135,13 +146,14 @@ def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
        conn.commit()


-def create_country_names(conn, tokenizer, languages=None):
+def create_country_names(conn: Connection, tokenizer: AbstractTokenizer,
+                         languages: Optional[Container[str]] = None) -> None:
    """ Add default country names to search index. `languages` is a comma-
        separated list of language codes as used in OSM. If `languages` is not
        empty then only name translations for the given languages are added
        to the index.
    """
-   def _include_key(key):
+   def _include_key(key: str) -> bool:
        return ':' not in key or not languages or \
               key[key.index(':') + 1:] in languages
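The pair of `@overload` stubs gives `iterate()` two distinct signatures for the checker, full property dicts when called bare and raw property values when a property name is passed, while a single implementation serves both. The mechanics in a self-contained example:

```python
from typing import Any, Dict, Iterable, Optional, Tuple, overload

_INFO: Dict[str, Dict[str, Any]] = {'de': {'partition': 3}, 'fr': {'partition': 5}}


@overload
def iterate() -> Iterable[Tuple[str, Dict[str, Any]]]: ...

@overload
def iterate(prop: str) -> Iterable[Tuple[str, Any]]: ...

def iterate(prop: Optional[str] = None) -> Iterable[Tuple[str, Any]]:
    # one implementation backs both declared signatures
    if prop is None:
        return _INFO.items()
    return ((c, p[prop]) for c, p in _INFO.items() if prop in p)


print(list(iterate('partition')))  # -> [('de', 3), ('fr', 5)]
```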
@@ -8,18 +8,19 @@
 Wrapper around place information the indexer gets from the database and hands to
 the tokenizer.
 """
+from typing import Optional, Mapping, Any

 class PlaceInfo:
    """ Data class containing all information the tokenizer gets about a
        place it should process the names for.
    """

-   def __init__(self, info):
+   def __init__(self, info: Mapping[str, Any]) -> None:
        self._info = info


    @property
-   def name(self):
+   def name(self) -> Optional[Mapping[str, str]]:
        """ A dictionary with the names of the place or None if the place
            has no names.
        """
@@ -27,7 +28,7 @@ class PlaceInfo:


    @property
-   def address(self):
+   def address(self) -> Optional[Mapping[str, str]]:
        """ A dictionary with the address elements of the place
            or None if no address information is available.
        """
@@ -35,7 +36,7 @@ class PlaceInfo:


    @property
-   def country_code(self):
+   def country_code(self) -> Optional[str]:
        """ The country code of the country the place is in. Guaranteed
            to be a two-letter lower-case string or None, if no country
            could be found.
@@ -44,20 +45,20 @@ class PlaceInfo:


    @property
-   def rank_address(self):
+   def rank_address(self) -> int:
        """ The computed rank address before rank correction.
        """
-       return self._info.get('rank_address')
+       return self._info.get('rank_address', 0)


-   def is_a(self, key, value):
+   def is_a(self, key: str, value: str) -> bool:
        """ Check if the place's primary tag corresponds to the given
            key and value.
        """
        return self._info.get('class') == key and self._info.get('type') == value


-   def is_country(self):
+   def is_country(self) -> bool:
        """ Check if the place is a valid country boundary.
        """
        return self.rank_address == 4 \
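Passing a default to `Mapping.get()` is what lets `rank_address` promise a plain `int`: without the default the value could be missing, and the property would have to be declared `Optional[int]` like its siblings. A reduced sketch:

```python
from typing import Any, Mapping, Optional


class PlaceInfo:
    def __init__(self, info: Mapping[str, Any]) -> None:
        self._info = info

    @property
    def rank_address(self) -> int:
        return self._info.get('rank_address', 0)   # defaulted: never None

    @property
    def country_code(self) -> Optional[str]:
        return self._info.get('country_code')      # may legitimately be None


place = PlaceInfo({'rank_address': 30})
print(place.rank_address + 2)   # safe arithmetic, the value is always an int
print(place.country_code)       # -> None
```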
@@ -8,6 +8,7 @@
 Functions for formatting postcodes according to their country-specific
 format.
 """
+from typing import Any, Mapping, Optional, Set, Match
 import re

 from nominatim.errors import UsageError
@@ -17,7 +18,7 @@ class CountryPostcodeMatcher:
    """ Matches and formats a postcode according to a format definition
        of the given country.
    """
-   def __init__(self, country_code, config):
+   def __init__(self, country_code: str, config: Mapping[str, Any]) -> None:
        if 'pattern' not in config:
            raise UsageError("Field 'pattern' required for 'postcode' "
                             f"for country '{country_code}'")
@@ -30,7 +31,7 @@ class CountryPostcodeMatcher:
        self.output = config.get('output', r'\g<0>')


-   def match(self, postcode):
+   def match(self, postcode: str) -> Optional[Match[str]]:
        """ Match the given postcode against the postcode pattern for this
            matcher. Returns a `re.Match` object if the match was successful
            and None otherwise.
@@ -44,7 +45,7 @@ class CountryPostcodeMatcher:
        return None


-   def normalize(self, match):
+   def normalize(self, match: Match[str]) -> str:
        """ Return the default format of the postcode for the given match.
            `match` must be a `re.Match` object previously returned by
            `match()`
@@ -56,9 +57,9 @@ class PostcodeFormatter:
    """ Container for different postcode formats of the world and
        access functions.
    """
-   def __init__(self):
+   def __init__(self) -> None:
        # Objects without a country code can't have a postcode per definition.
-       self.country_without_postcode = {None}
+       self.country_without_postcode: Set[Optional[str]] = {None}
        self.country_matcher = {}
        self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'})

@@ -71,14 +72,14 @@ class PostcodeFormatter:
            raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")


-   def set_default_pattern(self, pattern):
+   def set_default_pattern(self, pattern: str) -> None:
        """ Set the postcode match pattern to use, when a country does not
-           have a specific pattern or is marked as country without postcode.
+           have a specific pattern.
        """
        self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})


-   def get_matcher(self, country_code):
+   def get_matcher(self, country_code: Optional[str]) -> Optional[CountryPostcodeMatcher]:
        """ Return the CountryPostcodeMatcher for the given country.
            Returns None if the country doesn't have a postcode and the
            default matcher if there is no specific matcher configured for
@@ -87,10 +88,12 @@ class PostcodeFormatter:
        if country_code in self.country_without_postcode:
            return None

+       assert country_code is not None
+
        return self.country_matcher.get(country_code, self.default_matcher)


-   def match(self, country_code, postcode):
+   def match(self, country_code: Optional[str], postcode: str) -> Optional[Match[str]]:
        """ Match the given postcode against the postcode pattern for this
            matcher. Returns a `re.Match` object if the country has a pattern
            and the match was successful or None if the match failed.
@@ -98,10 +101,12 @@ class PostcodeFormatter:
        if country_code in self.country_without_postcode:
            return None

+       assert country_code is not None
+
        return self.country_matcher.get(country_code, self.default_matcher).match(postcode)


-   def normalize(self, country_code, match):
+   def normalize(self, country_code: str, match: Match[str]) -> str:
        """ Return the default format of the postcode for the given match.
            `match` must be a `re.Match` object previously returned by
            `match()`
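The inserted `assert country_code is not None` lines exist purely for the type checker: the `in self.country_without_postcode` test (a set that contains `None`) has already ruled out `None` at runtime, but mypy cannot deduce that from set membership, so the assert narrows `Optional[str]` to `str`. The same narrowing trick stand-alone:

```python
from typing import Dict, Optional, Set

country_without_postcode: Set[Optional[str]] = {None, 'ie'}
country_matcher: Dict[str, str] = {'de': r'\d{5}'}


def get_pattern(country_code: Optional[str]) -> Optional[str]:
    if country_code in country_without_postcode:
        return None

    assert country_code is not None   # narrows Optional[str] -> str for mypy

    return country_matcher.get(country_code, '.*')


print(get_pattern(None))   # -> None
print(get_pattern('de'))   # -> \d{5}
```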
@@ -4,8 +4,9 @@
 #
 # Copyright (C) 2022 by the Nominatim developer community.
 # For a full list of authors see the git log.
-""" Database helper functions for the indexer.
+""" Non-blocking database connections.
 """
+from typing import Callable, Any, Optional, Iterator, Sequence
 import logging
 import select
 import time
@@ -21,6 +22,8 @@ try:
 except ImportError:
    __has_psycopg2_errors__ = False

+from nominatim.typing import T_cursor, Query
+
 LOG = logging.getLogger()

 class DeadlockHandler:
@@ -29,14 +32,14 @@ class DeadlockHandler:
        normally.
    """

-   def __init__(self, handler, ignore_sql_errors=False):
+   def __init__(self, handler: Callable[[], None], ignore_sql_errors: bool = False) -> None:
        self.handler = handler
        self.ignore_sql_errors = ignore_sql_errors

-   def __enter__(self):
+   def __enter__(self) -> 'DeadlockHandler':
        return self

-   def __exit__(self, exc_type, exc_value, traceback):
+   def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> bool:
        if __has_psycopg2_errors__:
            if exc_type == psycopg2.errors.DeadlockDetected: # pylint: disable=E1101
                self.handler()
@@ -57,26 +60,31 @@ class DBConnection:
    """ A single non-blocking database connection.
    """

-   def __init__(self, dsn, cursor_factory=None, ignore_sql_errors=False):
-       self.current_query = None
-       self.current_params = None
+   def __init__(self, dsn: str,
+                cursor_factory: Optional[Callable[..., T_cursor]] = None,
+                ignore_sql_errors: bool = False) -> None:
        self.dsn = dsn

+       self.current_query: Optional[Query] = None
+       self.current_params: Optional[Sequence[Any]] = None
        self.ignore_sql_errors = ignore_sql_errors

-       self.conn = None
-       self.cursor = None
+       self.conn: Optional['psycopg2.connection'] = None
+       self.cursor: Optional['psycopg2.cursor'] = None
        self.connect(cursor_factory=cursor_factory)

-   def close(self):
+   def close(self) -> None:
        """ Close all open connections. Does not wait for pending requests.
        """
        if self.conn is not None:
-           self.cursor.close()
+           if self.cursor is not None:
+               self.cursor.close() # type: ignore[no-untyped-call]
+               self.cursor = None
            self.conn.close()

        self.conn = None

-   def connect(self, cursor_factory=None):
+   def connect(self, cursor_factory: Optional[Callable[..., T_cursor]] = None) -> None:
        """ (Re)connect to the database. Creates an asynchronous connection
            with JIT and parallel processing disabled. If a connection was
            already open, it is closed and a new connection established.
@@ -89,7 +97,10 @@ class DBConnection:
        self.conn = psycopg2.connect(**{'dsn': self.dsn, 'async': True})
        self.wait()

-       self.cursor = self.conn.cursor(cursor_factory=cursor_factory)
+       if cursor_factory is not None:
+           self.cursor = self.conn.cursor(cursor_factory=cursor_factory)
+       else:
+           self.cursor = self.conn.cursor()
        # Disable JIT and parallel workers as they are known to cause problems.
        # Update pg_settings instead of using SET because it does not yield
        # errors on older versions of Postgres where the settings are not
@@ -100,11 +111,15 @@ class DBConnection:
                        WHERE name = 'max_parallel_workers_per_gather';""")
        self.wait()

-   def _deadlock_handler(self):
+   def _deadlock_handler(self) -> None:
        LOG.info("Deadlock detected (params = %s), retry.", str(self.current_params))
+       assert self.cursor is not None
+       assert self.current_query is not None
+       assert self.current_params is not None
+
        self.cursor.execute(self.current_query, self.current_params)

-   def wait(self):
+   def wait(self) -> None:
        """ Block until any pending operation is done.
        """
        while True:
@@ -113,25 +128,29 @@ class DBConnection:
                self.current_query = None
                return

-   def perform(self, sql, args=None):
+   def perform(self, sql: Query, args: Optional[Sequence[Any]] = None) -> None:
        """ Send SQL query to the server. Returns immediately without
            blocking.
        """
+       assert self.cursor is not None
        self.current_query = sql
        self.current_params = args
        self.cursor.execute(sql, args)

-   def fileno(self):
+   def fileno(self) -> int:
        """ File descriptor to wait for. (Makes this class select()able.)
        """
+       assert self.conn is not None
        return self.conn.fileno()

-   def is_done(self):
+   def is_done(self) -> bool:
        """ Check if the connection is available for a new query.

            Also checks if the previous query has run into a deadlock.
            If so, then the previous query is repeated.
        """
+       assert self.conn is not None
+
        if self.current_query is None:
            return True
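`DBConnection.conn` and `.cursor` start life as `None` and are only filled in by `connect()`, so they are declared `Optional[...]`, and every consumer narrows them first: with an `assert` in the hot paths, with an explicit `None` check in `close()`. The pattern with a stand-in resource type:

```python
from typing import Optional


class Resource:
    def fileno(self) -> int:
        return 42


class Holder:
    def __init__(self) -> None:
        self.res: Optional[Resource] = None   # not connected yet

    def connect(self) -> None:
        self.res = Resource()

    def fileno(self) -> int:
        assert self.res is not None   # mypy: Optional[Resource] -> Resource
        return self.res.fileno()

    def close(self) -> None:
        if self.res is not None:      # explicit check instead of an assert
            self.res = None


h = Holder()
h.connect()
print(h.fileno())   # -> 42
```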
@@ -150,14 +169,14 @@ class WorkerPool:
    """
    REOPEN_CONNECTIONS_AFTER = 100000

-   def __init__(self, dsn, pool_size, ignore_sql_errors=False):
+   def __init__(self, dsn: str, pool_size: int, ignore_sql_errors: bool = False) -> None:
        self.threads = [DBConnection(dsn, ignore_sql_errors=ignore_sql_errors)
                        for _ in range(pool_size)]
        self.free_workers = self._yield_free_worker()
-       self.wait_time = 0
+       self.wait_time = 0.0


-   def finish_all(self):
+   def finish_all(self) -> None:
        """ Wait for all connections to finish.
        """
        for thread in self.threads:
@@ -166,22 +185,22 @@ class WorkerPool:

        self.free_workers = self._yield_free_worker()

-   def close(self):
+   def close(self) -> None:
        """ Close all connections and clear the pool.
        """
        for thread in self.threads:
            thread.close()
        self.threads = []
-       self.free_workers = None
+       self.free_workers = iter([])


-   def next_free_worker(self):
+   def next_free_worker(self) -> DBConnection:
        """ Get the next free connection.
        """
        return next(self.free_workers)


-   def _yield_free_worker(self):
+   def _yield_free_worker(self) -> Iterator[DBConnection]:
        ready = self.threads
        command_stat = 0
        while True:
@@ -200,17 +219,17 @@ class WorkerPool:
            self.wait_time += time.time() - tstart


-   def _reconnect_threads(self):
+   def _reconnect_threads(self) -> None:
        for thread in self.threads:
            while not thread.is_done():
                thread.wait()
            thread.connect()


-   def __enter__(self):
+   def __enter__(self) -> 'WorkerPool':
        return self

-   def __exit__(self, exc_type, exc_value, traceback):
+   def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.finish_all()
        self.close()
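Two small changes keep `WorkerPool` strict-clean: `close()` now assigns `iter([])` instead of `None`, so `free_workers` stays a plain `Iterator[DBConnection]` for its whole lifetime rather than becoming `Optional`, and `__enter__`/`__exit__` gain signatures so the pool type-checks inside `with`. In miniature:

```python
from typing import Any, Iterator, List


class Pool:
    def __init__(self, size: int) -> None:
        self.workers: List[int] = list(range(size))
        self.free: Iterator[int] = iter(self.workers)

    def close(self) -> None:
        self.workers = []
        self.free = iter([])          # empty iterator, never None

    def __enter__(self) -> 'Pool':
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.close()


with Pool(2) as pool:
    print(next(pool.free))   # -> 0
```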
@@ -7,6 +7,7 @@
 """
 Specialised connection and cursor functions.
 """
+from typing import Optional, Any, Callable, ContextManager, Dict, cast, overload, Tuple, Iterable
 import contextlib
 import logging
 import os
@@ -16,25 +17,27 @@ import psycopg2.extensions
 import psycopg2.extras
 from psycopg2 import sql as pysql

+from nominatim.typing import SysEnv, Query, T_cursor
 from nominatim.errors import UsageError

 LOG = logging.getLogger()

-class _Cursor(psycopg2.extras.DictCursor):
+class Cursor(psycopg2.extras.DictCursor):
    """ A cursor returning dict-like objects and providing specialised
        execution functions.
    """

    # pylint: disable=arguments-renamed,arguments-differ
-   def execute(self, query, args=None):
+   def execute(self, query: Query, args: Any = None) -> None:
        """ Query execution that logs the SQL query when debugging is enabled.
        """
-       LOG.debug(self.mogrify(query, args).decode('utf-8'))
+       if LOG.isEnabledFor(logging.DEBUG):
+           LOG.debug(self.mogrify(query, args).decode('utf-8')) # type: ignore[no-untyped-call]

        super().execute(query, args)


-   def execute_values(self, sql, argslist, template=None):
+   def execute_values(self, sql: Query, argslist: Iterable[Tuple[Any, ...]],
+                      template: Optional[Query] = None) -> None:
        """ Wrapper for the psycopg2 convenience function to execute
            SQL for a list of values.
        """
@@ -43,7 +46,7 @@ class _Cursor(psycopg2.extras.DictCursor):
        psycopg2.extras.execute_values(self, sql, argslist, template=template)


-   def scalar(self, sql, args=None):
+   def scalar(self, sql: Query, args: Any = None) -> Any:
        """ Execute query that returns a single value. The value is returned.
            If the query yields more than one row, a ValueError is raised.
        """
@@ -52,10 +55,13 @@ class _Cursor(psycopg2.extras.DictCursor):
        if self.rowcount != 1:
            raise RuntimeError("Query did not return a single row.")

-       return self.fetchone()[0]
+       result = self.fetchone() # type: ignore[no-untyped-call]
+       assert result is not None

+       return result[0]

-   def drop_table(self, name, if_exists=True, cascade=False):
+
+   def drop_table(self, name: str, if_exists: bool = True, cascade: bool = False) -> None:
        """ Drop the table with the given name.
            Set `if_exists` to False if a non-existent table should raise
            an exception instead of just being ignored. If 'cascade' is set
@@ -71,27 +77,38 @@ class _Cursor(psycopg2.extras.DictCursor):
        self.execute(pysql.SQL(sql).format(pysql.Identifier(name)))


-class _Connection(psycopg2.extensions.connection):
+class Connection(psycopg2.extensions.connection):
    """ A connection that provides the specialised cursor by default and
        adds convenience functions for administrating the database.
    """
+   @overload # type: ignore[override]
+   def cursor(self) -> Cursor:
+       ...
+
+   @overload
+   def cursor(self, name: str) -> Cursor:
+       ...
+
+   @overload
+   def cursor(self, cursor_factory: Callable[..., T_cursor]) -> T_cursor:
+       ...

-   def cursor(self, cursor_factory=_Cursor, **kwargs):
+   def cursor(self, cursor_factory = Cursor, **kwargs): # type: ignore
        """ Return a new cursor. By default the specialised cursor is returned.
        """
        return super().cursor(cursor_factory=cursor_factory, **kwargs)


-   def table_exists(self, table):
+   def table_exists(self, table: str) -> bool:
        """ Check that a table with the given name exists in the database.
        """
        with self.cursor() as cur:
            num = cur.scalar("""SELECT count(*) FROM pg_tables
                                WHERE tablename = %s and schemaname = 'public'""", (table, ))
-           return num == 1
+           return num == 1 if isinstance(num, int) else False


-   def table_has_column(self, table, column):
+   def table_has_column(self, table: str, column: str) -> bool:
        """ Check if the table 'table' exists and has a column with name 'column'.
        """
        with self.cursor() as cur:
@@ -99,10 +116,10 @@ class _Connection(psycopg2.extensions.connection):
                                       WHERE table_name = %s
                                             and column_name = %s""",
                                    (table, column))
-           return has_column > 0
+           return has_column > 0 if isinstance(has_column, int) else False


-   def index_exists(self, index, table=None):
+   def index_exists(self, index: str, table: Optional[str] = None) -> bool:
        """ Check that an index with the given name exists in the database.
            If table is not None then the index must relate to the given
            table.
@@ -114,13 +131,15 @@ class _Connection(psycopg2.extensions.connection):
                return False

            if table is not None:
-               row = cur.fetchone()
+               row = cur.fetchone() # type: ignore[no-untyped-call]
+               if row is None or not isinstance(row[0], str):
+                   return False
                return row[0] == table

        return True


-   def drop_table(self, name, if_exists=True, cascade=False):
+   def drop_table(self, name: str, if_exists: bool = True, cascade: bool = False) -> None:
        """ Drop the table with the given name.
            Set `if_exists` to False if a non-existent table should raise
            an exception instead of just being ignored.
@@ -130,18 +149,18 @@ class _Connection(psycopg2.extensions.connection):
        self.commit()


-   def server_version_tuple(self):
+   def server_version_tuple(self) -> Tuple[int, int]:
        """ Return the server version as a tuple of (major, minor).
            Converts correctly for pre-10 and post-10 PostgreSQL versions.
        """
        version = self.server_version
        if version < 100000:
-           return (int(version / 10000), (version % 10000) / 100)
+           return (int(version / 10000), int((version % 10000) / 100))

        return (int(version / 10000), version % 10000)


-   def postgis_version_tuple(self):
+   def postgis_version_tuple(self) -> Tuple[int, int]:
        """ Return the postgis version installed in the database as a
            tuple of (major, minor). Assumes that the PostGIS extension
            has been installed already.
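`_Cursor` and `_Connection` lose their leading underscores because other modules now spell them out in annotations. The three `cursor()` overloads describe the distinct call shapes; note that, mirroring the diff, the implementation itself hides behind a `# type: ignore`, since its concrete return type cannot satisfy the generic `T_cursor` overload directly. A reduced, runnable sketch of the same construction:

```python
from typing import Callable, TypeVar, overload


class Cursor:
    pass


class FancyCursor(Cursor):
    pass


T = TypeVar('T', bound=Cursor)


class Connection:
    @overload
    def cursor(self) -> Cursor: ...

    @overload
    def cursor(self, cursor_factory: Callable[[], T]) -> T: ...

    def cursor(self, cursor_factory=Cursor):  # type: ignore
        # at runtime only this implementation exists; the stubs above
        # merely tell the checker which call shapes are legal
        return cursor_factory()


conn = Connection()
print(type(conn.cursor()).__name__)             # -> Cursor
print(type(conn.cursor(FancyCursor)).__name__)  # -> FancyCursor
```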
@@ -149,19 +168,28 @@ class _Connection(psycopg2.extensions.connection):
|
|||||||
with self.cursor() as cur:
|
with self.cursor() as cur:
|
||||||
version = cur.scalar('SELECT postgis_lib_version()')
|
version = cur.scalar('SELECT postgis_lib_version()')
|
||||||
|
|
||||||
return tuple((int(x) for x in version.split('.')[:2]))
|
version_parts = version.split('.')
|
||||||
|
if len(version_parts) < 2:
|
||||||
|
raise UsageError(f"Error fetching Postgis version. Bad format: {version}")
|
||||||
|
|
||||||
|
return (int(version_parts[0]), int(version_parts[1]))
|
||||||
|
|
||||||
def connect(dsn):
|
class ConnectionContext(ContextManager[Connection]):
|
||||||
|
""" Context manager of the connection that also provides direct access
|
||||||
|
to the underlying connection.
|
||||||
|
"""
|
||||||
|
connection: Connection
|
||||||
|
|
||||||
|
def connect(dsn: str) -> ConnectionContext:
|
||||||
""" Open a connection to the database using the specialised connection
|
""" Open a connection to the database using the specialised connection
|
||||||
factory. The returned object may be used in conjunction with 'with'.
|
factory. The returned object may be used in conjunction with 'with'.
|
||||||
When used outside a context manager, use the `connection` attribute
|
When used outside a context manager, use the `connection` attribute
|
||||||
to get the connection.
|
to get the connection.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
conn = psycopg2.connect(dsn, connection_factory=_Connection)
|
conn = psycopg2.connect(dsn, connection_factory=Connection)
|
||||||
ctxmgr = contextlib.closing(conn)
|
ctxmgr = cast(ConnectionContext, contextlib.closing(conn))
|
||||||
ctxmgr.connection = conn
|
ctxmgr.connection = cast(Connection, conn)
|
||||||
return ctxmgr
|
return ctxmgr
|
||||||
except psycopg2.OperationalError as err:
|
except psycopg2.OperationalError as err:
|
||||||
raise UsageError(f"Cannot connect to database: {err}") from err
|
raise UsageError(f"Cannot connect to database: {err}") from err
|
||||||
@@ -199,7 +227,8 @@ _PG_CONNECTION_STRINGS = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def get_pg_env(dsn, base_env=None):
|
def get_pg_env(dsn: str,
|
||||||
|
base_env: Optional[SysEnv] = None) -> Dict[str, str]:
|
||||||
""" Return a copy of `base_env` with the environment variables for
|
""" Return a copy of `base_env` with the environment variables for
|
||||||
PostgresSQL set up from the given database connection string.
|
PostgresSQL set up from the given database connection string.
|
||||||
If `base_env` is None, then the OS environment is used as a base
|
If `base_env` is None, then the OS environment is used as a base
|
||||||
@@ -207,7 +236,7 @@ def get_pg_env(dsn, base_env=None):
|
|||||||
"""
|
"""
|
||||||
env = dict(base_env if base_env is not None else os.environ)
|
env = dict(base_env if base_env is not None else os.environ)
|
||||||
|
|
||||||
for param, value in psycopg2.extensions.parse_dsn(dsn).items():
|
for param, value in psycopg2.extensions.parse_dsn(dsn).items(): # type: ignore
|
||||||
if param in _PG_CONNECTION_STRINGS:
|
if param in _PG_CONNECTION_STRINGS:
|
||||||
env[_PG_CONNECTION_STRINGS[param]] = value
|
env[_PG_CONNECTION_STRINGS[param]] = value
|
||||||
else:
|
else:
|
||||||
|
|||||||
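The `ConnectionContext` class above is never instantiated: at runtime `connect()` still returns a plain `contextlib.closing` wrapper, and the class exists purely so that mypy knows the wrapper both works in a `with` statement and carries a `connection` attribute. A minimal sketch of the same pattern, using hypothetical `Resource`/`ResourceContext` names in place of the real connection classes:

```python
import contextlib
from typing import ContextManager, cast


class Resource:
    """ Hypothetical stand-in for the specialised connection class. """
    def close(self) -> None:
        pass


class ResourceContext(ContextManager[Resource]):
    """ Static-only description of contextlib.closing(Resource): usable
        in a 'with' statement and exposing the wrapped object directly.
        Never instantiated at runtime.
    """
    connection: Resource


def open_resource() -> ResourceContext:
    res = Resource()
    # contextlib.closing() carries no attribute information, so the
    # wrapper is cast to the declared class and the extra attribute is
    # attached by hand.
    ctx = cast(ResourceContext, contextlib.closing(res))
    ctx.connection = res
    return ctx


with open_resource() as res:        # 'res' is typed as Resource
    res.close()
print(open_resource().connection)   # direct access outside 'with'
```

The cast is a pure type-level assertion; nothing is verified at runtime, which is exactly why the class needs no implementation.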
@@ -7,8 +7,11 @@
 """
 Query and access functions for the in-database property table.
 """
+from typing import Optional, cast

-def set_property(conn, name, value):
+from nominatim.db.connection import Connection

+def set_property(conn: Connection, name: str, value: str) -> None:
     """ Add or replace the propery with the given name.
     """
     with conn.cursor() as cur:
@@ -23,8 +26,9 @@ def set_property(conn, name, value):
         cur.execute(sql, (value, name))
     conn.commit()

-def get_property(conn, name):
-    """ Return the current value of the given propery or None if the property
+def get_property(conn: Connection, name: str) -> Optional[str]:
+    """ Return the current value of the given property or None if the property
         is not set.
     """
     if not conn.table_exists('nominatim_properties'):
@@ -34,4 +38,7 @@ def get_property(conn, name):
         cur.execute('SELECT value FROM nominatim_properties WHERE property = %s',
                     (name, ))

-        return cur.fetchone()[0] if cur.rowcount > 0 else None
+        if cur.rowcount == 0:
+            return None
+
+        return cast(Optional[str], cur.fetchone()[0]) # type: ignore[no-untyped-call]
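The rewritten `get_property()` above shows a pattern that recurs throughout this commit: an untyped DB-API call is narrowed with an early `None` return plus a `cast()`, so the function itself ends up with a precise `Optional` signature. A runnable sketch, with a hypothetical in-memory cursor standing in for psycopg2:

```python
from typing import Any, List, Optional, Sequence, cast


class FakeCursor:
    """ Hypothetical stand-in for an untyped DB-API cursor. """
    def __init__(self, rows: List[Sequence[Any]]) -> None:
        self._rows = rows
        self.rowcount = len(rows)

    def fetchone(self) -> Sequence[Any]:
        return self._rows[0]


def get_value(cur: FakeCursor) -> Optional[str]:
    # The early return handles the empty case explicitly; the cast then
    # pins down the element type that the cursor cannot express itself.
    if cur.rowcount == 0:
        return None
    return cast(Optional[str], cur.fetchone()[0])


print(get_value(FakeCursor([('4.1',)])))   # prints: 4.1
print(get_value(FakeCursor([])))           # prints: None
```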
@@ -7,10 +7,13 @@
 """
 Preprocessing of SQL files.
 """
+from typing import Set, Dict, Any
 import jinja2

+from nominatim.db.connection import Connection
+from nominatim.config import Configuration

-def _get_partitions(conn):
+def _get_partitions(conn: Connection) -> Set[int]:
     """ Get the set of partitions currently in use.
     """
     with conn.cursor() as cur:
@@ -22,7 +25,7 @@ def _get_partitions(conn):
     return partitions


-def _get_tables(conn):
+def _get_tables(conn: Connection) -> Set[str]:
     """ Return the set of tables currently in use.
         Only includes non-partitioned
     """
@@ -32,7 +35,7 @@ def _get_tables(conn):
     return set((row[0] for row in list(cur)))


-def _setup_tablespace_sql(config):
+def _setup_tablespace_sql(config: Configuration) -> Dict[str, str]:
     """ Returns a dict with tablespace expressions for the different tablespace
         kinds depending on whether a tablespace is configured or not.
     """
@@ -47,7 +50,7 @@ def _setup_tablespace_sql(config):
     return out


-def _setup_postgresql_features(conn):
+def _setup_postgresql_features(conn: Connection) -> Dict[str, Any]:
     """ Set up a dictionary with various optional Postgresql/Postgis features that
         depend on the database version.
     """
@@ -69,11 +72,11 @@ class SQLPreprocessor:
         and follows its syntax.
     """

-    def __init__(self, conn, config):
+    def __init__(self, conn: Connection, config: Configuration) -> None:
         self.env = jinja2.Environment(autoescape=False,
                                       loader=jinja2.FileSystemLoader(str(config.lib_dir.sql)))

-        db_info = {}
+        db_info: Dict[str, Any] = {}
         db_info['partitions'] = _get_partitions(conn)
         db_info['tables'] = _get_tables(conn)
         db_info['reverse_only'] = 'search_name' not in db_info['tables']
@@ -84,7 +87,7 @@ class SQLPreprocessor:
         self.env.globals['postgres'] = _setup_postgresql_features(conn)


-    def run_sql_file(self, conn, name, **kwargs):
+    def run_sql_file(self, conn: Connection, name: str, **kwargs: Any) -> None:
         """ Execute the given SQL file on the connection. The keyword arguments
             may supply additional parameters for preprocessing.
         """
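One small change above is easy to miss: `db_info = {}` gained an explicit `Dict[str, Any]` annotation. Under `mypy --strict` an empty dict literal gives the checker nothing to infer from, and the values stored later are of mixed types, so the annotation is required. A two-line illustration:

```python
from typing import Any, Dict

# Without the annotation mypy --strict reports "Need type annotation";
# Dict[str, Any] also keeps the later mixed-type values legal.
db_info: Dict[str, Any] = {}
db_info['partitions'] = {0, 1, 2}      # a set
db_info['reverse_only'] = True         # a bool
```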
@@ -7,17 +7,29 @@
 """
 Access and helper functions for the status and status log table.
 """
+from typing import Optional, Tuple, cast
 import datetime as dt
 import logging
 import re

+from nominatim.db.connection import Connection
 from nominatim.tools.exec_utils import get_url
 from nominatim.errors import UsageError
+from nominatim.typing import TypedDict

 LOG = logging.getLogger()
 ISODATE_FORMAT = '%Y-%m-%dT%H:%M:%S'

-def compute_database_date(conn):
+class StatusRow(TypedDict):
+    """ Dictionary of columns of the import_status table.
+    """
+    lastimportdate: dt.datetime
+    sequence_id: Optional[int]
+    indexed: Optional[bool]
+
+
+def compute_database_date(conn: Connection) -> dt.datetime:
     """ Determine the date of the database from the newest object in the
         data base.
     """
@@ -49,10 +61,12 @@ def compute_database_date(conn):
     return dt.datetime.strptime(match.group(1), ISODATE_FORMAT).replace(tzinfo=dt.timezone.utc)


-def set_status(conn, date, seq=None, indexed=True):
+def set_status(conn: Connection, date: Optional[dt.datetime],
+               seq: Optional[int] = None, indexed: bool = True) -> None:
     """ Replace the current status with the given status. If date is `None`
         then only sequence and indexed will be updated as given. Otherwise
         the whole status is replaced.
+        The change will be committed to the database.
     """
     assert date is None or date.tzinfo == dt.timezone.utc
     with conn.cursor() as cur:
@@ -67,7 +81,7 @@ def set_status(conn, date, seq=None, indexed=True):
     conn.commit()


-def get_status(conn):
+def get_status(conn: Connection) -> Tuple[Optional[dt.datetime], Optional[int], Optional[bool]]:
     """ Return the current status as a triple of (date, sequence, indexed).
         If status has not been set up yet, a triple of None is returned.
     """
@@ -76,11 +90,11 @@ def get_status(conn):
         if cur.rowcount < 1:
             return None, None, None

-        row = cur.fetchone()
+        row = cast(StatusRow, cur.fetchone()) # type: ignore[no-untyped-call]
         return row['lastimportdate'], row['sequence_id'], row['indexed']


-def set_indexed(conn, state):
+def set_indexed(conn: Connection, state: bool) -> None:
     """ Set the indexed flag in the status table to the given state.
     """
     with conn.cursor() as cur:
@@ -88,7 +102,8 @@ def set_indexed(conn, state):
     conn.commit()


-def log_status(conn, start, event, batchsize=None):
+def log_status(conn: Connection, start: dt.datetime,
+               event: str, batchsize: Optional[int] = None) -> None:
     """ Write a new status line to the `import_osmosis_log` table.
     """
     with conn.cursor() as cur:
@@ -96,3 +111,4 @@ def log_status(conn, start, event, batchsize=None):
                           (batchend, batchseq, batchsize, starttime, endtime, event)
                         SELECT lastimportdate, sequence_id, %s, %s, now(), %s FROM import_status""",
                        (batchsize, start, event))
+    conn.commit()
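`StatusRow` above uses `TypedDict` to give the columns of a result row both names and types, so `row['lastimportdate']` type-checks while a misspelt key is rejected. A self-contained sketch (on Python 3.8+ `TypedDict` comes straight from `typing`; the commit routes the import through a `nominatim.typing` shim so older interpreters can fall back to `typing_extensions`):

```python
import datetime as dt
from typing import Optional, TypedDict


class StatusRow(TypedDict):
    """ Static description of one row of a status table. """
    lastimportdate: dt.datetime
    sequence_id: Optional[int]
    indexed: Optional[bool]


def describe(row: StatusRow) -> str:
    # mypy verifies both the key names and the value types here;
    # row['lastimportdat'] would be flagged as an invalid key.
    return f"import {row['lastimportdate']} (seq {row['sequence_id']})"


print(describe({'lastimportdate': dt.datetime.now(dt.timezone.utc),
                'sequence_id': 42,
                'indexed': True}))
```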
@@ -7,17 +7,21 @@
 """
 Helper functions for handling DB accesses.
 """
+from typing import IO, Optional, Union, Any, Iterable
 import subprocess
 import logging
 import gzip
 import io
+from pathlib import Path

-from nominatim.db.connection import get_pg_env
+from nominatim.db.connection import get_pg_env, Cursor
 from nominatim.errors import UsageError

 LOG = logging.getLogger()

-def _pipe_to_proc(proc, fdesc):
+def _pipe_to_proc(proc: 'subprocess.Popen[bytes]',
+                  fdesc: Union[IO[bytes], gzip.GzipFile]) -> int:
+    assert proc.stdin is not None
     chunk = fdesc.read(2048)
     while chunk and proc.poll() is None:
         try:
@@ -28,7 +32,10 @@ def _pipe_to_proc(proc, fdesc):

     return len(chunk)

-def execute_file(dsn, fname, ignore_errors=False, pre_code=None, post_code=None):
+def execute_file(dsn: str, fname: Path,
+                 ignore_errors: bool = False,
+                 pre_code: Optional[str] = None,
+                 post_code: Optional[str] = None) -> None:
     """ Read an SQL file and run its contents against the given database
         using psql. Use `pre_code` and `post_code` to run extra commands
         before or after executing the file. The commands are run within the
@@ -42,6 +49,7 @@ def execute_file(dsn, fname, ignore_errors=False, pre_code=None, post_code=None)
         cmd.append('--quiet')

     with subprocess.Popen(cmd, env=get_pg_env(dsn), stdin=subprocess.PIPE) as proc:
+        assert proc.stdin is not None
         try:
             if not LOG.isEnabledFor(logging.INFO):
                 proc.stdin.write('set client_min_messages to WARNING;'.encode('utf-8'))
@@ -76,20 +84,20 @@ class CopyBuffer:
     """ Data collector for the copy_from command.
     """

-    def __init__(self):
+    def __init__(self) -> None:
         self.buffer = io.StringIO()


-    def __enter__(self):
+    def __enter__(self) -> 'CopyBuffer':
         return self


-    def __exit__(self, exc_type, exc_value, traceback):
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
         if self.buffer is not None:
             self.buffer.close()


-    def add(self, *data):
+    def add(self, *data: Any) -> None:
         """ Add another row of data to the copy buffer.
         """
         first = True
@@ -105,9 +113,9 @@ class CopyBuffer:
             self.buffer.write('\n')


-    def copy_out(self, cur, table, columns=None):
+    def copy_out(self, cur: Cursor, table: str, columns: Optional[Iterable[str]] = None) -> None:
         """ Copy all collected data into the given table.
         """
         if self.buffer.tell() > 0:
             self.buffer.seek(0)
-            cur.copy_from(self.buffer, table, columns=columns)
+            cur.copy_from(self.buffer, table, columns=columns) # type: ignore[no-untyped-call]
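`CopyBuffer` above also picks up annotated `__enter__`/`__exit__` methods. Annotating `__enter__` with the concrete class (as a forward reference, since the class is still being defined at that point) is what gives the `as` variable of a `with` statement a useful type. A minimal sketch:

```python
import io
from typing import Any


class CollectBuffer:
    """ Sketch of typing a hand-written context manager. """

    def __init__(self) -> None:
        self.buffer = io.StringIO()

    def __enter__(self) -> 'CollectBuffer':
        # The forward-reference return type makes 'with ... as buf' give
        # 'buf' the type CollectBuffer instead of Any.
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.buffer.close()


with CollectBuffer() as buf:
    buf.buffer.write('hello')     # attribute access is fully checked
```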
@@ -7,15 +7,18 @@
 """
 Main work horse for indexing (computing addresses) the database.
 """
+from typing import Optional, Any, cast
 import logging
 import time

 import psycopg2.extras

+from nominatim.tokenizer.base import AbstractTokenizer
 from nominatim.indexer.progress import ProgressLogger
 from nominatim.indexer import runners
 from nominatim.db.async_connection import DBConnection, WorkerPool
-from nominatim.db.connection import connect
+from nominatim.db.connection import connect, Connection, Cursor
+from nominatim.typing import DictCursorResults

 LOG = logging.getLogger()

@@ -23,10 +26,11 @@ LOG = logging.getLogger()
 class PlaceFetcher:
     """ Asynchronous connection that fetches place details for processing.
     """
-    def __init__(self, dsn, setup_conn):
-        self.wait_time = 0
-        self.current_ids = None
-        self.conn = DBConnection(dsn, cursor_factory=psycopg2.extras.DictCursor)
+    def __init__(self, dsn: str, setup_conn: Connection) -> None:
+        self.wait_time = 0.0
+        self.current_ids: Optional[DictCursorResults] = None
+        self.conn: Optional[DBConnection] = DBConnection(dsn,
+                                                cursor_factory=psycopg2.extras.DictCursor)

         with setup_conn.cursor() as cur:
             # need to fetch those manually because register_hstore cannot
@@ -37,7 +41,7 @@ class PlaceFetcher:
         psycopg2.extras.register_hstore(self.conn.conn, oid=hstore_oid,
                                         array_oid=hstore_array_oid)

-    def close(self):
+    def close(self) -> None:
         """ Close the underlying asynchronous connection.
         """
         if self.conn:
@@ -45,44 +49,46 @@ class PlaceFetcher:
             self.conn = None


-    def fetch_next_batch(self, cur, runner):
+    def fetch_next_batch(self, cur: Cursor, runner: runners.Runner) -> bool:
         """ Send a request for the next batch of places.
             If details for the places are required, they will be fetched
             asynchronously.

            Returns true if there is still data available.
        """
-        ids = cur.fetchmany(100)
+        ids = cast(Optional[DictCursorResults], cur.fetchmany(100))

         if not ids:
             self.current_ids = None
             return False

-        if hasattr(runner, 'get_place_details'):
-            runner.get_place_details(self.conn, ids)
-            self.current_ids = []
-        else:
-            self.current_ids = ids
+        assert self.conn is not None
+        self.current_ids = runner.get_place_details(self.conn, ids)

         return True

-    def get_batch(self):
+    def get_batch(self) -> DictCursorResults:
         """ Get the next batch of data, previously requested with
             `fetch_next_batch`.
         """
+        assert self.conn is not None
+        assert self.conn.cursor is not None

         if self.current_ids is not None and not self.current_ids:
             tstart = time.time()
             self.conn.wait()
             self.wait_time += time.time() - tstart
-            self.current_ids = self.conn.cursor.fetchall()
+            self.current_ids = cast(Optional[DictCursorResults],
+                                    self.conn.cursor.fetchall())

-        return self.current_ids
+        return self.current_ids if self.current_ids is not None else []

-    def __enter__(self):
+    def __enter__(self) -> 'PlaceFetcher':
         return self


-    def __exit__(self, exc_type, exc_value, traceback):
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        assert self.conn is not None
         self.conn.wait()
         self.close()

@@ -91,13 +97,13 @@ class Indexer:
     """ Main indexing routine.
     """

-    def __init__(self, dsn, tokenizer, num_threads):
+    def __init__(self, dsn: str, tokenizer: AbstractTokenizer, num_threads: int):
         self.dsn = dsn
         self.tokenizer = tokenizer
         self.num_threads = num_threads


-    def has_pending(self):
+    def has_pending(self) -> bool:
         """ Check if any data still needs indexing.
             This function must only be used after the import has finished.
             Otherwise it will be very expensive.
@@ -108,7 +114,7 @@ class Indexer:
         return cur.rowcount > 0


-    def index_full(self, analyse=True):
+    def index_full(self, analyse: bool = True) -> None:
         """ Index the complete database. This will first index boundaries
             followed by all other objects. When `analyse` is True, then the
             database will be analysed at the appropriate places to
@@ -117,7 +123,7 @@ class Indexer:
         with connect(self.dsn) as conn:
             conn.autocommit = True

-            def _analyze():
+            def _analyze() -> None:
                 if analyse:
                     with conn.cursor() as cur:
                         cur.execute('ANALYZE')
@@ -138,7 +144,7 @@ class Indexer:
                 _analyze()


-    def index_boundaries(self, minrank, maxrank):
+    def index_boundaries(self, minrank: int, maxrank: int) -> None:
         """ Index only administrative boundaries within the given rank range.
         """
         LOG.warning("Starting indexing boundaries using %s threads",
@@ -148,7 +154,7 @@ class Indexer:
             for rank in range(max(minrank, 4), min(maxrank, 26)):
                 self._index(runners.BoundaryRunner(rank, analyzer))

-    def index_by_rank(self, minrank, maxrank):
+    def index_by_rank(self, minrank: int, maxrank: int) -> None:
         """ Index all entries of placex in the given rank range (inclusive)
             in order of their address rank.

@@ -168,7 +174,7 @@ class Indexer:
             self._index(runners.InterpolationRunner(analyzer), 20)


-    def index_postcodes(self):
+    def index_postcodes(self) -> None:
         """Index the entries ofthe location_postcode table.
         """
         LOG.warning("Starting indexing postcodes using %s threads", self.num_threads)
@@ -176,7 +182,7 @@ class Indexer:
         self._index(runners.PostcodeRunner(), 20)


-    def update_status_table(self):
+    def update_status_table(self) -> None:
         """ Update the status in the status table to 'indexed'.
         """
         with connect(self.dsn) as conn:
@@ -185,7 +191,7 @@ class Indexer:

             conn.commit()

-    def _index(self, runner, batch=1):
+    def _index(self, runner: runners.Runner, batch: int = 1) -> None:
         """ Index a single rank or table. `runner` describes the SQL to use
             for indexing. `batch` describes the number of objects that
             should be processed with a single SQL statement
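`PlaceFetcher` above illustrates the standard treatment of attributes that become `None` after `close()`: the attribute is declared `Optional[DBConnection]`, and every method that relies on it being set starts with `assert self.conn is not None`. mypy narrows the type for the rest of the method, and the invariant is also checked at runtime. A condensed sketch:

```python
from typing import Optional


class Worker:
    def ping(self) -> str:
        return 'pong'


class Fetcher:
    """ Sketch of the assert-based narrowing used for closable members. """

    def __init__(self) -> None:
        # Optional, because close() resets the attribute to None.
        self.conn: Optional[Worker] = Worker()

    def close(self) -> None:
        self.conn = None

    def use(self) -> str:
        # Under --strict, calling a method on Optional[Worker] is an
        # error; the assert narrows self.conn to Worker below.
        assert self.conn is not None
        return self.conn.ping()


print(Fetcher().use())   # prints: pong
```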
@@ -22,7 +22,7 @@ class ProgressLogger:
         should be reported.
     """

-    def __init__(self, name, total, log_interval=1):
+    def __init__(self, name: str, total: int, log_interval: int = 1) -> None:
         self.name = name
         self.total_places = total
         self.done_places = 0
@@ -30,7 +30,7 @@ class ProgressLogger:
         self.log_interval = log_interval
         self.next_info = INITIAL_PROGRESS if LOG.isEnabledFor(logging.WARNING) else total + 1

-    def add(self, num=1):
+    def add(self, num: int = 1) -> None:
         """ Mark `num` places as processed. Print a log message if the
             logging is at least info and the log interval has passed.
         """
@@ -55,14 +55,14 @@ class ProgressLogger:

         self.next_info += int(places_per_sec) * self.log_interval

-    def done(self):
+    def done(self) -> None:
         """ Print final statistics about the progress.
         """
         rank_end_time = datetime.now()

         if rank_end_time == self.rank_start_time:
-            diff_seconds = 0
-            places_per_sec = self.done_places
+            diff_seconds = 0.0
+            places_per_sec = float(self.done_places)
         else:
             diff_seconds = (rank_end_time - self.rank_start_time).total_seconds()
             places_per_sec = self.done_places / diff_seconds
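The `0` → `0.0` and `float(...)` changes above are not cosmetic: mypy fixes a variable's type from its first assignment, so a value initialised with an `int` literal cannot later absorb `float` arithmetic under `--strict`. A minimal sketch:

```python
class Timer:
    """ Sketch: initialise with a float literal so that later float
        arithmetic type-checks.
    """

    def __init__(self) -> None:
        self.elapsed = 0.0       # with plain 0, the attribute would be int

    def add(self, seconds: float) -> None:
        # Rejected by mypy if self.elapsed were typed as int.
        self.elapsed += seconds
```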
@@ -8,35 +8,48 @@
 Mix-ins that provide the actual commands for the indexer for various indexing
 tasks.
 """
+from typing import Any, List
 import functools

 from psycopg2 import sql as pysql
 import psycopg2.extras

 from nominatim.data.place_info import PlaceInfo
+from nominatim.tokenizer.base import AbstractAnalyzer
+from nominatim.db.async_connection import DBConnection
+from nominatim.typing import Query, DictCursorResult, DictCursorResults, Protocol

 # pylint: disable=C0111

-def _mk_valuelist(template, num):
+def _mk_valuelist(template: str, num: int) -> pysql.Composed:
     return pysql.SQL(',').join([pysql.SQL(template)] * num)

-def _analyze_place(place, analyzer):
+def _analyze_place(place: DictCursorResult, analyzer: AbstractAnalyzer) -> psycopg2.extras.Json:
     return psycopg2.extras.Json(analyzer.process_place(PlaceInfo(place)))


+class Runner(Protocol):
+    def name(self) -> str: ...
+    def sql_count_objects(self) -> Query: ...
+    def sql_get_objects(self) -> Query: ...
+    def get_place_details(self, worker: DBConnection,
+                          ids: DictCursorResults) -> DictCursorResults: ...
+    def index_places(self, worker: DBConnection, places: DictCursorResults) -> None: ...


 class AbstractPlacexRunner:
     """ Returns SQL commands for indexing of the placex table.
     """
     SELECT_SQL = pysql.SQL('SELECT place_id FROM placex ')
     UPDATE_LINE = "(%s, %s::hstore, %s::hstore, %s::int, %s::jsonb)"

-    def __init__(self, rank, analyzer):
+    def __init__(self, rank: int, analyzer: AbstractAnalyzer) -> None:
         self.rank = rank
         self.analyzer = analyzer


-    @staticmethod
     @functools.lru_cache(maxsize=1)
-    def _index_sql(num_places):
+    def _index_sql(self, num_places: int) -> pysql.Composed:
         return pysql.SQL(
             """ UPDATE placex
                 SET indexed_status = 0, address = v.addr, token_info = v.ti,
@@ -46,16 +59,17 @@ class AbstractPlacexRunner:
             """).format(_mk_valuelist(AbstractPlacexRunner.UPDATE_LINE, num_places))


-    @staticmethod
-    def get_place_details(worker, ids):
+    def get_place_details(self, worker: DBConnection, ids: DictCursorResults) -> DictCursorResults:
         worker.perform("""SELECT place_id, extra.*
                           FROM placex, LATERAL placex_indexing_prepare(placex) as extra
                           WHERE place_id IN %s""",
                        (tuple((p[0] for p in ids)), ))

+        return []

-    def index_places(self, worker, places):
-        values = []
+    def index_places(self, worker: DBConnection, places: DictCursorResults) -> None:
+        values: List[Any] = []
         for place in places:
             for field in ('place_id', 'name', 'address', 'linked_place_id'):
                 values.append(place[field])
@@ -68,15 +82,15 @@ class RankRunner(AbstractPlacexRunner):
     """ Returns SQL commands for indexing one rank within the placex table.
     """

-    def name(self):
+    def name(self) -> str:
         return f"rank {self.rank}"

-    def sql_count_objects(self):
+    def sql_count_objects(self) -> pysql.Composed:
         return pysql.SQL("""SELECT count(*) FROM placex
                             WHERE rank_address = {} and indexed_status > 0
                          """).format(pysql.Literal(self.rank))

-    def sql_get_objects(self):
+    def sql_get_objects(self) -> pysql.Composed:
         return self.SELECT_SQL + pysql.SQL(
             """WHERE indexed_status > 0 and rank_address = {}
                ORDER BY geometry_sector
@@ -88,17 +102,17 @@ class BoundaryRunner(AbstractPlacexRunner):
         of a certain rank.
     """

-    def name(self):
+    def name(self) -> str:
         return f"boundaries rank {self.rank}"

-    def sql_count_objects(self):
+    def sql_count_objects(self) -> pysql.Composed:
         return pysql.SQL("""SELECT count(*) FROM placex
                             WHERE indexed_status > 0
                               AND rank_search = {}
                               AND class = 'boundary' and type = 'administrative'
                          """).format(pysql.Literal(self.rank))

-    def sql_get_objects(self):
+    def sql_get_objects(self) -> pysql.Composed:
         return self.SELECT_SQL + pysql.SQL(
             """WHERE indexed_status > 0 and rank_search = {}
                      and class = 'boundary' and type = 'administrative'
@@ -111,37 +125,33 @@ class InterpolationRunner:
         location_property_osmline.
     """

-    def __init__(self, analyzer):
+    def __init__(self, analyzer: AbstractAnalyzer) -> None:
         self.analyzer = analyzer


-    @staticmethod
-    def name():
+    def name(self) -> str:
         return "interpolation lines (location_property_osmline)"

-    @staticmethod
-    def sql_count_objects():
+    def sql_count_objects(self) -> str:
         return """SELECT count(*) FROM location_property_osmline
                   WHERE indexed_status > 0"""

-    @staticmethod
-    def sql_get_objects():
+    def sql_get_objects(self) -> str:
         return """SELECT place_id
                   FROM location_property_osmline
                   WHERE indexed_status > 0
                   ORDER BY geometry_sector"""


-    @staticmethod
-    def get_place_details(worker, ids):
+    def get_place_details(self, worker: DBConnection, ids: DictCursorResults) -> DictCursorResults:
         worker.perform("""SELECT place_id, get_interpolation_address(address, osm_id) as address
                           FROM location_property_osmline WHERE place_id IN %s""",
                        (tuple((p[0] for p in ids)), ))
+        return []


-    @staticmethod
     @functools.lru_cache(maxsize=1)
-    def _index_sql(num_places):
+    def _index_sql(self, num_places: int) -> pysql.Composed:
         return pysql.SQL("""UPDATE location_property_osmline
                             SET indexed_status = 0, address = v.addr, token_info = v.ti
                             FROM (VALUES {}) as v(id, addr, ti)
@@ -149,8 +159,8 @@ class InterpolationRunner:
                          """).format(_mk_valuelist("(%s, %s::hstore, %s::jsonb)", num_places))


-    def index_places(self, worker, places):
-        values = []
+    def index_places(self, worker: DBConnection, places: DictCursorResults) -> None:
+        values: List[Any] = []
         for place in places:
             values.extend((place[x] for x in ('place_id', 'address')))
             values.append(_analyze_place(place, self.analyzer))
@@ -159,26 +169,28 @@ class InterpolationRunner:



-class PostcodeRunner:
+class PostcodeRunner(Runner):
     """ Provides the SQL commands for indexing the location_postcode table.
     """

-    @staticmethod
-    def name():
+    def name(self) -> str:
         return "postcodes (location_postcode)"

-    @staticmethod
-    def sql_count_objects():
+    def sql_count_objects(self) -> str:
         return 'SELECT count(*) FROM location_postcode WHERE indexed_status > 0'

-    @staticmethod
-    def sql_get_objects():
+    def sql_get_objects(self) -> str:
         return """SELECT place_id FROM location_postcode
                   WHERE indexed_status > 0
                   ORDER BY country_code, postcode"""

-    @staticmethod
-    def index_places(worker, ids):
+    def get_place_details(self, worker: DBConnection, ids: DictCursorResults) -> DictCursorResults:
+        return ids

+    def index_places(self, worker: DBConnection, places: DictCursorResults) -> None:
         worker.perform(pysql.SQL("""UPDATE location_postcode SET indexed_status = 0
                                     WHERE place_id IN ({})""")
-                       .format(pysql.SQL(',').join((pysql.Literal(i[0]) for i in ids))))
+                       .format(pysql.SQL(',').join((pysql.Literal(i[0]) for i in places))))
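The new `Runner` class above is a `typing.Protocol`: it describes the five methods every runner must offer without forcing a common base class, which is why the indexer can drop its old `hasattr(runner, 'get_place_details')` probing in favour of an unconditional, fully typed call. This is also why the `@staticmethod` markers disappear; protocol members are matched by signature, so the implementations become regular instance methods. A reduced sketch:

```python
from typing import Protocol


class Runner(Protocol):
    """ Structural interface: any object with matching methods conforms;
        inheritance is optional.
    """
    def name(self) -> str: ...
    def sql_count_objects(self) -> str: ...


class PostcodeJob:
    # Hypothetical implementation; note there is no base class.
    def name(self) -> str:
        return 'postcodes'

    def sql_count_objects(self) -> str:
        return 'SELECT count(*) FROM location_postcode WHERE indexed_status > 0'


def describe(runner: Runner) -> str:
    # Accepts anything that structurally matches the protocol.
    return runner.name()


print(describe(PostcodeJob()))   # prints: postcodes
```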
@@ -9,12 +9,12 @@ Abstract class defintions for tokenizers. These base classes are here
 mainly for documentation purposes.
 """
 from abc import ABC, abstractmethod
-from typing import List, Tuple, Dict, Any
+from typing import List, Tuple, Dict, Any, Optional, Iterable
+from pathlib import Path

 from nominatim.config import Configuration
 from nominatim.data.place_info import PlaceInfo
-# pylint: disable=unnecessary-pass
+from nominatim.typing import Protocol

 class AbstractAnalyzer(ABC):
     """ The analyzer provides the functions for analysing names and building
@@ -28,7 +28,7 @@ class AbstractAnalyzer(ABC):
         return self


-    def __exit__(self, exc_type, exc_value, traceback) -> None:
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
         self.close()


@@ -80,7 +80,8 @@ class AbstractAnalyzer(ABC):


     @abstractmethod
-    def update_special_phrases(self, phrases: List[Tuple[str, str, str, str]],
+    def update_special_phrases(self,
+                               phrases: Iterable[Tuple[str, str, str, str]],
                                should_replace: bool) -> None:
         """ Update the tokenizer's special phrase tokens from the given
             list of special phrases.
@@ -95,7 +96,7 @@ class AbstractAnalyzer(ABC):


     @abstractmethod
-    def add_country_names(self, country_code: str, names: Dict[str, str]):
+    def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
         """ Add the given names to the tokenizer's list of country tokens.

            Arguments:
@@ -186,7 +187,7 @@ class AbstractTokenizer(ABC):


     @abstractmethod
-    def check_database(self, config: Configuration) -> str:
+    def check_database(self, config: Configuration) -> Optional[str]:
         """ Check that the database is set up correctly and ready for being
             queried.

@@ -230,3 +231,13 @@ class AbstractTokenizer(ABC):
             When used outside the with construct, the caller must ensure to
             call the close() function before destructing the analyzer.
         """
+
+
+class TokenizerModule(Protocol):
+    """ Interface that must be exported by modules that implement their
+        own tokenizer.
+    """
+
+    def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
+        """ Factory for new tokenizers.
+        """
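`TokenizerModule` above extends the protocol idea to whole modules: a `Protocol` can describe the functions a dynamically imported module is expected to export. A sketch of how such a protocol might be consumed; the `cast()` here is this sketch's own hedge, since `importlib` can only promise a `ModuleType`:

```python
import importlib
from pathlib import Path
from typing import Protocol, cast


class TokenizerModule(Protocol):
    """ Expected surface of a dynamically imported tokenizer module.
        (Return type loosened to object for this sketch.)
    """
    def create(self, dsn: str, data_dir: Path) -> object: ...


def load_tokenizer(name: str) -> TokenizerModule:
    # importlib.import_module() is typed as returning ModuleType, so the
    # result is cast to the protocol; every later use of the module is
    # then checked against the declared interface.
    return cast(TokenizerModule, importlib.import_module(name))
```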
@@ -19,17 +19,20 @@ database.
 A tokenizer usually also includes PHP code for querying. The appropriate PHP
 normalizer module is installed, when the tokenizer is created.
 """
+from typing import Optional
 import logging
 import importlib
 from pathlib import Path

-from ..errors import UsageError
-from ..db import properties
-from ..db.connection import connect
+from nominatim.errors import UsageError
+from nominatim.db import properties
+from nominatim.db.connection import connect
+from nominatim.config import Configuration
+from nominatim.tokenizer.base import AbstractTokenizer, TokenizerModule

 LOG = logging.getLogger()

-def _import_tokenizer(name):
+def _import_tokenizer(name: str) -> TokenizerModule:
     """ Load the tokenizer.py module from project directory.
     """
     src_file = Path(__file__).parent / (name + '_tokenizer.py')
@@ -41,7 +44,8 @@ def _import_tokenizer(name):
     return importlib.import_module('nominatim.tokenizer.' + name + '_tokenizer')


-def create_tokenizer(config, init_db=True, module_name=None):
+def create_tokenizer(config: Configuration, init_db: bool = True,
+                     module_name: Optional[str] = None) -> AbstractTokenizer:
     """ Create a new tokenizer as defined by the given configuration.

        The tokenizer data and code is copied into the 'tokenizer' directory
@@ -70,7 +74,7 @@ def create_tokenizer(config, init_db=True, module_name=None):
     return tokenizer


-def get_tokenizer_for_db(config):
+def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
     """ Instantiate a tokenizer for an existing database.

        The function looks up the appropriate tokenizer in the database
@@ -7,16 +7,19 @@
 """
 Helper class to create ICU rules from a configuration file.
 """
+from typing import Mapping, Any, Dict, Optional
 import importlib
 import io
 import json
 import logging

-from nominatim.config import flatten_config_list
+from nominatim.config import flatten_config_list, Configuration
 from nominatim.db.properties import set_property, get_property
+from nominatim.db.connection import Connection
 from nominatim.errors import UsageError
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
+from nominatim.tokenizer.token_analysis.base import AnalysisModule, Analyser
 import nominatim.data.country_info

 LOG = logging.getLogger()
@@ -26,7 +29,7 @@ DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
 DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"


-def _get_section(rules, section):
+def _get_section(rules: Mapping[str, Any], section: str) -> Any:
     """ Get the section named 'section' from the rules. If the section does
         not exist, raise a usage error with a meaningful message.
     """
@@ -41,7 +44,7 @@ class ICURuleLoader:
     """ Compiler for ICU rules from a tokenizer configuration file.
     """

-    def __init__(self, config):
+    def __init__(self, config: Configuration) -> None:
         rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                               config='TOKENIZER_CONFIG')

@@ -57,17 +60,27 @@ class ICURuleLoader:
         self.sanitizer_rules = rules.get('sanitizers', [])


-    def load_config_from_db(self, conn):
+    def load_config_from_db(self, conn: Connection) -> None:
         """ Get previously saved parts of the configuration from the
             database.
         """
-        self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
-        self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
-        self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
+        rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
+        if rules is not None:
+            self.normalization_rules = rules
+
+        rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
+        if rules is not None:
+            self.transliteration_rules = rules
+
+        rules = get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)
+        if rules:
+            self.analysis_rules = json.loads(rules)
+        else:
+            self.analysis_rules = []
         self._setup_analysis()


-    def save_config_to_db(self, conn):
+    def save_config_to_db(self, conn: Connection) -> None:
         """ Save the part of the configuration that cannot be changed into
             the database.
         """
@@ -76,20 +89,20 @@ class ICURuleLoader:
         set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))


-    def make_sanitizer(self):
+    def make_sanitizer(self) -> PlaceSanitizer:
         """ Create a place sanitizer from the configured rules.
         """
         return PlaceSanitizer(self.sanitizer_rules)


-    def make_token_analysis(self):
+    def make_token_analysis(self) -> ICUTokenAnalysis:
         """ Create a token analyser from the reviouly loaded rules.
         """
         return ICUTokenAnalysis(self.normalization_rules,
                                 self.transliteration_rules, self.analysis)


-    def get_search_rules(self):
+    def get_search_rules(self) -> str:
         """ Return the ICU rules to be used during search.
             The rules combine normalization and transliteration.
         """
@@ -102,22 +115,22 @@ class ICURuleLoader:
         return rules.getvalue()


-    def get_normalization_rules(self):
+    def get_normalization_rules(self) -> str:
         """ Return rules for normalisation of a term.
         """
         return self.normalization_rules


-    def get_transliteration_rules(self):
+    def get_transliteration_rules(self) -> str:
         """ Return the rules for converting a string into its asciii representation.
         """
         return self.transliteration_rules


-    def _setup_analysis(self):
+    def _setup_analysis(self) -> None:
         """ Process the rules used for creating the various token analyzers.
         """
-        self.analysis = {}
+        self.analysis: Dict[Optional[str], TokenAnalyzerRule] = {}

         if not isinstance(self.analysis_rules, list):
             raise UsageError("Configuration section 'token-analysis' must be a list.")
@@ -135,7 +148,7 @@ class ICURuleLoader:


     @staticmethod
-    def _cfg_to_icu_rules(rules, section):
+    def _cfg_to_icu_rules(rules: Mapping[str, Any], section: str) -> str:
         """ Load an ICU ruleset from the given section. If the section is a
             simple string, it is interpreted as a file name and the rules are
             loaded verbatim from the given file. The filename is expected to be
@@ -155,12 +168,16 @@ class TokenAnalyzerRule:
         and creates a new token analyzer on request.
     """

-    def __init__(self, rules, normalization_rules):
+    def __init__(self, rules: Mapping[str, Any], normalization_rules: str) -> None:
         # Find the analysis module
         module_name = 'nominatim.tokenizer.token_analysis.' \
                       + _get_section(rules, 'analyzer').replace('-', '_')
-        analysis_mod = importlib.import_module(module_name)
-        self.create = analysis_mod.create
+        self._analysis_mod: AnalysisModule = importlib.import_module(module_name)

         # Load the configuration.
-        self.config = analysis_mod.configure(rules, normalization_rules)
+        self.config = self._analysis_mod.configure(rules, normalization_rules)
+
+    def create(self, normalizer: Any, transliterator: Any) -> Analyser:
+        """ Create a new analyser instance for the given rule.
+        """
+        return self._analysis_mod.create(normalizer, transliterator, self.config)
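The `TokenAnalyzerRule` change above replaces a dynamically bound function attribute (`self.create = analysis_mod.create`) with a real method, because an attribute assigned from an arbitrary module cannot be given a checkable signature. The general pattern, sketched with a plain callable:

```python
from typing import Callable


class RuleAdapter:
    """ Sketch: store the callable in a typed attribute and expose it
        through a def with an explicit signature.
    """

    def __init__(self, factory: Callable[[str], str]) -> None:
        self._factory = factory       # typed attribute, not a fake method

    def create(self, arg: str) -> str:
        """ Typed wrapper around the stored callable. """
        return self._factory(arg)


print(RuleAdapter(str.upper).create('abc'))   # prints: ABC
```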
@@ -8,15 +8,22 @@
 Container class collecting all components required to transform an OSM name
 into a Nominatim token.
 """
+from typing import Mapping, Optional, TYPE_CHECKING
 from icu import Transliterator

+from nominatim.tokenizer.token_analysis.base import Analyser
+
+if TYPE_CHECKING:
+    from typing import Any
+    from nominatim.tokenizer.icu_rule_loader import TokenAnalyzerRule # pylint: disable=cyclic-import

 class ICUTokenAnalysis:
     """ Container class collecting the transliterators and token analysis
         modules for a single NameAnalyser instance.
     """

-    def __init__(self, norm_rules, trans_rules, analysis_rules):
+    def __init__(self, norm_rules: str, trans_rules: str,
+                 analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule']):
         self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                          norm_rules)
         trans_rules += ";[:Space:]+ > ' '"
@@ -25,11 +32,11 @@ class ICUTokenAnalysis:
         self.search = Transliterator.createFromRules("icu_search",
                                                      norm_rules + trans_rules)

-        self.analysis = {name: arules.create(self.normalizer, self.to_ascii, arules.config)
+        self.analysis = {name: arules.create(self.normalizer, self.to_ascii)
                          for name, arules in analysis_rules.items()}


-    def get_analyzer(self, name):
+    def get_analyzer(self, name: Optional[str]) -> Analyser:
         """ Return the given named analyzer. If no analyzer with that
             name exists, return the default analyzer.
         """
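The `if TYPE_CHECKING:` block above is the standard escape hatch for annotation-only imports: this module needs the `TokenAnalyzerRule` name for its type hints, but importing it for real would complete an import cycle with the rule loader. Sketch (module names hypothetical):

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen by mypy only; never executed, so it cannot re-trigger the
    # import cycle. 'mypackage.rules' is a hypothetical module name.
    from mypackage.rules import RuleSet


class Analysis:
    def apply(self, rules: 'RuleSet') -> int:
        # The quoted annotation is resolved lazily, so RuleSet need not
        # exist when this module is imported at runtime.
        return 0
```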
nominatim/tokenizer/icu_tokenizer.py

@@ -8,41 +8,48 @@
 Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
+from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
+                   Dict, Set, Iterable
 import itertools
 import json
 import logging
+from pathlib import Path
 from textwrap import dedent

-from nominatim.db.connection import connect
+from nominatim.db.connection import connect, Connection, Cursor
+from nominatim.config import Configuration
 from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.data.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.tokenizer.sanitizers.base import PlaceName
+from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

 LOG = logging.getLogger()

-def create(dsn, data_dir):
+def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
     """ Create a new instance of the tokenizer provided by this module.
     """
-    return LegacyICUTokenizer(dsn, data_dir)
+    return ICUTokenizer(dsn, data_dir)


-class LegacyICUTokenizer(AbstractTokenizer):
+class ICUTokenizer(AbstractTokenizer):
     """ This tokenizer uses libICU to convert names and queries to ASCII.
         Otherwise it uses the same algorithms and data structures as the
         normalization routines in Nominatim 3.
     """

-    def __init__(self, dsn, data_dir):
+    def __init__(self, dsn: str, data_dir: Path) -> None:
         self.dsn = dsn
         self.data_dir = data_dir
-        self.loader = None
+        self.loader: Optional[ICURuleLoader] = None


-    def init_new_db(self, config, init_db=True):
+    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
         """ Set up a new tokenizer for the database.

             This copies all necessary data in the project directory to make
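Note the quoted return type in create(): the function is defined before the class it returns, so PEP 484 forward references (string annotations) are used. A minimal illustration:

    def make_counter() -> 'Counter':
        # 'Counter' is defined further down, so the annotation is quoted.
        return Counter()

    class Counter:
        def __init__(self) -> None:
            self.value: int = 0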
@@ -58,7 +65,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         self._init_db_tables(config)


-    def init_from_project(self, config):
+    def init_from_project(self, config: Configuration) -> None:
         """ Initialise the tokenizer from the project directory.
         """
         self.loader = ICURuleLoader(config)

@@ -69,7 +76,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         self._install_php(config.lib_dir.php, overwrite=False)


-    def finalize_import(self, config):
+    def finalize_import(self, config: Configuration) -> None:
         """ Do any required postprocessing to make the tokenizer data ready
             for use.
         """

@@ -78,7 +85,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


-    def update_sql_functions(self, config):
+    def update_sql_functions(self, config: Configuration) -> None:
         """ Reimport the SQL functions for this tokenizer.
         """
         with connect(self.dsn) as conn:

@@ -86,14 +93,14 @@ class LegacyICUTokenizer(AbstractTokenizer):
         sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


-    def check_database(self, config):
+    def check_database(self, config: Configuration) -> None:
         """ Check that the tokenizer is set up correctly.
         """
         # Will throw an error if there is an issue.
         self.init_from_project(config)


-    def update_statistics(self):
+    def update_statistics(self) -> None:
         """ Recompute frequencies for all name words.
         """
         with connect(self.dsn) as conn:

@@ -113,7 +120,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         conn.commit()


-    def _cleanup_housenumbers(self):
+    def _cleanup_housenumbers(self) -> None:
         """ Remove unused house numbers.
         """
         with connect(self.dsn) as conn:

@@ -148,7 +155,7 @@ class LegacyICUTokenizer(AbstractTokenizer):



-    def update_word_tokens(self):
+    def update_word_tokens(self) -> None:
         """ Remove unused tokens.
         """
         LOG.warning("Cleaning up housenumber tokens.")

@@ -156,7 +163,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         LOG.warning("Tokenizer house-keeping done.")


-    def name_analyzer(self):
+    def name_analyzer(self) -> 'ICUNameAnalyzer':
         """ Create a new analyzer for tokenizing names and queries
             using this tokenizer. Analyzers are context managers and should
             be used accordingly:
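Most of the churn in the hunks above is adding '-> None' to methods that return nothing. Under 'mypy --strict' this is what switches checking on at all: strict mode implies disallow_untyped_defs, so a def without annotations is reported as an error and its body is skipped. A small sketch:

    def untyped(x):                # --strict: "Function is missing a type
        return x + 1               # annotation"; the body is not checked

    def typed(x: int) -> None:     # fully annotated: the body is checked
        print(x + 1)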
@@ -171,13 +178,15 @@ class LegacyICUTokenizer(AbstractTokenizer):

             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
-        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
-                                     self.loader.make_token_analysis())
+        assert self.loader is not None
+        return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
+                               self.loader.make_token_analysis())


-    def _install_php(self, phpdir, overwrite=True):
+    def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
         """ Install the php script for the tokenizer.
         """
+        assert self.loader is not None
         php_file = self.data_dir / "tokenizer.php"

         if not php_file.exists() or overwrite:

@@ -189,15 +198,16 @@ class LegacyICUTokenizer(AbstractTokenizer):
             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')


-    def _save_config(self):
+    def _save_config(self) -> None:
         """ Save the configuration that needs to remain stable for the given
             database as database properties.
         """
+        assert self.loader is not None
         with connect(self.dsn) as conn:
             self.loader.save_config_to_db(conn)


-    def _init_db_tables(self, config):
+    def _init_db_tables(self, config: Configuration) -> None:
         """ Set up the word table and fill it with pre-computed word
             frequencies.
         """

@@ -207,15 +217,16 @@ class LegacyICUTokenizer(AbstractTokenizer):
         conn.commit()


-class LegacyICUNameAnalyzer(AbstractAnalyzer):
-    """ The legacy analyzer uses the ICU library for splitting names.
+class ICUNameAnalyzer(AbstractAnalyzer):
+    """ The ICU analyzer uses the ICU library for splitting names.

         Each instance opens a connection to the database to request the
         normalization.
     """

-    def __init__(self, dsn, sanitizer, token_analysis):
-        self.conn = connect(dsn).connection
+    def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
+                 token_analysis: ICUTokenAnalysis) -> None:
+        self.conn: Optional[Connection] = connect(dsn).connection
         self.conn.autocommit = True
         self.sanitizer = sanitizer
         self.token_analysis = token_analysis
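self.loader is declared Optional[ICURuleLoader] because it is only filled in later by the init methods; the 'assert self.loader is not None' lines added before each use are mypy's standard narrowing idiom, turning Optional[X] into X for the rest of the scope. A sketch of the pattern:

    from typing import Optional

    class Tokenizer:
        def __init__(self) -> None:
            self.loader: Optional[str] = None   # configured later

        def run(self) -> str:
            assert self.loader is not None      # narrows Optional[str] to str
            return self.loader.upper()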
@@ -223,7 +234,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         self._cache = _TokenCache()


-    def close(self):
+    def close(self) -> None:
         """ Free all resources used by the analyzer.
         """
         if self.conn:

@@ -231,20 +242,20 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             self.conn = None


-    def _search_normalized(self, name):
+    def _search_normalized(self, name: str) -> str:
         """ Return the search token transliteration of the given name.
         """
-        return self.token_analysis.search.transliterate(name).strip()
+        return cast(str, self.token_analysis.search.transliterate(name)).strip()


-    def _normalized(self, name):
+    def _normalized(self, name: str) -> str:
         """ Return the normalized version of the given name with all
             non-relevant information removed.
         """
-        return self.token_analysis.normalizer.transliterate(name).strip()
+        return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()


-    def get_word_token_info(self, words):
+    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name,
             otherwise it is a partial name.
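PyICU ships without type stubs (the commit's .mypy.ini marks icu.* with ignore_missing_imports), so transliterate() comes back as Any; cast(str, ...) records what the author knows about the value, at no runtime cost. A sketch with a stand-in for the untyped call:

    from typing import Any, cast

    def third_party_translit(name: str) -> Any:   # stands in for the unstubbed ICU API
        return name.upper()

    def search_normalized(name: str) -> str:
        # cast() only informs the checker; it performs no runtime conversion.
        return cast(str, third_party_translit(name)).strip()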
@@ -255,6 +266,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             The function is used for testing and debugging only
             and not necessarily efficient.
         """
+        assert self.conn is not None
         full_tokens = {}
         partial_tokens = {}
         for word in words:

@@ -277,7 +289,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]


-    def normalize_postcode(self, postcode):
+    def normalize_postcode(self, postcode: str) -> str:
         """ Convert the postcode to a standardized form.

             This function must yield exactly the same result as the SQL function

@@ -286,10 +298,11 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         return postcode.strip().upper()


-    def update_postcodes_from_db(self):
+    def update_postcodes_from_db(self) -> None:
         """ Update postcode tokens in the word table from the location_postcode
             table.
         """
+        assert self.conn is not None
         analyzer = self.token_analysis.analysis.get('@postcode')

         with self.conn.cursor() as cur:

@@ -324,13 +337,15 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         self._delete_unused_postcode_words(word_entries - needed_entries)
         self._add_missing_postcode_words(needed_entries - word_entries)

-    def _delete_unused_postcode_words(self, tokens):
+    def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
+        assert self.conn is not None
         if tokens:
             with self.conn.cursor() as cur:
                 cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
                             (list(tokens), ))

-    def _add_missing_postcode_words(self, tokens):
+    def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
+        assert self.conn is not None
         if not tokens:
             return
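Note the asymmetry the new signatures follow throughout: parameters take the loosest workable protocol (Iterable, Sequence, Mapping) while return types stay concrete (List, Dict, Tuple) — be liberal in what you accept, precise in what you promise. A one-function sketch of the guideline:

    from typing import Iterable, List

    def unique_sorted(tokens: Iterable[str]) -> List[str]:
        # Accepts any iterable (set, generator, list, ...) but
        # promises a concrete list to callers.
        return sorted(set(tokens))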
@@ -341,10 +356,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             if '@' in postcode_name:
                 term, variant = postcode_name.split('@', 2)
                 term = self._search_normalized(term)
-                variants = {term}
-                if analyzer is not None:
-                    variants.update(analyzer.get_variants_ascii(variant))
-                    variants = list(variants)
+                if analyzer is None:
+                    variants = [term]
+                else:
+                    variants = analyzer.get_variants_ascii(variant)
+                    if term not in variants:
+                        variants.append(term)
             else:
                 variants = [self._search_normalized(postcode_name)]
             terms.append((postcode_name, variants))

@@ -358,12 +375,14 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):



-    def update_special_phrases(self, phrases, should_replace):
+    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
+                               should_replace: bool) -> None:
         """ Replace the search index for special phrases with the new phrases.
             If `should_replace` is True, then the previous set of phrases will be
             completely replaced. Otherwise the phrases are added to the
             already existing ones.
         """
+        assert self.conn is not None
         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
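The postcode-variant rewrite above is a typing-driven refactor: the old code used 'variants' first as a set and later rebound it to a list, and mypy pins a variable to its first inferred type. Keeping one type (a list) through both branches satisfies the checker and also simplifies the logic. Sketch:

    from typing import List

    def postcode_variants(term: str, extra: List[str]) -> List[str]:
        variants: List[str] = [term]   # one type for the whole function
        for v in extra:
            if v not in variants:
                variants.append(v)
        return variants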
@@ -386,7 +405,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                  len(norm_phrases), added, deleted)


-    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
+    def _add_special_phrases(self, cursor: Cursor,
+                             new_phrases: Set[Tuple[str, str, str, str]],
+                             existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
         """ Add all phrases to the database that are not yet there.
         """
         to_add = new_phrases - existing_phrases

@@ -407,8 +428,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         return added


-    @staticmethod
-    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
+    def _remove_special_phrases(self, cursor: Cursor,
+                                new_phrases: Set[Tuple[str, str, str, str]],
+                                existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
         """ Remove all phrases from the database that are no longer in the
             new phrase list.
         """

@@ -425,7 +447,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         return len(to_delete)


-    def add_country_names(self, country_code, names):
+    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
         """ Add default names for the given country to the search index.
         """
         # Make sure any name preprocessing for country names applies.

@@ -437,10 +459,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                                           internal=True)


-    def _add_country_full_names(self, country_code, names, internal=False):
+    def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
+                                internal: bool = False) -> None:
         """ Add names for the given country from an already sanitized
             name list.
         """
+        assert self.conn is not None
         word_tokens = set()
         for name in names:
             norm_name = self._search_normalized(name.name)

@@ -453,7 +477,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                            FROM word
                            WHERE type = 'C' and word = %s""",
                         (country_code, ))
-            existing_tokens = {True: set(), False: set()}  # internal/external names
+            # internal/external names
+            existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
             for word in cur:
                 existing_tokens[word[1]].add(word[0])
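Container literals built from empty parts give the inferencer nothing to work with, hence the explicit annotation on existing_tokens above. Sketch:

    from typing import Dict, Set

    # With empty initializers mypy cannot infer the element types and,
    # under --strict, demands an annotation such as this one.
    existing: Dict[bool, Set[str]] = {True: set(), False: set()}
    existing[True].add("berlin")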
@@ -486,7 +511,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                 cur.execute(sql, (country_code, list(new_tokens)))


-    def process_place(self, place):
+    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
         """ Determine tokenizer information about the given place.

             Returns a JSON-serializable structure that will be handed into

@@ -500,6 +525,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             token_info.set_names(*self._compute_name_tokens(names))

             if place.is_country():
+                assert place.country_code is not None
                 self._add_country_full_names(place.country_code, names)

         if address:

@@ -508,7 +534,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         return token_info.to_dict()


-    def _process_place_address(self, token_info, address):
+    def _process_place_address(self, token_info: '_TokenInfo',
+                               address: Sequence[PlaceName]) -> None:
         for item in address:
             if item.kind == 'postcode':
                 token_info.set_postcode(self._add_postcode(item))

@@ -524,12 +551,13 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                 token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))


-    def _compute_housenumber_token(self, hnr):
+    def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
         """ Normalize the housenumber and return the word token and the
             canonical form.
         """
+        assert self.conn is not None
         analyzer = self.token_analysis.analysis.get('@housenumber')
-        result = None, None
+        result: Tuple[Optional[int], Optional[str]] = (None, None)

         if analyzer is None:
             # When no custom analyzer is set, simply normalize and transliterate

@@ -539,7 +567,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             if result[0] is None:
                 with self.conn.cursor() as cur:
                     cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
-                    result = cur.fetchone()[0], norm_name
+                    result = cur.fetchone()[0], norm_name  # type: ignore[no-untyped-call]
                 self._cache.housenumbers[norm_name] = result
         else:
             # Otherwise use the analyzer to determine the canonical name.

@@ -554,16 +582,17 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                 with self.conn.cursor() as cur:
                     cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
                                 (norm_name, list(variants)))
-                    result = cur.fetchone()[0], variants[0]
+                    result = cur.fetchone()[0], variants[0]  # type: ignore[no-untyped-call]
                 self._cache.housenumbers[norm_name] = result

         return result


-    def _compute_partial_tokens(self, name):
+    def _compute_partial_tokens(self, name: str) -> List[int]:
         """ Normalize the given term, split it into partial words and return
             the token list for them.
         """
+        assert self.conn is not None
         norm_name = self._search_normalized(name)

         tokens = []
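psycopg2's typings do not cover every cursor call, so where cur.fetchone() is flagged the commit uses error-code-scoped suppressions ('# type: ignore[no-untyped-call]') rather than bare '# type: ignore', keeping all other diagnostics on that line alive. Sketch:

    def legacy_helper():                  # deliberately left unannotated
        return 42

    def caller() -> int:
        # Scoped ignore: only 'no-untyped-call' is silenced here; any
        # other error on this line would still be reported.
        value: int = legacy_helper()      # type: ignore[no-untyped-call]
        return value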
@@ -582,16 +611,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                             (need_lookup, ))

                 for partial, token in cur:
+                    assert token is not None
                     tokens.append(token)
                     self._cache.partials[partial] = token

         return tokens


-    def _retrieve_full_tokens(self, name):
+    def _retrieve_full_tokens(self, name: str) -> List[int]:
         """ Get the full name token for the given name, if it exists.
             The name is only retrieved for the standard analyser.
         """
+        assert self.conn is not None
         norm_name = self._search_normalized(name)

         # return cached if possible

@@ -608,12 +639,13 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         return full


-    def _compute_name_tokens(self, names):
+    def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
         """ Computes the full name and partial name tokens for the given
             dictionary of names.
         """
-        full_tokens = set()
-        partial_tokens = set()
+        assert self.conn is not None
+        full_tokens: Set[int] = set()
+        partial_tokens: Set[int] = set()

         for name in names:
             analyzer_id = name.get_attr('analyzer')

@@ -633,19 +665,23 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                 with self.conn.cursor() as cur:
                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
                                 (token_id, variants))
-                    full, part = cur.fetchone()
+                    full, part = cast(Tuple[int, List[int]],
+                                      cur.fetchone())  # type: ignore[no-untyped-call]

                     self._cache.names[token_id] = (full, part)

+            assert part is not None

             full_tokens.add(full)
             partial_tokens.update(part)

         return full_tokens, partial_tokens


-    def _add_postcode(self, item):
+    def _add_postcode(self, item: PlaceName) -> Optional[str]:
         """ Make sure the normalized postcode is present in the word table.
         """
+        assert self.conn is not None
         analyzer = self.token_analysis.analysis.get('@postcode')

         if analyzer is None:
@@ -680,25 +716,24 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 class _TokenInfo:
     """ Collect token information to be sent back to the database.
     """
-    def __init__(self):
-        self.names = None
-        self.housenumbers = set()
-        self.housenumber_tokens = set()
-        self.street_tokens = set()
-        self.place_tokens = set()
-        self.address_tokens = {}
-        self.postcode = None
+    def __init__(self) -> None:
+        self.names: Optional[str] = None
+        self.housenumbers: Set[str] = set()
+        self.housenumber_tokens: Set[int] = set()
+        self.street_tokens: Set[int] = set()
+        self.place_tokens: Set[int] = set()
+        self.address_tokens: Dict[str, str] = {}
+        self.postcode: Optional[str] = None


-    @staticmethod
-    def _mk_array(tokens):
+    def _mk_array(self, tokens: Iterable[Any]) -> str:
         return f"{{{','.join((str(s) for s in tokens))}}}"


-    def to_dict(self):
+    def to_dict(self) -> Dict[str, Any]:
         """ Return the token information in database importable format.
         """
-        out = {}
+        out: Dict[str, Any] = {}

         if self.names:
             out['names'] = self.names

@@ -722,40 +757,41 @@ class _TokenInfo:
         return out


-    def set_names(self, fulls, partials):
+    def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
         """ Adds token information for the normalised names.
         """
         self.names = self._mk_array(itertools.chain(fulls, partials))


-    def add_housenumber(self, token, hnr):
+    def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
         """ Extract housenumber information from a list of normalised
             housenumbers.
         """
         if token:
+            assert hnr is not None
             self.housenumbers.add(hnr)
             self.housenumber_tokens.add(token)


-    def add_street(self, tokens):
+    def add_street(self, tokens: Iterable[int]) -> None:
         """ Add addr:street match terms.
         """
         self.street_tokens.update(tokens)


-    def add_place(self, tokens):
+    def add_place(self, tokens: Iterable[int]) -> None:
         """ Add addr:place search and match terms.
         """
         self.place_tokens.update(tokens)


-    def add_address_term(self, key, partials):
+    def add_address_term(self, key: str, partials: Iterable[int]) -> None:
         """ Add additional address terms.
         """
         if partials:
             self.address_tokens[key] = self._mk_array(partials)

-    def set_postcode(self, postcode):
+    def set_postcode(self, postcode: Optional[str]) -> None:
         """ Set the postcode to the given one.
         """
         self.postcode = postcode

@@ -767,9 +803,9 @@ class _TokenCache:
         This cache is not thread-safe and needs to be instantiated per
         analyzer.
     """
-    def __init__(self):
-        self.names = {}
-        self.partials = {}
-        self.fulls = {}
-        self.postcodes = set()
-        self.housenumbers = {}
+    def __init__(self) -> None:
+        self.names: Dict[str, Tuple[int, List[int]]] = {}
+        self.partials: Dict[str, int] = {}
+        self.fulls: Dict[str, List[int]] = {}
+        self.postcodes: Set[str] = set()
+        self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}
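_TokenInfo and _TokenCache above show the attribute side of strict typing: each attribute's type is declared at its first assignment in __init__, which then constrains every other method that touches it. Sketch:

    from typing import Dict, Optional, Set

    class TokenInfo:
        def __init__(self) -> None:
            self.names: Optional[str] = None       # filled in later
            self.street_tokens: Set[int] = set()   # element type fixed here
            self.address_tokens: Dict[str, str] = {}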
nominatim/tokenizer/legacy_tokenizer.py

@@ -7,8 +7,11 @@
 """
 Tokenizer implementing normalisation as used before Nominatim 4.
 """
+from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \
+                   cast, Dict, Set, Iterable
 from collections import OrderedDict
 import logging
+from pathlib import Path
 import re
 import shutil
 from textwrap import dedent

@@ -17,10 +20,12 @@ from icu import Transliterator
 import psycopg2
 import psycopg2.extras

-from nominatim.db.connection import connect
+from nominatim.db.connection import connect, Connection
+from nominatim.config import Configuration
 from nominatim.db import properties
 from nominatim.db import utils as db_utils
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.data.place_info import PlaceInfo
 from nominatim.errors import UsageError
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

@@ -29,13 +34,13 @@ DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"

 LOG = logging.getLogger()

-def create(dsn, data_dir):
+def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
     """ Create a new instance of the tokenizer provided by this module.
     """
     return LegacyTokenizer(dsn, data_dir)


-def _install_module(config_module_path, src_dir, module_dir):
+def _install_module(config_module_path: str, src_dir: Path, module_dir: Path) -> str:
     """ Copies the PostgreSQL normalisation module into the project
         directory if necessary. For historical reasons the module is
         saved in the '/module' subdirectory and not with the other tokenizer

@@ -52,7 +57,7 @@ def _install_module(config_module_path, src_dir, module_dir):
     # Compatibility mode for builddir installations.
     if module_dir.exists() and src_dir.samefile(module_dir):
         LOG.info('Running from build directory. Leaving database module as is.')
-        return module_dir
+        return str(module_dir)

     # In any other case install the module in the project directory.
     if not module_dir.exists():

@@ -64,10 +69,10 @@ def _install_module(config_module_path, src_dir, module_dir):

     LOG.info('Database module installed at %s', str(destfile))

-    return module_dir
+    return str(module_dir)


-def _check_module(module_dir, conn):
+def _check_module(module_dir: str, conn: Connection) -> None:
     """ Try to use the PostgreSQL module to confirm that it is correctly
         installed and accessible from PostgreSQL.
     """
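Annotating _install_module() as returning str surfaced a genuine inconsistency: two code paths returned a Path object where callers expected a string, hence the new str(module_dir) conversions above — the annotations are not only documentation here, they caught a latent type mismatch. A reduced example of the error mypy raises:

    from pathlib import Path

    def install_module(module_dir: Path) -> str:
        # Returning module_dir unconverted would make mypy report:
        #   Incompatible return value type (got "Path", expected "str")
        return str(module_dir)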
@@ -89,13 +94,13 @@ class LegacyTokenizer(AbstractTokenizer):
         calls to the database.
     """

-    def __init__(self, dsn, data_dir):
+    def __init__(self, dsn: str, data_dir: Path) -> None:
         self.dsn = dsn
         self.data_dir = data_dir
-        self.normalization = None
+        self.normalization: Optional[str] = None


-    def init_new_db(self, config, init_db=True):
+    def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
         """ Set up a new tokenizer for the database.

             This copies all necessary data in the project directory to make

@@ -119,7 +124,7 @@ class LegacyTokenizer(AbstractTokenizer):
         self._init_db_tables(config)


-    def init_from_project(self, config):
+    def init_from_project(self, config: Configuration) -> None:
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:

@@ -132,7 +137,7 @@ class LegacyTokenizer(AbstractTokenizer):

         self._install_php(config, overwrite=False)

-    def finalize_import(self, config):
+    def finalize_import(self, config: Configuration) -> None:
         """ Do any required postprocessing to make the tokenizer data ready
             for use.
         """

@@ -141,7 +146,7 @@ class LegacyTokenizer(AbstractTokenizer):
         sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


-    def update_sql_functions(self, config):
+    def update_sql_functions(self, config: Configuration) -> None:
         """ Reimport the SQL functions for this tokenizer.
         """
         with connect(self.dsn) as conn:

@@ -154,7 +159,7 @@ class LegacyTokenizer(AbstractTokenizer):
                           modulepath=modulepath)


-    def check_database(self, _):
+    def check_database(self, _: Configuration) -> Optional[str]:
         """ Check that the tokenizer is set up correctly.
         """
         hint = """\

@@ -181,7 +186,7 @@ class LegacyTokenizer(AbstractTokenizer):
         return None


-    def migrate_database(self, config):
+    def migrate_database(self, config: Configuration) -> None:
         """ Initialise the project directory of an existing database for
             use with this tokenizer.

@@ -198,7 +203,7 @@ class LegacyTokenizer(AbstractTokenizer):
             self._save_config(conn, config)


-    def update_statistics(self):
+    def update_statistics(self) -> None:
         """ Recompute the frequency of full words.
         """
         with connect(self.dsn) as conn:

@@ -218,13 +223,13 @@ class LegacyTokenizer(AbstractTokenizer):
         conn.commit()


-    def update_word_tokens(self):
+    def update_word_tokens(self) -> None:
         """ No house-keeping implemented for the legacy tokenizer.
         """
         LOG.info("No tokenizer clean-up available.")


-    def name_analyzer(self):
+    def name_analyzer(self) -> 'LegacyNameAnalyzer':
         """ Create a new analyzer for tokenizing names and queries
             using this tokenizer. Analyzers are context managers and should
             be used accordingly:
@@ -244,7 +249,7 @@ class LegacyTokenizer(AbstractTokenizer):
         return LegacyNameAnalyzer(self.dsn, normalizer)


-    def _install_php(self, config, overwrite=True):
+    def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
         """ Install the php script for the tokenizer.
         """
         php_file = self.data_dir / "tokenizer.php"

@@ -258,7 +263,7 @@ class LegacyTokenizer(AbstractTokenizer):
             """), encoding='utf-8')


-    def _init_db_tables(self, config):
+    def _init_db_tables(self, config: Configuration) -> None:
         """ Set up the word table and fill it with pre-computed word
             frequencies.
         """

@@ -271,10 +276,12 @@ class LegacyTokenizer(AbstractTokenizer):
         db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')


-    def _save_config(self, conn, config):
+    def _save_config(self, conn: Connection, config: Configuration) -> None:
         """ Save the configuration that needs to remain stable for the given
             database as database properties.
         """
+        assert self.normalization is not None

         properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
         properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)

@@ -287,8 +294,8 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
         normalization.
     """

-    def __init__(self, dsn, normalizer):
-        self.conn = connect(dsn).connection
+    def __init__(self, dsn: str, normalizer: Any):
+        self.conn: Optional[Connection] = connect(dsn).connection
         self.conn.autocommit = True
         self.normalizer = normalizer
         psycopg2.extras.register_hstore(self.conn)

@@ -296,7 +303,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
         self._cache = _TokenCache(self.conn)


-    def close(self):
+    def close(self) -> None:
         """ Free all resources used by the analyzer.
         """
         if self.conn:

@@ -304,7 +311,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
             self.conn = None


-    def get_word_token_info(self, words):
+    def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name,
             otherwise it is a partial name.

@@ -315,6 +322,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
             The function is used for testing and debugging only
             and not necessarily efficient.
         """
+        assert self.conn is not None
         with self.conn.cursor() as cur:
             cur.execute("""SELECT t.term, word_token, word_id
                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t

@@ -330,14 +338,14 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
         return [(r[0], r[1], r[2]) for r in cur]


-    def normalize(self, phrase):
+    def normalize(self, phrase: str) -> str:
         """ Normalize the given phrase, i.e. remove all properties that
             are irrelevant for search.
         """
-        return self.normalizer.transliterate(phrase)
+        return cast(str, self.normalizer.transliterate(phrase))


-    def normalize_postcode(self, postcode):
+    def normalize_postcode(self, postcode: str) -> str:
         """ Convert the postcode to a standardized form.

             This function must yield exactly the same result as the SQL function

@@ -346,10 +354,12 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
         return postcode.strip().upper()


-    def update_postcodes_from_db(self):
+    def update_postcodes_from_db(self) -> None:
         """ Update postcode tokens in the word table from the location_postcode
             table.
         """
+        assert self.conn is not None

         with self.conn.cursor() as cur:
             # This finds us the rows in location_postcode and word that are
             # missing in the other table.
@@ -383,9 +393,12 @@ class LegacyNameAnalyzer(AbstractAnalyzer):



-    def update_special_phrases(self, phrases, should_replace):
+    def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
+                               should_replace: bool) -> None:
         """ Replace the search index for special phrases with the new phrases.
         """
+        assert self.conn is not None

         norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                             for p in phrases))

@@ -422,9 +435,11 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
                  len(norm_phrases), len(to_add), len(to_delete))


-    def add_country_names(self, country_code, names):
+    def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
         """ Add names for the given country to the search index.
         """
+        assert self.conn is not None

         with self.conn.cursor() as cur:
             cur.execute(
                 """INSERT INTO word (word_id, word_token, country_code)

@@ -436,12 +451,14 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
                 """, (country_code, list(names.values()), country_code))


-    def process_place(self, place):
+    def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
         """ Determine tokenizer information about the given place.

             Returns a JSON-serialisable structure that will be handed into
             the database via the token_info field.
         """
+        assert self.conn is not None

         token_info = _TokenInfo(self._cache)

         names = place.name

@@ -450,6 +467,7 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
             token_info.add_names(self.conn, names)

             if place.is_country():
+                assert place.country_code is not None
                 self.add_country_names(place.country_code, names)

         address = place.address

@@ -459,7 +477,8 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
         return token_info.data


-    def _process_place_address(self, token_info, address):
+    def _process_place_address(self, token_info: '_TokenInfo', address: Mapping[str, str]) -> None:
+        assert self.conn is not None
         hnrs = []
         addr_terms = []

@@ -491,12 +510,12 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
 class _TokenInfo:
     """ Collect token information to be sent back to the database.
     """
-    def __init__(self, cache):
+    def __init__(self, cache: '_TokenCache') -> None:
         self.cache = cache
-        self.data = {}
+        self.data: Dict[str, Any] = {}


-    def add_names(self, conn, names):
+    def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
         """ Add token information for the names of the place.
         """
         with conn.cursor() as cur:

@@ -505,7 +524,7 @@ class _TokenInfo:
                         (names, ))


-    def add_housenumbers(self, conn, hnrs):
+    def add_housenumbers(self, conn: Connection, hnrs: Sequence[str]) -> None:
         """ Extract housenumber information from the address.
         """
         if len(hnrs) == 1:

@@ -516,7 +535,7 @@ class _TokenInfo:
             return

         # split numbers if necessary
-        simple_list = []
+        simple_list: List[str] = []
         for hnr in hnrs:
             simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
@@ -525,49 +544,53 @@ class _TokenInfo:

         with conn.cursor() as cur:
             cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
-            self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
+            self.data['hnr_tokens'], self.data['hnr'] = \
+                cur.fetchone()  # type: ignore[no-untyped-call]


-    def set_postcode(self, postcode):
+    def set_postcode(self, postcode: str) -> None:
         """ Set or replace the postcode token with the given value.
         """
         self.data['postcode'] = postcode

-    def add_street(self, conn, street):
+    def add_street(self, conn: Connection, street: str) -> None:
         """ Add addr:street match terms.
         """
-        def _get_street(name):
+        def _get_street(name: str) -> List[int]:
             with conn.cursor() as cur:
-                return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))
+                return cast(List[int],
+                            cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )))

         tokens = self.cache.streets.get(street, _get_street)
         if tokens:
             self.data['street'] = tokens


-    def add_place(self, conn, place):
+    def add_place(self, conn: Connection, place: str) -> None:
         """ Add addr:place search and match terms.
         """
-        def _get_place(name):
+        def _get_place(name: str) -> Tuple[List[int], List[int]]:
             with conn.cursor() as cur:
                 cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                word_ids_from_name(%s)::text""",
                             (name, name))
-                return cur.fetchone()
+                return cast(Tuple[List[int], List[int]],
+                            cur.fetchone())  # type: ignore[no-untyped-call]

         self.data['place_search'], self.data['place_match'] = \
             self.cache.places.get(place, _get_place)


-    def add_address_terms(self, conn, terms):
+    def add_address_terms(self, conn: Connection, terms: Sequence[Tuple[str, str]]) -> None:
         """ Add additional address terms.
         """
-        def _get_address_term(name):
+        def _get_address_term(name: str) -> Tuple[List[int], List[int]]:
             with conn.cursor() as cur:
                 cur.execute("""SELECT addr_ids_from_name(%s)::text,
                                word_ids_from_name(%s)::text""",
                             (name, name))
-                return cur.fetchone()
+                return cast(Tuple[List[int], List[int]],
+                            cur.fetchone())  # type: ignore[no-untyped-call]

         tokens = {}
         for key, value in terms:
@@ -584,13 +607,12 @@ class _LRU:
|
|||||||
produce the item when there is a cache miss.
|
produce the item when there is a cache miss.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, maxsize=128, init_data=None):
|
def __init__(self, maxsize: int = 128):
|
||||||
self.data = init_data or OrderedDict()
|
self.data: 'OrderedDict[str, Any]' = OrderedDict()
|
||||||
self.maxsize = maxsize
|
self.maxsize = maxsize
|
||||||
if init_data is not None and len(init_data) > maxsize:
|
|
||||||
self.maxsize = len(init_data)
|
|
||||||
|
|
||||||
def get(self, key, generator):
|
|
||||||
|
def get(self, key: str, generator: Callable[[str], Any]) -> Any:
|
||||||
""" Get the item with the given key from the cache. If nothing
|
""" Get the item with the given key from the cache. If nothing
|
||||||
is found in the cache, generate the value through the
|
is found in the cache, generate the value through the
|
||||||
generator function and store it in the cache.
|
generator function and store it in the cache.
|
||||||
@@ -613,7 +635,7 @@ class _TokenCache:
|
|||||||
This cache is not thread-safe and needs to be instantiated per
|
This cache is not thread-safe and needs to be instantiated per
|
||||||
analyzer.
|
analyzer.
|
||||||
"""
|
"""
|
||||||
def __init__(self, conn):
|
def __init__(self, conn: Connection):
|
||||||
# various LRU caches
|
# various LRU caches
|
||||||
self.streets = _LRU(maxsize=256)
|
self.streets = _LRU(maxsize=256)
|
||||||
self.places = _LRU(maxsize=128)
|
self.places = _LRU(maxsize=128)
|
||||||
@@ -623,18 +645,18 @@ class _TokenCache:
|
|||||||
with conn.cursor() as cur:
|
with conn.cursor() as cur:
|
||||||
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
|
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
|
||||||
FROM generate_series(1, 100) as i""")
|
FROM generate_series(1, 100) as i""")
|
||||||
self._cached_housenumbers = {str(r[0]): r[1] for r in cur}
|
self._cached_housenumbers: Dict[str, str] = {str(r[0]): r[1] for r in cur}
|
||||||
|
|
||||||
# For postcodes remember the ones that have already been added
|
# For postcodes remember the ones that have already been added
|
||||||
self.postcodes = set()
|
self.postcodes: Set[str] = set()
|
||||||
|
|
||||||
def get_housenumber(self, number):
|
def get_housenumber(self, number: str) -> Optional[str]:
|
||||||
""" Get a housenumber token from the cache.
|
""" Get a housenumber token from the cache.
|
||||||
"""
|
"""
|
||||||
return self._cached_housenumbers.get(number)
|
return self._cached_housenumbers.get(number)
|
||||||
|
|
||||||
|
|
||||||
def add_postcode(self, conn, postcode):
|
def add_postcode(self, conn: Connection, postcode: str) -> None:
|
||||||
""" Make sure the given postcode is in the database.
|
""" Make sure the given postcode is in the database.
|
||||||
"""
|
"""
|
||||||
if postcode not in self.postcodes:
|
if postcode not in self.postcodes:
|
||||||
|
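The pairing used throughout this hunk is how the patch satisfies `mypy --strict` against the untyped psycopg2 cursor: `cast()` declares the expected result type, while the `# type: ignore[no-untyped-call]` comment silences strict mode's complaint about calling into an untyped function. A minimal sketch of the idiom, with a hypothetical `fetch_ids` wrapper (not code from the patch):

from typing import Any, List, Tuple, cast

def fetch_ids(cur: Any) -> Tuple[List[int], List[int]]:
    # cast() has no runtime effect; it only tells the type checker
    # what the untyped fetchone() call is expected to return.
    return cast(Tuple[List[int], List[int]],
                cur.fetchone())  # type: ignore[no-untyped-call]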
@@ -8,100 +8,13 @@
 Handler for cleaning name and address tags in place information before it
 is handed to the token analysis.
 """
+from typing import Optional, List, Mapping, Sequence, Callable, Any, Tuple
 import importlib

 from nominatim.errors import UsageError
 from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+from nominatim.tokenizer.sanitizers.base import SanitizerHandler, ProcessInfo, PlaceName
+from nominatim.data.place_info import PlaceInfo

-class PlaceName:
-    """ A searchable name for a place together with properties.
-
-        Every name object saves the name proper and two basic properties:
-        * 'kind' describes the name of the OSM key used without any suffixes
-          (i.e. the part after the colon removed)
-        * 'suffix' contains the suffix of the OSM tag, if any. The suffix
-          is the part of the key after the first colon.
-
-        In addition to that, the name may have arbitrary additional attributes.
-        Which attributes are used, depends on the token analyser.
-    """
-
-    def __init__(self, name, kind, suffix):
-        self.name = name
-        self.kind = kind
-        self.suffix = suffix
-        self.attr = {}
-
-
-    def __repr__(self):
-        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
-
-
-    def clone(self, name=None, kind=None, suffix=None, attr=None):
-        """ Create a deep copy of the place name, optionally with the
-            given parameters replaced. In the attribute list only the given
-            keys are updated. The list is not replaced completely.
-            In particular, the function cannot be used to remove an
-            attribute from a place name.
-        """
-        newobj = PlaceName(name or self.name,
-                           kind or self.kind,
-                           suffix or self.suffix)
-
-        newobj.attr.update(self.attr)
-        if attr:
-            newobj.attr.update(attr)
-
-        return newobj
-
-
-    def set_attr(self, key, value):
-        """ Add the given property to the name. If the property was already
-            set, then the value is overwritten.
-        """
-        self.attr[key] = value
-
-
-    def get_attr(self, key, default=None):
-        """ Return the given property or the value of 'default' if it
-            is not set.
-        """
-        return self.attr.get(key, default)
-
-
-    def has_attr(self, key):
-        """ Check if the given attribute is set.
-        """
-        return key in self.attr
-
-
-class _ProcessInfo:
-    """ Container class for information handed to handler functions.
-
-        The 'names' and 'address' members are mutable. A handler must change
-        them by either modifying the lists in place or replacing the old content
-        with a new list.
-    """
-
-    def __init__(self, place):
-        self.place = place
-        self.names = self._convert_name_dict(place.name)
-        self.address = self._convert_name_dict(place.address)
-
-
-    @staticmethod
-    def _convert_name_dict(names):
-        """ Convert a dictionary of names into a list of PlaceNames.
-
-            The dictionary key is split into the primary part of the key
-            and the suffix (the part after an optional colon).
-        """
-        out = []
-
-        if names:
-            for key, value in names.items():
-                parts = key.split(':', 1)
-                out.append(PlaceName(value.strip(),
-                                     parts[0].strip(),
-                                     parts[1].strip() if len(parts) > 1 else None))
-
-        return out
-
-
 class PlaceSanitizer:

@@ -109,24 +22,24 @@ class PlaceSanitizer:
         names and address before they are used by the token analysers.
     """

-    def __init__(self, rules):
-        self.handlers = []
+    def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]]) -> None:
+        self.handlers: List[Callable[[ProcessInfo], None]] = []

         if rules:
             for func in rules:
                 if 'step' not in func:
                     raise UsageError("Sanitizer rule is missing the 'step' attribute.")
                 module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
-                handler_module = importlib.import_module(module_name)
+                handler_module: SanitizerHandler = importlib.import_module(module_name)
                 self.handlers.append(handler_module.create(SanitizerConfig(func)))


-    def process_names(self, place):
+    def process_names(self, place: PlaceInfo) -> Tuple[List[PlaceName], List[PlaceName]]:
         """ Extract a sanitized list of names and address parts from the
             given place. The function returns a tuple
             (list of names, list of address names)
         """
-        obj = _ProcessInfo(place)
+        obj = ProcessInfo(place)

         for func in self.handlers:
             func(obj)
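Annotating the result of `importlib.import_module()` with the `SanitizerHandler` protocol is what lets mypy check the subsequent `create()` call although the module is only known at runtime. A reduced sketch of that pattern; the `HandlerModule` protocol and `plugins.` prefix here are illustrative, not part of the patch:

import importlib
from typing import Any, Callable

from typing_extensions import Protocol  # typing.Protocol from Python 3.8 on

class HandlerModule(Protocol):
    """ Shape that every dynamically loaded plug-in module must have. """
    def create(self, config: Any) -> Callable[[Any], None]: ...

def load_handler(step: str) -> Callable[[Any], None]:
    # The annotation makes the checker verify create() against the
    # protocol, even though the module object only exists at runtime.
    module: HandlerModule = importlib.import_module('plugins.' + step)
    return module.create({})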
119
nominatim/tokenizer/sanitizers/base.py
Normal file

@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Common data types and protocols for sanitizers.
+"""
+from typing import Optional, Dict, List, Mapping, Callable
+
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+from nominatim.data.place_info import PlaceInfo
+from nominatim.typing import Protocol, Final
+
+class PlaceName:
+    """ A searchable name for a place together with properties.
+
+        Every name object saves the name proper and two basic properties:
+        * 'kind' describes the name of the OSM key used without any suffixes
+          (i.e. the part after the colon removed)
+        * 'suffix' contains the suffix of the OSM tag, if any. The suffix
+          is the part of the key after the first colon.
+
+        In addition to that, the name may have arbitrary additional attributes.
+        Which attributes are used, depends on the token analyser.
+    """
+
+    def __init__(self, name: str, kind: str, suffix: Optional[str]):
+        self.name = name
+        self.kind = kind
+        self.suffix = suffix
+        self.attr: Dict[str, str] = {}
+
+
+    def __repr__(self) -> str:
+        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
+
+
+    def clone(self, name: Optional[str] = None,
+              kind: Optional[str] = None,
+              suffix: Optional[str] = None,
+              attr: Optional[Mapping[str, str]] = None) -> 'PlaceName':
+        """ Create a deep copy of the place name, optionally with the
+            given parameters replaced. In the attribute list only the given
+            keys are updated. The list is not replaced completely.
+            In particular, the function cannot be used to remove an
+            attribute from a place name.
+        """
+        newobj = PlaceName(name or self.name,
+                           kind or self.kind,
+                           suffix or self.suffix)
+
+        newobj.attr.update(self.attr)
+        if attr:
+            newobj.attr.update(attr)
+
+        return newobj
+
+
+    def set_attr(self, key: str, value: str) -> None:
+        """ Add the given property to the name. If the property was already
+            set, then the value is overwritten.
+        """
+        self.attr[key] = value
+
+
+    def get_attr(self, key: str, default: Optional[str] = None) -> Optional[str]:
+        """ Return the given property or the value of 'default' if it
+            is not set.
+        """
+        return self.attr.get(key, default)
+
+
+    def has_attr(self, key: str) -> bool:
+        """ Check if the given attribute is set.
+        """
+        return key in self.attr
+
+
+class ProcessInfo:
+    """ Container class for information handed to handler functions.
+
+        The 'names' and 'address' members are mutable. A handler must change
+        them by either modifying the lists in place or replacing the old content
+        with a new list.
+    """
+
+    def __init__(self, place: PlaceInfo):
+        self.place: Final = place
+        self.names = self._convert_name_dict(place.name)
+        self.address = self._convert_name_dict(place.address)
+
+
+    @staticmethod
+    def _convert_name_dict(names: Optional[Mapping[str, str]]) -> List[PlaceName]:
+        """ Convert a dictionary of names into a list of PlaceNames.
+
+            The dictionary key is split into the primary part of the key
+            and the suffix (the part after an optional colon).
+        """
+        out = []
+
+        if names:
+            for key, value in names.items():
+                parts = key.split(':', 1)
+                out.append(PlaceName(value.strip(),
+                                     parts[0].strip(),
+                                     parts[1].strip() if len(parts) > 1 else None))
+
+        return out
+
+
+class SanitizerHandler(Protocol):
+    """ Protocol for sanitizer modules.
+    """
+
+    def create(self, config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
+        """
+        A sanitizer must define a single function `create`. It takes the
+        dictionary with the configuration information for the sanitizer and
+        returns a function that transforms name and address.
+        """
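For illustration, a minimal sanitizer module written against this interface might look as follows. The lower-casing step is made up for the example; only the `ProcessInfo`, `PlaceName.clone()` and `SanitizerConfig` APIs come from the file above:

"""
Hypothetical sanitizer that lower-cases all name values.
"""
from typing import Callable

from nominatim.tokenizer.sanitizers.base import ProcessInfo
from nominatim.tokenizer.sanitizers.config import SanitizerConfig


def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Create a name processing function that lower-cases all names.
    """
    def _process(obj: ProcessInfo) -> None:
        # clone() keeps kind, suffix and attributes intact; replacing
        # obj.names wholesale is explicitly allowed by ProcessInfo.
        obj.names = [n.clone(name=n.name.lower()) for n in obj.names]

    return _process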
@@ -24,11 +24,15 @@ Arguments:
                  or a list of strings, where each string is a regular
                  expression that must match the full house number value.
 """
+from typing import Callable, Iterator, List
 import re

+from nominatim.tokenizer.sanitizers.base import ProcessInfo, PlaceName
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig

 class _HousenumberSanitizer:

-    def __init__(self, config):
+    def __init__(self, config: SanitizerConfig) -> None:
         self.filter_kind = config.get_filter_kind('housenumber')
         self.split_regexp = config.get_delimiter()

@@ -37,13 +41,13 @@ class _HousenumberSanitizer:



-    def __call__(self, obj):
+    def __call__(self, obj: ProcessInfo) -> None:
         if not obj.address:
             return

-        new_address = []
+        new_address: List[PlaceName] = []
         for item in obj.address:
-            if self.filter_kind(item):
+            if self.filter_kind(item.kind):
                 if self._treat_as_name(item.name):
                     obj.names.append(item.clone(kind='housenumber'))
                 else:

@@ -56,7 +60,7 @@ class _HousenumberSanitizer:
         obj.address = new_address


-    def sanitize(self, value):
+    def sanitize(self, value: str) -> Iterator[str]:
         """ Extract housenumbers in a regularized format from an OSM value.

             The function works as a generator that yields all valid housenumbers

@@ -67,16 +71,15 @@ class _HousenumberSanitizer:
             yield from self._regularize(hnr)


-    @staticmethod
-    def _regularize(hnr):
+    def _regularize(self, hnr: str) -> Iterator[str]:
         yield hnr


-    def _treat_as_name(self, housenumber):
+    def _treat_as_name(self, housenumber: str) -> bool:
         return any(r.fullmatch(housenumber) is not None for r in self.is_name_regexp)


-def create(config):
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
     """ Create a housenumber processing function.
     """
@@ -20,11 +20,15 @@ Arguments:
                  objects that have no country assigned. These are always
                  assumed to have no postcode.
 """
+from typing import Callable, Optional, Tuple

 from nominatim.data.postcode_format import PostcodeFormatter
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig

 class _PostcodeSanitizer:

-    def __init__(self, config):
+    def __init__(self, config: SanitizerConfig) -> None:
         self.convert_to_address = config.get_bool('convert-to-address', True)
         self.matcher = PostcodeFormatter()

@@ -33,7 +37,7 @@ class _PostcodeSanitizer:
             self.matcher.set_default_pattern(default_pattern)


-    def __call__(self, obj):
+    def __call__(self, obj: ProcessInfo) -> None:
         if not obj.address:
             return

@@ -52,7 +56,7 @@ class _PostcodeSanitizer:
             postcode.set_attr('variant', formatted[1])


-    def scan(self, postcode, country):
+    def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]:
         """ Check the postcode for correct formatting and return the
             normalized version. Returns None if the postcode does not
             correspond to the official format of the given country.

@@ -61,13 +65,15 @@ class _PostcodeSanitizer:
         if match is None:
             return None

+        assert country is not None

         return self.matcher.normalize(country, match),\
                ' '.join(filter(lambda p: p is not None, match.groups()))


-def create(config):
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
     """ Create a postcode processing function.
     """
@@ -7,20 +7,28 @@
 """
 Configuration for Sanitizers.
 """
+from typing import Sequence, Optional, Pattern, Callable, Any, TYPE_CHECKING
 from collections import UserDict
 import re

 from nominatim.errors import UsageError

-class SanitizerConfig(UserDict):
+# working around missing generics in Python < 3.8
+# See https://github.com/python/typing/issues/60#issuecomment-869757075
+if TYPE_CHECKING:
+    _BaseUserDict = UserDict[str, Any]
+else:
+    _BaseUserDict = UserDict
+
+class SanitizerConfig(_BaseUserDict):
     """ Dictionary with configuration options for a sanitizer.

-        In addition to the usualy dictionary function, the class provides
+        In addition to the usual dictionary function, the class provides
         accessors to standard sanitizer options that are used by many of the
         sanitizers.
     """

-    def get_string_list(self, param, default=tuple()):
+    def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]:
         """ Extract a configuration parameter as a string list.
             If the parameter value is a simple string, it is returned as a
             one-item list. If the parameter value does not exist, the given

@@ -44,7 +52,7 @@ class SanitizerConfig(UserDict):
         return values


-    def get_bool(self, param, default=None):
+    def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
         """ Extract a configuration parameter as a boolean.
             The parameter must be one of the yaml boolean values or an
             user error will be raised. If `default` is given, then the parameter

@@ -58,7 +66,7 @@ class SanitizerConfig(UserDict):
         return value


-    def get_delimiter(self, default=',;'):
+    def get_delimiter(self, default: str = ',;') -> Pattern[str]:
         """ Return the 'delimiter' parameter in the configuration as a
             compiled regular expression that can be used to split the names on the
             delimiters. The regular expression makes sure that the resulting names

@@ -76,7 +84,7 @@ class SanitizerConfig(UserDict):
         return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))


-    def get_filter_kind(self, *default):
+    def get_filter_kind(self, *default: str) -> Callable[[str], bool]:
         """ Return a filter function for the name kind from the 'filter-kind'
             config parameter. The filter function takes a name item and returns
             True when the item passes the filter.

@@ -93,4 +101,4 @@ class SanitizerConfig(UserDict):

         regexes = [re.compile(regex) for regex in filters]

-        return lambda name: any(regex.fullmatch(name.kind) for regex in regexes)
+        return lambda name: any(regex.fullmatch(name) for regex in regexes)
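The `_BaseUserDict` indirection above is a general recipe: on older Python versions subscripting `UserDict` at runtime raises a `TypeError`, so the generic form is only spelled out when `TYPE_CHECKING` is true. A condensed sketch of the same recipe applied to a hypothetical options class (not from the patch):

from collections import UserDict
from typing import Any, TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by the type checker: str keys, Any values.
    _Base = UserDict[str, Any]
else:
    # Evaluated at runtime, where UserDict[...] may not be subscriptable.
    _Base = UserDict

class TypedOptions(_Base):
    def get_str(self, key: str, default: str = '') -> str:
        # self.data is the backing dict provided by UserDict.
        return str(self.data.get(key, default))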
@@ -11,13 +11,18 @@ Arguments:
     delimiters: Define the set of characters to be used for
                 splitting the list. (default: ',;')
 """
-def create(config):
+from typing import Callable
+
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
     """ Create a name processing function that splits name values with
         multiple values into their components.
     """
     regexp = config.get_delimiter()

-    def _process(obj):
+    def _process(obj: ProcessInfo) -> None:
         if not obj.names:
             return
@@ -9,12 +9,17 @@ This sanitizer creates additional name variants for names that have
 addendums in brackets (e.g. "Halle (Saale)"). The additional variant contains
 only the main name part with the bracket part removed.
 """
+from typing import Callable

-def create(_):
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig
+
+
+def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
     """ Create a name processing function that creates additional name variants
         for bracket addendums.
     """
-    def _process(obj):
+    def _process(obj: ProcessInfo) -> None:
         """ Add variants for names that have a bracket extension.
         """
         if obj.names:
@@ -30,13 +30,17 @@ Arguments:
                     any analyzer tagged) is retained. (default: replace)

 """
+from typing import Callable, Dict, Optional, List

 from nominatim.data import country_info
+from nominatim.tokenizer.sanitizers.base import ProcessInfo
+from nominatim.tokenizer.sanitizers.config import SanitizerConfig

 class _AnalyzerByLanguage:
     """ Processor for tagging the language of names in a place.
     """

-    def __init__(self, config):
+    def __init__(self, config: SanitizerConfig) -> None:
         self.filter_kind = config.get_filter_kind()
         self.replace = config.get('mode', 'replace') != 'append'
         self.whitelist = config.get('whitelist')

@@ -44,8 +48,8 @@ class _AnalyzerByLanguage:
         self._compute_default_languages(config.get('use-defaults', 'no'))


-    def _compute_default_languages(self, use_defaults):
-        self.deflangs = {}
+    def _compute_default_languages(self, use_defaults: str) -> None:
+        self.deflangs: Dict[Optional[str], List[str]] = {}

         if use_defaults in ('mono', 'all'):
             for ccode, clangs in country_info.iterate('languages'):

@@ -56,21 +60,21 @@ class _AnalyzerByLanguage:
                     self.deflangs[ccode] = clangs


-    def _suffix_matches(self, suffix):
+    def _suffix_matches(self, suffix: str) -> bool:
         if self.whitelist is None:
             return len(suffix) in (2, 3) and suffix.islower()

         return suffix in self.whitelist


-    def __call__(self, obj):
+    def __call__(self, obj: ProcessInfo) -> None:
         if not obj.names:
             return

         more_names = []

         for name in (n for n in obj.names
-                     if not n.has_attr('analyzer') and self.filter_kind(n)):
+                     if not n.has_attr('analyzer') and self.filter_kind(n.kind)):
             if name.suffix:
                 langs = [name.suffix] if self._suffix_matches(name.suffix) else None
             else:

@@ -88,7 +92,7 @@ class _AnalyzerByLanguage:
         obj.names.extend(more_names)


-def create(config):
+def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
     """ Create a function that sets the analyzer property depending on the
         language of the tag.
     """
42
nominatim/tokenizer/token_analysis/base.py
Normal file

@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Common data types and protocols for analysers.
+"""
+from typing import Mapping, List, Any
+
+from nominatim.typing import Protocol
+
+class Analyser(Protocol):
+    """ Instance of the token analyser.
+    """
+
+    def normalize(self, name: str) -> str:
+        """ Return the normalized form of the name. This is the standard form
+            from which possible variants for the name can be derived.
+        """
+
+    def get_variants_ascii(self, norm_name: str) -> List[str]:
+        """ Compute the spelling variants for the given normalized name
+            and transliterate the result.
+        """
+
+class AnalysisModule(Protocol):
+    """ Protocol for analysis modules.
+    """
+
+    def configure(self, rules: Mapping[str, Any], normalization_rules: str) -> Any:
+        """ Prepare the configuration of the analysis module.
+            This function should prepare all data that can be shared
+            between instances of this analyser.
+        """
+
+    def create(self, normalizer: Any, transliterator: Any, config: Any) -> Analyser:
+        """ Create a new instance of the analyser.
+            A separate instance of the analyser is created for each thread
+            when used in multi-threading context.
+        """
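Taken together, the two protocols pin down the plug-in lifecycle: `configure()` runs once per module, `create()` once per worker thread, and the resulting `Analyser` does the per-name work. A hypothetical driver illustrating that order (the function and its arguments are illustrative only):

from typing import Any, Dict, List, Mapping

from nominatim.tokenizer.token_analysis.base import AnalysisModule

def run_module(module: AnalysisModule, normalizer: Any, transliterator: Any,
               rules: Mapping[str, Any], names: List[str]) -> Dict[str, List[str]]:
    # One shared configuration ...
    config = module.configure(rules, normalization_rules='')
    # ... one analyser instance (per thread, in real use) ...
    analyser = module.create(normalizer, transliterator, config)
    # ... then per-name normalization and variant expansion.
    return {name: analyser.get_variants_ascii(analyser.normalize(name))
            for name in names}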
@@ -7,7 +7,8 @@
 """
 Parser for configuration for variants.
 """
-from collections import defaultdict, namedtuple
+from typing import Any, Iterator, Tuple, List, Optional, Set, NamedTuple
+from collections import defaultdict
 import itertools
 import re

@@ -16,9 +17,15 @@ from icu import Transliterator
 from nominatim.config import flatten_config_list
 from nominatim.errors import UsageError

-ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
+class ICUVariant(NamedTuple):
+    """ A single replacement rule for variant creation.
+    """
+    source: str
+    replacement: str

-def get_variant_config(rules, normalization_rules):
+
+def get_variant_config(in_rules: Any,
+                       normalization_rules: str) -> Tuple[List[Tuple[str, List[str]]], str]:
     """ Convert the variant definition from the configuration into
         replacement sets.

@@ -26,11 +33,11 @@ def get_variant_config(rules, normalization_rules):
         used in the replacements.
     """
     immediate = defaultdict(list)
-    chars = set()
+    chars: Set[str] = set()

-    if rules:
-        vset = set()
-        rules = flatten_config_list(rules, 'variants')
+    if in_rules:
+        vset: Set[ICUVariant] = set()
+        rules = flatten_config_list(in_rules, 'variants')

         vmaker = _VariantMaker(normalization_rules)

@@ -56,12 +63,12 @@ class _VariantMaker:
         All text in rules is normalized to make sure the variants match later.
     """

-    def __init__(self, norm_rules):
+    def __init__(self, norm_rules: Any) -> None:
         self.norm = Transliterator.createFromRules("rule_loader_normalization",
                                                    norm_rules)


-    def compute(self, rule):
+    def compute(self, rule: Any) -> Iterator[ICUVariant]:
         """ Generator for all ICUVariant tuples from a single variant rule.
         """
         parts = re.split(r'(\|)?([=-])>', rule)

@@ -85,7 +92,7 @@ class _VariantMaker:
             yield ICUVariant(froms, tos)


-    def _parse_variant_word(self, name):
+    def _parse_variant_word(self, name: str) -> Optional[Tuple[str, str, str]]:
         name = name.strip()
         match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
         if match is None or (match.group(1) == '~' and match.group(3) == '~'):

@@ -102,7 +109,8 @@ _FLAG_MATCH = {'^': '^ ',
                '': ' '}


-def _create_variants(src, preflag, postflag, repl, decompose):
+def _create_variants(src: str, preflag: str, postflag: str,
+                     repl: str, decompose: bool) -> Iterator[Tuple[str, str]]:
     if preflag == '~':
         postfix = _FLAG_MATCH[postflag]
         # suffix decomposition
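Swapping `collections.namedtuple` for a `typing.NamedTuple` subclass, as done for `ICUVariant` above, is a mechanical migration that gives every field a declared type while keeping full tuple behaviour at runtime. A self-contained sketch (the `Variant` class is an example, not code from the patch):

from typing import NamedTuple

class Variant(NamedTuple):
    """ A typed source/replacement pair; still an ordinary tuple. """
    source: str
    replacement: str

v = Variant('strasse', 'str')
assert v.source == 'strasse'   # attribute access is now type-checked
assert v[1] == 'str'           # index access still works as before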
@@ -7,6 +7,7 @@
 """
 Generic processor for names that creates abbreviation variants.
 """
+from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
 import itertools

 import datrie

@@ -17,10 +18,10 @@ from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantG

 ### Configuration section

-def configure(rules, normalization_rules):
+def configure(rules: Mapping[str, Any], normalization_rules: str) -> Dict[str, Any]:
     """ Extract and preprocess the configuration for this module.
     """
-    config = {}
+    config: Dict[str, Any] = {}

     config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
                                                                  normalization_rules)

@@ -47,7 +48,8 @@ def configure(rules, normalization_rules):

 ### Analysis section

-def create(normalizer, transliterator, config):
+def create(normalizer: Any, transliterator: Any,
+           config: Mapping[str, Any]) -> 'GenericTokenAnalysis':
     """ Create a new token analysis instance for this module.
     """
     return GenericTokenAnalysis(normalizer, transliterator, config)

@@ -58,7 +60,7 @@ class GenericTokenAnalysis:
         and provides the functions to apply the transformations.
     """

-    def __init__(self, norm, to_ascii, config):
+    def __init__(self, norm: Any, to_ascii: Any, config: Mapping[str, Any]) -> None:
         self.norm = norm
         self.to_ascii = to_ascii
         self.variant_only = config['variant_only']

@@ -75,14 +77,14 @@ class GenericTokenAnalysis:
         self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]


-    def normalize(self, name):
+    def normalize(self, name: str) -> str:
         """ Return the normalized form of the name. This is the standard form
             from which possible variants for the name can be derived.
         """
-        return self.norm.transliterate(name).strip()
+        return cast(str, self.norm.transliterate(name)).strip()


-    def get_variants_ascii(self, norm_name):
+    def get_variants_ascii(self, norm_name: str) -> List[str]:
         """ Compute the spelling variants for the given normalized name
             and transliterate the result.
         """

@@ -94,7 +96,8 @@ class GenericTokenAnalysis:
         return [name for name in self._transliterate_unique_list(norm_name, variants) if name]


-    def _transliterate_unique_list(self, norm_name, iterable):
+    def _transliterate_unique_list(self, norm_name: str,
+                                   iterable: Iterable[str]) -> Iterator[Optional[str]]:
         seen = set()
         if self.variant_only:
             seen.add(norm_name)

@@ -105,7 +108,7 @@ class GenericTokenAnalysis:
             yield self.to_ascii.transliterate(variant).strip()


-    def _generate_word_variants(self, norm_name):
+    def _generate_word_variants(self, norm_name: str) -> Iterable[str]:
         baseform = '^ ' + norm_name + ' ^'
         baselen = len(baseform)
         partials = ['']
@@ -7,6 +7,7 @@
 """
 Creator for mutation variants for the generic token analysis.
 """
+from typing import Sequence, Iterable, Iterator, Tuple
 import itertools
 import logging
 import re

@@ -15,7 +16,7 @@ from nominatim.errors import UsageError

 LOG = logging.getLogger()

-def _zigzag(outer, inner):
+def _zigzag(outer: Iterable[str], inner: Iterable[str]) -> Iterator[str]:
     return itertools.chain.from_iterable(itertools.zip_longest(outer, inner, fillvalue=''))


@@ -26,7 +27,7 @@ class MutationVariantGenerator:
         patterns.
     """

-    def __init__(self, pattern, replacements):
+    def __init__(self, pattern: str, replacements: Sequence[str]):
         self.pattern = re.compile(pattern)
         self.replacements = replacements

@@ -36,7 +37,7 @@ class MutationVariantGenerator:
             raise UsageError("Bad mutation pattern in configuration.")


-    def generate(self, names):
+    def generate(self, names: Iterable[str]) -> Iterator[str]:
         """ Generator function for the name variants. 'names' is an iterable
             over a set of names for which the variants are to be generated.
         """

@@ -49,7 +50,7 @@ class MutationVariantGenerator:
             yield ''.join(_zigzag(parts, seps))


-    def _fillers(self, num_parts):
+    def _fillers(self, num_parts: int) -> Iterator[Tuple[str, ...]]:
         """ Returns a generator for strings to join the given number of string
             parts in all possible combinations.
         """
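`_zigzag` interleaves name fragments with separator fillers, which is how `generate()` splices each candidate separator back between the split parts. A quick illustration of what it produces, using the definition above:

import itertools
from typing import Iterable, Iterator

def _zigzag(outer: Iterable[str], inner: Iterable[str]) -> Iterator[str]:
    return itertools.chain.from_iterable(
        itertools.zip_longest(outer, inner, fillvalue=''))

# Two parts rejoined with each candidate separator:
print(''.join(_zigzag(['12', 'a'], [' '])))   # -> '12 a'
print(''.join(_zigzag(['12', 'a'], [''])))    # -> '12a'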
@@ -8,6 +8,7 @@
 Specialized processor for housenumbers. Analyses common housenumber patterns
 and creates variants for them.
 """
+from typing import Mapping, Any, List, cast
 import re

 from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator

@@ -19,14 +20,14 @@ RE_NAMED_PART = re.compile(r'[a-z]{4}')

 ### Configuration section

-def configure(rules, normalization_rules): # pylint: disable=W0613
+def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613
     """ All behaviour is currently hard-coded.
     """
     return None

 ### Analysis section

-def create(normalizer, transliterator, config): # pylint: disable=W0613
+def create(normalizer: Any, transliterator: Any, config: None) -> 'HousenumberTokenAnalysis': # pylint: disable=W0613
     """ Create a new token analysis instance for this module.
     """
     return HousenumberTokenAnalysis(normalizer, transliterator)

@@ -35,20 +36,20 @@ def create(normalizer, transliterator, config): # pylint: disable=W0613
 class HousenumberTokenAnalysis:
     """ Detects common housenumber patterns and normalizes them.
     """
-    def __init__(self, norm, trans):
+    def __init__(self, norm: Any, trans: Any) -> None:
         self.norm = norm
         self.trans = trans

         self.mutator = MutationVariantGenerator('␣', (' ', ''))

-    def normalize(self, name):
+    def normalize(self, name: str) -> str:
         """ Return the normalized form of the housenumber.
         """
         # shortcut for number-only numbers, which make up 90% of the data.
         if RE_NON_DIGIT.search(name) is None:
             return name

-        norm = self.trans.transliterate(self.norm.transliterate(name))
+        norm = cast(str, self.trans.transliterate(self.norm.transliterate(name)))
         # If there is a significant non-numeric part, use as is.
         if RE_NAMED_PART.search(norm) is None:
             # Otherwise add optional spaces between digits and letters.

@@ -60,7 +61,7 @@ class HousenumberTokenAnalysis:

         return norm

-    def get_variants_ascii(self, norm_name):
+    def get_variants_ascii(self, norm_name: str) -> List[str]:
         """ Compute the spelling variants for the given normalized housenumber.

             Generates variants for optional spaces (marked with '␣').
@@ -8,19 +8,20 @@
 Specialized processor for postcodes. Supports a 'lookup' variant of the
 token, which produces variants with optional spaces.
 """
+from typing import Mapping, Any, List

 from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator

 ### Configuration section

-def configure(rules, normalization_rules): # pylint: disable=W0613
+def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613
     """ All behaviour is currently hard-coded.
     """
     return None

 ### Analysis section

-def create(normalizer, transliterator, config): # pylint: disable=W0613
+def create(normalizer: Any, transliterator: Any, config: None) -> 'PostcodeTokenAnalysis': # pylint: disable=W0613
     """ Create a new token analysis instance for this module.
     """
     return PostcodeTokenAnalysis(normalizer, transliterator)

@@ -38,20 +39,20 @@ class PostcodeTokenAnalysis:
         and transliteration, so that postcodes are correctly recognised by
         the search algorithm.
     """
-    def __init__(self, norm, trans):
+    def __init__(self, norm: Any, trans: Any) -> None:
         self.norm = norm
         self.trans = trans

         self.mutator = MutationVariantGenerator(' ', (' ', ''))


-    def normalize(self, name):
+    def normalize(self, name: str) -> str:
         """ Return the standard form of the postcode.
         """
         return name.strip().upper()


-    def get_variants_ascii(self, norm_name):
+    def get_variants_ascii(self, norm_name: str) -> List[str]:
         """ Compute the spelling variants for the given normalized postcode.

             Takes the canonical form of the postcode, normalizes it using the
@@ -7,6 +7,7 @@
 """
 Function to add additional OSM data from a file or the API into the database.
 """
+from typing import Any, MutableMapping
 from pathlib import Path
 import logging
 import urllib

@@ -15,7 +16,7 @@ from nominatim.tools.exec_utils import run_osm2pgsql, get_url

 LOG = logging.getLogger()

-def add_data_from_file(fname, options):
+def add_data_from_file(fname: str, options: MutableMapping[str, Any]) -> int:
     """ Adds data from an OSM file to the database. The file may be a normal
         OSM file or a diff file in all formats supported by libosmium.
     """

@@ -27,7 +28,8 @@ def add_data_from_file(fname, options):
     return 0


-def add_osm_object(osm_type, osm_id, use_main_api, options):
+def add_osm_object(osm_type: str, osm_id: int, use_main_api: bool,
+                   options: MutableMapping[str, Any]) -> int:
     """ Add or update a single OSM object from the latest version of the
         API.
     """

@@ -50,3 +52,5 @@ def add_osm_object(osm_type, osm_id, use_main_api, options):
         options['import_data'] = get_url(base_url).encode('utf-8')

     run_osm2pgsql(options)
+
+    return 0
@@ -7,22 +7,27 @@
 """
 Functions for database analysis and maintenance.
 """
+from typing import Optional, Tuple, Any, cast
 import logging

 from psycopg2.extras import Json, register_hstore

-from nominatim.db.connection import connect
+from nominatim.config import Configuration
+from nominatim.db.connection import connect, Cursor
 from nominatim.tokenizer import factory as tokenizer_factory
 from nominatim.errors import UsageError
 from nominatim.data.place_info import PlaceInfo
+from nominatim.typing import DictCursorResult

 LOG = logging.getLogger()

-def _get_place_info(cursor, osm_id, place_id):
+def _get_place_info(cursor: Cursor, osm_id: Optional[str],
+                    place_id: Optional[int]) -> DictCursorResult:
     sql = """SELECT place_id, extra.*
              FROM placex, LATERAL placex_indexing_prepare(placex) as extra
           """

+    values: Tuple[Any, ...]
     if osm_id:
         osm_type = osm_id[0].upper()
         if osm_type not in 'NWR' or not osm_id[1:].isdigit():

@@ -44,10 +49,11 @@ def _get_place_info(cursor, osm_id, place_id):
         LOG.fatal("OSM object %s not found in database.", osm_id)
         raise UsageError("OSM object not found")

-    return cursor.fetchone()
+    return cast(DictCursorResult, cursor.fetchone()) # type: ignore[no-untyped-call]


-def analyse_indexing(config, osm_id=None, place_id=None):
+def analyse_indexing(config: Configuration, osm_id: Optional[str] = None,
+                     place_id: Optional[int] = None) -> None:
     """ Analyse indexing of a single Nominatim object.
     """
     with connect(config.get_libpq_dsn()) as conn:
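Declaring `values: Tuple[Any, ...]` before the branch, as `_get_place_info()` now does, is the usual way to let mypy accept differently shaped tuples assigned in different branches. Reduced to its core (the function below is made up for illustration):

from typing import Any, Tuple

def build_params(osm_id: str = '', place_id: int = 0) -> Tuple[Any, ...]:
    values: Tuple[Any, ...]   # declared once, assigned per branch
    if osm_id:
        values = (osm_id[0].upper(), int(osm_id[1:]))  # two elements
    else:
        values = (place_id,)                           # one element
    return values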
@@ -7,10 +7,12 @@
 """
 Collection of functions that check if the database is complete and functional.
 """
+from typing import Callable, Optional, Any, Union, Tuple, Mapping, List
 from enum import Enum
 from textwrap import dedent

-from nominatim.db.connection import connect
+from nominatim.config import Configuration
+from nominatim.db.connection import connect, Connection
 from nominatim.errors import UsageError
 from nominatim.tokenizer import factory as tokenizer_factory

@@ -25,14 +27,17 @@ class CheckState(Enum):
     NOT_APPLICABLE = 3
     WARN = 4

-def _check(hint=None):
+CheckResult = Union[CheckState, Tuple[CheckState, Mapping[str, Any]]]
+CheckFunc = Callable[[Connection, Configuration], CheckResult]
+
+def _check(hint: Optional[str] = None) -> Callable[[CheckFunc], CheckFunc]:
     """ Decorator for checks. It adds the function to the list of
         checks to execute and adds the code for printing progress messages.
     """
-    def decorator(func):
-        title = func.__doc__.split('\n', 1)[0].strip()
+    def decorator(func: CheckFunc) -> CheckFunc:
+        title = (func.__doc__ or '').split('\n', 1)[0].strip()

-        def run_check(conn, config):
+        def run_check(conn: Connection, config: Configuration) -> CheckState:
             print(title, end=' ... ')
             ret = func(conn, config)
             if isinstance(ret, tuple):
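The `CheckFunc` alias plus the `Callable[[CheckFunc], CheckFunc]` return type is the standard shape for a parameterized decorator under mypy. A stripped-down, runnable version of the same pattern with stand-in types (not the patch's actual check machinery):

from typing import Callable, Optional

CheckFunc = Callable[[str], int]

def _check(hint: Optional[str] = None) -> Callable[[CheckFunc], CheckFunc]:
    def decorator(func: CheckFunc) -> CheckFunc:
        def run_check(arg: str) -> int:
            # Guard against a missing docstring, as the patch does.
            print((func.__doc__ or '').split('\n', 1)[0].strip(), end=' ... ')
            return func(arg)
        return run_check
    return decorator

@_check(hint="check failed")
def my_check(arg: str) -> int:
    """ Checking something """
    return len(arg)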
@@ -61,20 +66,20 @@ def _check(hint=None):
|
|||||||
|
|
||||||
class _BadConnection:
|
class _BadConnection:
|
||||||
|
|
||||||
def __init__(self, msg):
|
def __init__(self, msg: str) -> None:
|
||||||
self.msg = msg
|
self.msg = msg
|
||||||
|
|
||||||
def close(self):
|
def close(self) -> None:
|
||||||
""" Dummy function to provide the implementation.
|
""" Dummy function to provide the implementation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def check_database(config):
|
def check_database(config: Configuration) -> int:
|
||||||
""" Run a number of checks on the database and return the status.
|
""" Run a number of checks on the database and return the status.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
conn = connect(config.get_libpq_dsn()).connection
|
conn = connect(config.get_libpq_dsn()).connection
|
||||||
except UsageError as err:
|
except UsageError as err:
|
||||||
conn = _BadConnection(str(err))
|
conn = _BadConnection(str(err)) # type: ignore[assignment]
|
||||||
|
|
||||||
overall_result = 0
|
overall_result = 0
|
||||||
for check in CHECKLIST:
|
for check in CHECKLIST:
|
||||||
@@ -89,7 +94,7 @@ def check_database(config):
     return overall_result
 
 
-def _get_indexes(conn):
+def _get_indexes(conn: Connection) -> List[str]:
     indexes = ['idx_place_addressline_address_place_id',
                'idx_placex_rank_search',
                'idx_placex_rank_address',
@@ -131,7 +136,7 @@ def _get_indexes(conn):
             Project directory: {config.project_dir}
             Current setting of NOMINATIM_DATABASE_DSN: {config.DATABASE_DSN}
          """)
-def check_connection(conn, config):
+def check_connection(conn: Any, config: Configuration) -> CheckResult:
     """ Checking database connection
     """
     if isinstance(conn, _BadConnection):
@@ -149,7 +154,7 @@ def check_connection(conn, config):
             Project directory: {config.project_dir}
             Current setting of NOMINATIM_DATABASE_DSN: {config.DATABASE_DSN}
          """)
-def check_placex_table(conn, config):
+def check_placex_table(conn: Connection, config: Configuration) -> CheckResult:
     """ Checking for placex table
     """
     if conn.table_exists('placex'):
@@ -159,7 +164,7 @@ def check_placex_table(conn, config):
 
 
 @_check(hint="""placex table has no data. Did the import finish sucessfully?""")
-def check_placex_size(conn, _):
+def check_placex_size(conn: Connection, _: Configuration) -> CheckResult:
     """ Checking for placex content
     """
     with conn.cursor() as cur:
@@ -169,7 +174,7 @@ def check_placex_size(conn, _):
 
 
 @_check(hint="""{msg}""")
-def check_tokenizer(_, config):
+def check_tokenizer(_: Connection, config: Configuration) -> CheckResult:
     """ Checking that tokenizer works
     """
     try:
@@ -191,7 +196,7 @@ def check_tokenizer(_, config):
             Quality of search results may be degraded. Reverse geocoding is unaffected.
             See https://nominatim.org/release-docs/latest/admin/Import/#wikipediawikidata-rankings
          """)
-def check_existance_wikipedia(conn, _):
+def check_existance_wikipedia(conn: Connection, _: Configuration) -> CheckResult:
     """ Checking for wikipedia/wikidata data
     """
     if not conn.table_exists('search_name'):
@@ -208,7 +213,7 @@ def check_existance_wikipedia(conn, _):
 
             To index the remaining entries, run: {index_cmd}
          """)
-def check_indexing(conn, _):
+def check_indexing(conn: Connection, _: Configuration) -> CheckResult:
     """ Checking indexing status
     """
     with conn.cursor() as cur:
@@ -233,7 +238,7 @@ def check_indexing(conn, _):
 
             Rerun the index creation with: nominatim import --continue db-postprocess
          """)
-def check_database_indexes(conn, _):
+def check_database_indexes(conn: Connection, _: Configuration) -> CheckResult:
     """ Checking that database indexes are complete
     """
     missing = []
@@ -255,7 +260,7 @@ def check_database_indexes(conn, _):
             Invalid indexes:
               {indexes}
          """)
-def check_database_index_valid(conn, _):
+def check_database_index_valid(conn: Connection, _: Configuration) -> CheckResult:
     """ Checking that all database indexes are valid
     """
     with conn.cursor() as cur:
@@ -275,7 +280,7 @@ def check_database_index_valid(conn, _):
             {error}
 
             Run TIGER import again: nominatim add-data --tiger-data <DIR>
          """)
-def check_tiger_table(conn, config):
+def check_tiger_table(conn: Connection, config: Configuration) -> CheckResult:
     """ Checking TIGER external data table.
     """
     if not config.get_bool('USE_US_TIGER_DATA'):
@@ -7,6 +7,7 @@
 """
 Functions for setting up and importing a new Nominatim database.
 """
+from typing import Tuple, Optional, Union, Sequence, MutableMapping, Any
 import logging
 import os
 import selectors
@@ -16,7 +17,8 @@ from pathlib import Path
 import psutil
 from psycopg2 import sql as pysql
 
-from nominatim.db.connection import connect, get_pg_env
+from nominatim.config import Configuration
+from nominatim.db.connection import connect, get_pg_env, Connection
 from nominatim.db.async_connection import DBConnection
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.tools.exec_utils import run_osm2pgsql
@@ -25,7 +27,7 @@ from nominatim.version import POSTGRESQL_REQUIRED_VERSION, POSTGIS_REQUIRED_VERS
 
 LOG = logging.getLogger()
 
-def _require_version(module, actual, expected):
+def _require_version(module: str, actual: Tuple[int, int], expected: Tuple[int, int]) -> None:
     """ Compares the version for the given module and raises an exception
         if the actual version is too old.
     """
@@ -36,7 +38,7 @@ def _require_version(module, actual, expected):
         raise UsageError(f'{module} is too old.')
 
 
-def setup_database_skeleton(dsn, rouser=None):
+def setup_database_skeleton(dsn: str, rouser: Optional[str] = None) -> None:
     """ Create a new database for Nominatim and populate it with the
         essential extensions.
 
@@ -80,7 +82,9 @@ def setup_database_skeleton(dsn, rouser=None):
                      POSTGIS_REQUIRED_VERSION)
 
 
-def import_osm_data(osm_files, options, drop=False, ignore_errors=False):
+def import_osm_data(osm_files: Union[Path, Sequence[Path]],
+                    options: MutableMapping[str, Any],
+                    drop: bool = False, ignore_errors: bool = False) -> None:
     """ Import the given OSM files. 'options' contains the list of
         default settings for osm2pgsql.
     """
@@ -91,7 +95,7 @@ def import_osm_data(osm_files, options, drop=False, ignore_errors=False):
     if not options['flatnode_file'] and options['osm2pgsql_cache'] == 0:
         # Make some educated guesses about cache size based on the size
         # of the import file and the available memory.
-        mem = psutil.virtual_memory()
+        mem = psutil.virtual_memory() # type: ignore[no-untyped-call]
         fsize = 0
         if isinstance(osm_files, list):
            for fname in osm_files:
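Note how the new `Union[Path, Sequence[Path]]` annotation interacts with the `isinstance()` test in the hunk above: mypy narrows the union in each branch. A minimal sketch of the idiom (the helper is hypothetical, not part of the commit):

```python
from pathlib import Path
from typing import Sequence, Union

def total_file_size(osm_files: Union[Path, Sequence[Path]]) -> int:
    # isinstance() narrows the union: Path in the first branch,
    # Sequence[Path] in the second.
    if isinstance(osm_files, Path):
        return osm_files.stat().st_size
    return sum(f.stat().st_size for f in osm_files)
```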
@@ -117,7 +121,7 @@ def import_osm_data(osm_files, options, drop=False, ignore_errors=False):
             Path(options['flatnode_file']).unlink()
 
 
-def create_tables(conn, config, reverse_only=False):
+def create_tables(conn: Connection, config: Configuration, reverse_only: bool = False) -> None:
     """ Create the set of basic tables.
         When `reverse_only` is True, then the main table for searching will
        be skipped and only reverse search is possible.
@@ -128,7 +132,7 @@ def create_tables(conn, config, reverse_only=False):
     sql.run_sql_file(conn, 'tables.sql')
 
 
-def create_table_triggers(conn, config):
+def create_table_triggers(conn: Connection, config: Configuration) -> None:
     """ Create the triggers for the tables. The trigger functions must already
         have been imported with refresh.create_functions().
     """
@@ -136,14 +140,14 @@ def create_table_triggers(conn, config):
     sql.run_sql_file(conn, 'table-triggers.sql')
 
 
-def create_partition_tables(conn, config):
+def create_partition_tables(conn: Connection, config: Configuration) -> None:
     """ Create tables that have explicit partitioning.
     """
     sql = SQLPreprocessor(conn, config)
     sql.run_sql_file(conn, 'partition-tables.src.sql')
 
 
-def truncate_data_tables(conn):
+def truncate_data_tables(conn: Connection) -> None:
     """ Truncate all data tables to prepare for a fresh load.
     """
     with conn.cursor() as cur:
@@ -174,7 +178,7 @@ _COPY_COLUMNS = pysql.SQL(',').join(map(pysql.Identifier,
                                         'extratags', 'geometry')))
 
 
-def load_data(dsn, threads):
+def load_data(dsn: str, threads: int) -> None:
     """ Copy data into the word and placex table.
     """
     sel = selectors.DefaultSelector()
@@ -216,12 +220,12 @@ def load_data(dsn, threads):
         print('.', end='', flush=True)
     print('\n')
 
-    with connect(dsn) as conn:
-        with conn.cursor() as cur:
+    with connect(dsn) as syn_conn:
+        with syn_conn.cursor() as cur:
             cur.execute('ANALYSE')
 
 
-def create_search_indices(conn, config, drop=False):
+def create_search_indices(conn: Connection, config: Configuration, drop: bool = False) -> None:
     """ Create tables that have explicit partitioning.
     """
 
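The rename from `conn` to `syn_conn` in the hunk above is presumably needed because `load_data()` already binds `conn` to an asynchronous `DBConnection` earlier on, and under mypy a name keeps its first inferred type. A toy illustration with stand-in classes:

```python
class AsyncConnection:
    """ Stand-in for nominatim.db.async_connection.DBConnection. """

class SyncConnection:
    """ Stand-in for a plain psycopg2 connection. """

def demo() -> None:
    conn = AsyncConnection()     # mypy infers conn: AsyncConnection
    # conn = SyncConnection()    # would be an incompatible assignment
    syn_conn = SyncConnection()  # a fresh name gets its own inferred type
    print(type(conn).__name__, type(syn_conn).__name__)
```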
@@ -7,17 +7,22 @@
 """
 Helper functions for executing external programs.
 """
+from typing import Any, Union, Optional, Mapping, IO
+from pathlib import Path
 import logging
 import subprocess
 import urllib.request as urlrequest
 from urllib.parse import urlencode
 
+from nominatim.typing import StrPath
 from nominatim.version import version_str
 from nominatim.db.connection import get_pg_env
 
 LOG = logging.getLogger()
 
-def run_legacy_script(script, *args, nominatim_env=None, throw_on_fail=False):
+def run_legacy_script(script: StrPath, *args: Union[int, str],
+                      nominatim_env: Any,
+                      throw_on_fail: bool = False) -> int:
     """ Run a Nominatim PHP script with the given arguments.
 
         Returns the exit code of the script. If `throw_on_fail` is True
@@ -40,8 +45,10 @@ def run_legacy_script(script, *args, nominatim_env=None, throw_on_fail=False):
 
     return proc.returncode
 
-def run_api_script(endpoint, project_dir, extra_env=None, phpcgi_bin=None,
-                   params=None):
+def run_api_script(endpoint: str, project_dir: Path,
+                   extra_env: Optional[Mapping[str, str]] = None,
+                   phpcgi_bin: Optional[Path] = None,
+                   params: Optional[Mapping[str, Any]] = None) -> int:
     """ Execute a Nominatim API function.
 
         The function needs a project directory that contains the website
@@ -96,14 +103,14 @@ def run_api_script(endpoint, project_dir, extra_env=None, phpcgi_bin=None,
     return 0
 
 
-def run_php_server(server_address, base_dir):
+def run_php_server(server_address: str, base_dir: StrPath) -> None:
     """ Run the built-in server from the given directory.
     """
     subprocess.run(['/usr/bin/env', 'php', '-S', server_address],
                    cwd=str(base_dir), check=True)
 
 
-def run_osm2pgsql(options):
+def run_osm2pgsql(options: Mapping[str, Any]) -> None:
     """ Run osm2pgsql with the given options.
     """
     env = get_pg_env(options['dsn'])
@@ -147,13 +154,14 @@ def run_osm2pgsql(options):
                    env=env, check=True)
 
 
-def get_url(url):
+def get_url(url: str) -> str:
     """ Get the contents from the given URL and return it as a UTF-8 string.
     """
     headers = {"User-Agent": f"Nominatim/{version_str()}"}
 
     try:
-        with urlrequest.urlopen(urlrequest.Request(url, headers=headers)) as response:
+        request = urlrequest.Request(url, headers=headers)
+        with urlrequest.urlopen(request) as response: # type: IO[bytes]
             return response.read().decode('utf-8')
     except Exception:
         LOG.fatal('Failed to load URL: %s', url)
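Splitting the request out of the `with` line above is what makes room for the `# type:` comment: `urlopen()` is only loosely typed, so the target is pinned to `IO[bytes]` explicitly. A sketch of the idiom (hypothetical wrapper):

```python
import urllib.request as urlrequest
from typing import IO

def fetch(url: str) -> str:
    request = urlrequest.Request(url, headers={'User-Agent': 'example/1.0'})
    # A type comment may annotate the target of a with-statement.
    with urlrequest.urlopen(request) as response:  # type: IO[bytes]
        return response.read().decode('utf-8')
```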
@@ -7,10 +7,13 @@
 """
 Functions for removing unnecessary data from the database.
 """
+from typing import Optional
 from pathlib import Path
 
 from psycopg2 import sql as pysql
 
+from nominatim.db.connection import Connection
+
 UPDATE_TABLES = [
     'address_levels',
     'gb_postcode',
@@ -25,7 +28,7 @@ UPDATE_TABLES = [
     'wikipedia_%'
 ]
 
-def drop_update_tables(conn):
+def drop_update_tables(conn: Connection) -> None:
     """ Drop all tables only necessary for updating the database from
         OSM replication data.
     """
@@ -42,10 +45,8 @@ def drop_update_tables(conn):
     conn.commit()
 
 
-def drop_flatnode_file(fname):
+def drop_flatnode_file(fpath: Optional[Path]) -> None:
     """ Remove the flatnode file if it exists.
     """
-    if fname:
-        fpath = Path(fname)
-        if fpath.exists():
-            fpath.unlink()
+    if fpath and fpath.exists():
+        fpath.unlink()
@@ -7,12 +7,14 @@
 """
 Functions for database migration to newer software versions.
 """
+from typing import List, Tuple, Callable, Any
 import logging
 
 from psycopg2 import sql as pysql
 
+from nominatim.config import Configuration
 from nominatim.db import properties
-from nominatim.db.connection import connect
+from nominatim.db.connection import connect, Connection
 from nominatim.version import NOMINATIM_VERSION, version_str
 from nominatim.tools import refresh
 from nominatim.tokenizer import factory as tokenizer_factory
@@ -20,9 +22,11 @@ from nominatim.errors import UsageError
 
 LOG = logging.getLogger()
 
-_MIGRATION_FUNCTIONS = []
+VersionTuple = Tuple[int, int, int, int]
+
+_MIGRATION_FUNCTIONS : List[Tuple[VersionTuple, Callable[..., None]]] = []
 
-def migrate(config, paths):
+def migrate(config: Configuration, paths: Any) -> int:
     """ Check for the current database version and execute migrations,
         if necesssary.
     """
@@ -48,7 +52,8 @@ def migrate(config, paths):
     has_run_migration = False
     for version, func in _MIGRATION_FUNCTIONS:
         if db_version <= version:
-            LOG.warning("Runnning: %s (%s)", func.__doc__.split('\n', 1)[0],
+            title = func.__doc__ or ''
+            LOG.warning("Runnning: %s (%s)", title.split('\n', 1)[0],
                         version_str(version))
             kwargs = dict(conn=conn, config=config, paths=paths)
             func(**kwargs)
@@ -68,7 +73,7 @@ def migrate(config, paths):
     return 0
 
 
-def _guess_version(conn):
+def _guess_version(conn: Connection) -> VersionTuple:
     """ Guess a database version when there is no property table yet.
         Only migrations for 3.6 and later are supported, so bail out
         when the version seems older.
@@ -88,7 +93,8 @@ def _guess_version(conn):
 
 
 
-def _migration(major, minor, patch=0, dbpatch=0):
+def _migration(major: int, minor: int, patch: int = 0,
+               dbpatch: int = 0) -> Callable[[Callable[..., None]], Callable[..., None]]:
     """ Decorator for a single migration step. The parameters describe the
         version after which the migration is applicable, i.e before changing
        from the given version to the next, the migration is required.
@@ -101,7 +107,7 @@ def _migration(major, minor, patch=0, dbpatch=0):
         process, so the migration functions may leave a temporary state behind
         there.
     """
-    def decorator(func):
+    def decorator(func: Callable[..., None]) -> Callable[..., None]:
         _MIGRATION_FUNCTIONS.append(((major, minor, patch, dbpatch), func))
         return func
 
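For context, the registry typed above pairs a four-part version tuple with a migration callable; `migrate()` then runs every registered function whose gate version is at or above the current database version (`db_version <= version`). A runnable sketch with a dummy step:

```python
from typing import Any, Callable, List, Tuple

VersionTuple = Tuple[int, int, int, int]

_MIGRATION_FUNCTIONS: List[Tuple[VersionTuple, Callable[..., None]]] = []

def _migration(major: int, minor: int, patch: int = 0,
               dbpatch: int = 0) -> Callable[[Callable[..., None]], Callable[..., None]]:
    def decorator(func: Callable[..., None]) -> Callable[..., None]:
        # Record the version gate alongside the function itself.
        _MIGRATION_FUNCTIONS.append(((major, minor, patch, dbpatch), func))
        return func

    return decorator

@_migration(3, 5, 0, 99)
def example_step(**_: Any) -> None:
    """ Dummy migration step for illustration only. """
```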
@@ -109,7 +115,7 @@ def _migration(major, minor, patch=0, dbpatch=0):
 
 
 @_migration(3, 5, 0, 99)
-def import_status_timestamp_change(conn, **_):
+def import_status_timestamp_change(conn: Connection, **_: Any) -> None:
     """ Add timezone to timestamp in status table.
 
         The import_status table has been changed to include timezone information
@@ -121,7 +127,7 @@ def import_status_timestamp_change(conn, **_):
 
 
 @_migration(3, 5, 0, 99)
-def add_nominatim_property_table(conn, config, **_):
+def add_nominatim_property_table(conn: Connection, config: Configuration, **_: Any) -> None:
     """ Add nominatim_property table.
     """
     if not conn.table_exists('nominatim_properties'):
@@ -133,7 +139,7 @@ def add_nominatim_property_table(conn, config, **_):
                         """).format(pysql.Identifier(config.DATABASE_WEBUSER)))
 
 @_migration(3, 6, 0, 0)
-def change_housenumber_transliteration(conn, **_):
+def change_housenumber_transliteration(conn: Connection, **_: Any) -> None:
     """ Transliterate housenumbers.
 
         The database schema switched from saving raw housenumbers in
@@ -164,7 +170,7 @@ def change_housenumber_transliteration(conn, **_):
 
 
 @_migration(3, 7, 0, 0)
-def switch_placenode_geometry_index(conn, **_):
+def switch_placenode_geometry_index(conn: Connection, **_: Any) -> None:
     """ Replace idx_placex_geometry_reverse_placeNode index.
 
         Make the index slightly more permissive, so that it can also be used
@@ -181,7 +187,7 @@ def switch_placenode_geometry_index(conn, **_):
 
 
 @_migration(3, 7, 0, 1)
-def install_legacy_tokenizer(conn, config, **_):
+def install_legacy_tokenizer(conn: Connection, config: Configuration, **_: Any) -> None:
     """ Setup legacy tokenizer.
 
         If no other tokenizer has been configured yet, then create the
@@ -200,11 +206,11 @@ def install_legacy_tokenizer(conn, config, **_):
         tokenizer = tokenizer_factory.create_tokenizer(config, init_db=False,
                                                        module_name='legacy')
 
-        tokenizer.migrate_database(config)
+        tokenizer.migrate_database(config) # type: ignore[attr-defined]
 
 
 @_migration(4, 0, 99, 0)
-def create_tiger_housenumber_index(conn, **_):
+def create_tiger_housenumber_index(conn: Connection, **_: Any) -> None:
     """ Create idx_location_property_tiger_parent_place_id with included
         house number.
 
@@ -221,7 +227,7 @@ def create_tiger_housenumber_index(conn, **_):
 
 
 @_migration(4, 0, 99, 1)
-def create_interpolation_index_on_place(conn, **_):
+def create_interpolation_index_on_place(conn: Connection, **_: Any) -> None:
     """ Create idx_place_interpolations for lookup of interpolation lines
         on updates.
     """
@@ -232,7 +238,7 @@ def create_interpolation_index_on_place(conn, **_):
 
 
 @_migration(4, 0, 99, 2)
-def add_step_column_for_interpolation(conn, **_):
+def add_step_column_for_interpolation(conn: Connection, **_: Any) -> None:
     """ Add a new column 'step' to the interpolations table.
 
         Also convers the data into the stricter format which requires that
@@ -267,7 +273,7 @@ def add_step_column_for_interpolation(conn, **_):
 
 
 @_migration(4, 0, 99, 3)
-def add_step_column_for_tiger(conn, **_):
+def add_step_column_for_tiger(conn: Connection, **_: Any) -> None:
     """ Add a new column 'step' to the tiger data table.
     """
     if conn.table_has_column('location_property_tiger', 'step'):
@@ -282,7 +288,7 @@ def add_step_column_for_tiger(conn, **_):
 
 
 @_migration(4, 0, 99, 4)
-def add_derived_name_column_for_country_names(conn, **_):
+def add_derived_name_column_for_country_names(conn: Connection, **_: Any) -> None:
     """ Add a new column 'derived_name' which in the future takes the
         country names as imported from OSM data.
     """
@@ -292,7 +298,7 @@ def add_derived_name_column_for_country_names(conn, **_):
 
 
 @_migration(4, 0, 99, 5)
-def mark_internal_country_names(conn, config, **_):
+def mark_internal_country_names(conn: Connection, config: Configuration, **_: Any) -> None:
     """ Names from the country table should be marked as internal to prevent
         them from being deleted. Only necessary for ICU tokenizer.
     """
@@ -8,7 +8,9 @@
 Functions for importing, updating and otherwise maintaining the table
 of artificial postcode centroids.
 """
+from typing import Optional, Tuple, Dict, List, TextIO
 from collections import defaultdict
+from pathlib import Path
 import csv
 import gzip
 import logging
@@ -16,18 +18,19 @@ from math import isfinite
 
 from psycopg2 import sql as pysql
 
-from nominatim.db.connection import connect
+from nominatim.db.connection import connect, Connection
 from nominatim.utils.centroid import PointsCentroid
-from nominatim.data.postcode_format import PostcodeFormatter
+from nominatim.data.postcode_format import PostcodeFormatter, CountryPostcodeMatcher
+from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
 LOG = logging.getLogger()
 
-def _to_float(num, max_value):
+def _to_float(numstr: str, max_value: float) -> float:
     """ Convert the number in string into a float. The number is expected
         to be in the range of [-max_value, max_value]. Otherwise rises a
         ValueError.
     """
-    num = float(num)
+    num = float(numstr)
     if not isfinite(num) or num <= -max_value or num >= max_value:
         raise ValueError()
 
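The parameter rename in `_to_float()` above is itself a typing fix: reusing `num` for the `float()` result would rebind a `str`-typed name to a `float`, which strict mypy reports as an incompatible assignment. A standalone version of the idea (hypothetical, including a final return that the hunk does not show):

```python
from math import isfinite

def to_float(numstr: str, max_value: float) -> float:
    # Bind the converted value to a fresh name so the str parameter
    # keeps its declared type.
    num = float(numstr)
    if not isfinite(num) or num <= -max_value or num >= max_value:
        raise ValueError()
    return num
```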
@@ -37,18 +40,19 @@ class _PostcodeCollector:
     """ Collector for postcodes of a single country.
     """
 
-    def __init__(self, country, matcher):
+    def __init__(self, country: str, matcher: Optional[CountryPostcodeMatcher]):
         self.country = country
         self.matcher = matcher
-        self.collected = defaultdict(PointsCentroid)
-        self.normalization_cache = None
+        self.collected: Dict[str, PointsCentroid] = defaultdict(PointsCentroid)
+        self.normalization_cache: Optional[Tuple[str, Optional[str]]] = None
 
 
-    def add(self, postcode, x, y):
+    def add(self, postcode: str, x: float, y: float) -> None:
         """ Add the given postcode to the collection cache. If the postcode
             already existed, it is overwritten with the new centroid.
         """
         if self.matcher is not None:
+            normalized: Optional[str]
             if self.normalization_cache and self.normalization_cache[0] == postcode:
                 normalized = self.normalization_cache[1]
             else:
@@ -60,7 +64,7 @@ class _PostcodeCollector:
             self.collected[normalized] += (x, y)
 
 
-    def commit(self, conn, analyzer, project_dir):
+    def commit(self, conn: Connection, analyzer: AbstractAnalyzer, project_dir: Path) -> None:
         """ Update postcodes for the country from the postcodes selected so far
             as well as any externally supplied postcodes.
         """
@@ -94,7 +98,8 @@ class _PostcodeCollector:
                 """).format(pysql.Literal(self.country)), to_update)
 
 
-    def _compute_changes(self, conn):
+    def _compute_changes(self, conn: Connection) \
+            -> Tuple[List[Tuple[str, float, float]], List[str], List[Tuple[str, float, float]]]:
         """ Compute which postcodes from the collected postcodes have to be
             added or modified and which from the location_postcode table
             have to be deleted.
@@ -116,12 +121,12 @@ class _PostcodeCollector:
                     to_delete.append(postcode)
 
         to_add = [(k, *v.centroid()) for k, v in self.collected.items()]
-        self.collected = None
+        self.collected = defaultdict(PointsCentroid)
 
         return to_add, to_delete, to_update
 
 
-    def _update_from_external(self, analyzer, project_dir):
+    def _update_from_external(self, analyzer: AbstractAnalyzer, project_dir: Path) -> None:
         """ Look for an external postcode file for the active country in
             the project directory and add missing postcodes when found.
         """
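Worth noting in the `_compute_changes()` hunk above: `self.collected` is now reset to a fresh `defaultdict` instead of `None`, so the attribute type never has to admit `Optional` and no guards are needed at the use sites. The pattern in isolation (toy class):

```python
from collections import defaultdict
from typing import Dict, List

class Collector:
    def __init__(self) -> None:
        self.collected: Dict[str, List[float]] = defaultdict(list)

    def flush(self) -> List[str]:
        keys = list(self.collected)
        # Reassign an empty container rather than None to keep the
        # declared attribute type stable.
        self.collected = defaultdict(list)
        return keys
```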
@@ -151,7 +156,7 @@ class _PostcodeCollector:
             csvfile.close()
 
 
-    def _open_external(self, project_dir):
+    def _open_external(self, project_dir: Path) -> Optional[TextIO]:
         fname = project_dir / f'{self.country}_postcodes.csv'
 
         if fname.is_file():
@@ -167,7 +172,7 @@ class _PostcodeCollector:
         return None
 
 
-def update_postcodes(dsn, project_dir, tokenizer):
+def update_postcodes(dsn: str, project_dir: Path, tokenizer: AbstractTokenizer) -> None:
     """ Update the table of artificial postcodes.
 
         Computes artificial postcode centroids from the placex table,
@@ -220,7 +225,7 @@ def update_postcodes(dsn, project_dir, tokenizer):
 
     analyzer.update_postcodes_from_db()
 
-def can_compute(dsn):
+def can_compute(dsn: str) -> bool:
     """
     Check that the place table exists so that
     postcodes can be computed.
@@ -7,12 +7,15 @@
 """
 Functions for bringing auxiliary data in the database up-to-date.
 """
+from typing import MutableSequence, Tuple, Any, Type, Mapping, Sequence, List, cast
 import logging
 from textwrap import dedent
 from pathlib import Path
 
 from psycopg2 import sql as pysql
 
+from nominatim.config import Configuration
+from nominatim.db.connection import Connection
 from nominatim.db.utils import execute_file
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.version import version_str
@@ -21,7 +24,8 @@ LOG = logging.getLogger()
 
 OSM_TYPE = {'N': 'node', 'W': 'way', 'R': 'relation'}
 
-def _add_address_level_rows_from_entry(rows, entry):
+def _add_address_level_rows_from_entry(rows: MutableSequence[Tuple[Any, ...]],
+                                       entry: Mapping[str, Any]) -> None:
     """ Converts a single entry from the JSON format for address rank
         descriptions into a flat format suitable for inserting into a
         PostgreSQL table and adds these lines to `rows`.
@@ -38,14 +42,15 @@ def _add_address_level_rows_from_entry(rows, entry):
         for country in countries:
             rows.append((country, key, value, rank_search, rank_address))
 
-def load_address_levels(conn, table, levels):
+
+def load_address_levels(conn: Connection, table: str, levels: Sequence[Mapping[str, Any]]) -> None:
     """ Replace the `address_levels` table with the contents of `levels'.
 
         A new table is created any previously existing table is dropped.
         The table has the following columns:
             country, class, type, rank_search, rank_address
     """
-    rows = []
+    rows: List[Tuple[Any, ...]] = []
     for entry in levels:
         _add_address_level_rows_from_entry(rows, entry)
 
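The `rows` annotation above illustrates a small but recurring strict-mypy requirement: an empty literal carries no type information, so the accumulator is annotated at its creation site. For instance:

```python
from typing import Any, List, Tuple

rows: List[Tuple[Any, ...]] = []   # a bare [] would be untyped
rows.append(('de', 'place', 'city', 16, 16))
```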
@@ -69,7 +74,7 @@ def load_address_levels(conn, table, levels):
     conn.commit()
 
 
-def load_address_levels_from_config(conn, config):
+def load_address_levels_from_config(conn: Connection, config: Configuration) -> None:
     """ Replace the `address_levels` table with the content as
         defined in the given configuration. Uses the parameter
         NOMINATIM_ADDRESS_LEVEL_CONFIG to determine the location of the
@@ -79,7 +84,9 @@ def load_address_levels_from_config(conn, config):
     load_address_levels(conn, 'address_levels', cfg)
 
 
-def create_functions(conn, config, enable_diff_updates=True, enable_debug=False):
+def create_functions(conn: Connection, config: Configuration,
+                     enable_diff_updates: bool = True,
+                     enable_debug: bool = False) -> None:
     """ (Re)create the PL/pgSQL functions.
     """
     sql = SQLPreprocessor(conn, config)
@@ -116,7 +123,7 @@ PHP_CONST_DEFS = (
 )
 
 
-def import_wikipedia_articles(dsn, data_path, ignore_errors=False):
+def import_wikipedia_articles(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
     """ Replaces the wikipedia importance tables with new data.
         The import is run in a single transaction so that the new data
        is replace seemlessly.
@@ -140,7 +147,7 @@ def import_wikipedia_articles(dsn, data_path, ignore_errors=False):
     return 0
 
 
-def recompute_importance(conn):
+def recompute_importance(conn: Connection) -> None:
     """ Recompute wikipedia links and importance for all entries in placex.
         This is a long-running operations that must not be executed in
         parallel with updates.
@@ -163,18 +170,19 @@ def recompute_importance(conn):
     conn.commit()
 
 
-def _quote_php_variable(var_type, config, conf_name):
+def _quote_php_variable(var_type: Type[Any], config: Configuration,
+                        conf_name: str) -> str:
     if var_type == bool:
         return 'true' if config.get_bool(conf_name) else 'false'
 
     if var_type == int:
-        return getattr(config, conf_name)
+        return cast(str, getattr(config, conf_name))
 
     if not getattr(config, conf_name):
         return 'false'
 
     if var_type == Path:
-        value = str(config.get_path(conf_name))
+        value = str(config.get_path(conf_name) or '')
     else:
         value = getattr(config, conf_name)
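`cast()` appears above because `getattr()` returns `Any`; the cast asserts the declared `str` return type to the checker and is a no-op at runtime. A self-contained sketch (the config class is a stand-in):

```python
from typing import cast

class Cfg:
    """ Stand-in for the Configuration object with string attributes. """
    DATABASE_WEBUSER = 'www-data'

def get_setting(cfg: Cfg, name: str) -> str:
    # cast() only informs the type checker; no conversion happens.
    return cast(str, getattr(cfg, name))

print(get_setting(Cfg(), 'DATABASE_WEBUSER'))
```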
@@ -182,7 +190,7 @@ def _quote_php_variable(var_type, config, conf_name):
     return f"'{quoted}'"
 
 
-def setup_website(basedir, config, conn):
+def setup_website(basedir: Path, config: Configuration, conn: Connection) -> None:
     """ Create the website script stubs.
     """
     if not basedir.exists():
@@ -215,7 +223,8 @@ def setup_website(basedir, config, conn):
         (basedir / script).write_text(template.format(script), 'utf-8')
 
 
-def invalidate_osm_object(osm_type, osm_id, conn, recursive=True):
+def invalidate_osm_object(osm_type: str, osm_id: int, conn: Connection,
+                          recursive: bool = True) -> None:
     """ Mark the given OSM object for reindexing. When 'recursive' is set
         to True (the default), then all dependent objects are marked for
         reindexing as well.
@@ -7,6 +7,7 @@
 """
 Functions for updating a database from a replication source.
 """
+from typing import ContextManager, MutableMapping, Any, Generator, cast
 from contextlib import contextmanager
 import datetime as dt
 from enum import Enum
@@ -14,6 +15,7 @@ import logging
 import time
 
 from nominatim.db import status
+from nominatim.db.connection import Connection
 from nominatim.tools.exec_utils import run_osm2pgsql
 from nominatim.errors import UsageError
 
@@ -21,13 +23,13 @@ try:
     from osmium.replication.server import ReplicationServer
     from osmium import WriteHandler
 except ImportError as exc:
-    logging.getLogger().fatal("pyosmium not installed. Replication functions not available.\n"
-                              "To install pyosmium via pip: pip3 install osmium")
+    logging.getLogger().critical("pyosmium not installed. Replication functions not available.\n"
+                                 "To install pyosmium via pip: pip3 install osmium")
     raise UsageError("replication tools not available") from exc
 
 LOG = logging.getLogger()
 
-def init_replication(conn, base_url):
+def init_replication(conn: Connection, base_url: str) -> None:
     """ Set up replication for the server at the given base URL.
     """
     LOG.info("Using replication source: %s", base_url)
@@ -51,7 +53,7 @@ def init_replication(conn, base_url):
     LOG.warning("Updates initialised at sequence %s (%s)", seq, date)
 
 
-def check_for_updates(conn, base_url):
+def check_for_updates(conn: Connection, base_url: str) -> int:
     """ Check if new data is available from the replication service at the
         given base URL.
     """
@@ -84,7 +86,7 @@ class UpdateState(Enum):
     NO_CHANGES = 3
 
 
-def update(conn, options):
+def update(conn: Connection, options: MutableMapping[str, Any]) -> UpdateState:
     """ Update database from the next batch of data. Returns the state of
         updates according to `UpdateState`.
     """
@@ -95,6 +97,8 @@ def update(conn, options):
                     "Please run 'nominatim replication --init' first.")
         raise UsageError("Replication not set up.")
 
+    assert startdate is not None
+
     if not indexed and options['indexed_only']:
         LOG.info("Skipping update. There is data that needs indexing.")
         return UpdateState.MORE_PENDING
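The added `assert startdate is not None` is a narrowing hint rather than a logic change: the status read yields an Optional value, and after the assert mypy treats it as non-None for the rest of the function. The idiom in isolation (dummy state function):

```python
from typing import Optional, Tuple

def read_state() -> Tuple[Optional[int], bool]:
    return 42, True

def next_sequence() -> int:
    seq, ready = read_state()
    if not ready:
        raise RuntimeError('replication not set up')
    assert seq is not None   # narrows Optional[int] to int for mypy
    return seq + 1
```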
@@ -132,17 +136,17 @@ def update(conn, options):
     return UpdateState.UP_TO_DATE
 
 
-def _make_replication_server(url):
+def _make_replication_server(url: str) -> ContextManager[ReplicationServer]:
     """ Returns a ReplicationServer in form of a context manager.
 
         Creates a light wrapper around older versions of pyosmium that did
        not support the context manager interface.
     """
     if hasattr(ReplicationServer, '__enter__'):
-        return ReplicationServer(url)
+        return cast(ContextManager[ReplicationServer], ReplicationServer(url))
 
     @contextmanager
-    def get_cm():
+    def get_cm() -> Generator[ReplicationServer, None, None]:
         yield ReplicationServer(url)
 
     return get_cm()
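`_make_replication_server()` above hides a version difference in pyosmium behind a uniform `ContextManager` return type, with a generator-based fallback for releases that predate the context-manager interface. A self-contained sketch of the same shape (`Server` is a stand-in, not pyosmium's class):

```python
from contextlib import contextmanager
from typing import ContextManager, Generator, cast

class Server:
    """ Stand-in for pyosmium's ReplicationServer. """
    def __init__(self, url: str) -> None:
        self.url = url

def make_server(url: str) -> ContextManager[Server]:
    if hasattr(Server, '__enter__'):
        # Modern versions are context managers already; cast() papers
        # over the missing type information.
        return cast(ContextManager[Server], Server(url))

    @contextmanager
    def get_cm() -> Generator[Server, None, None]:
        yield Server(url)

    return get_cm()

with make_server('https://example.com/replication') as srv:
    print(srv.url)
```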
@@ -12,15 +12,14 @@ import logging
 LOG = logging.getLogger()
 
 class SpecialPhrasesImporterStatistics():
-    # pylint: disable-msg=too-many-instance-attributes
     """
     Class handling statistics of the import
     process of special phrases.
     """
-    def __init__(self):
+    def __init__(self) -> None:
         self._intialize_values()
 
-    def _intialize_values(self):
+    def _intialize_values(self) -> None:
         """
         Set all counts for the global
         import to 0.
@@ -30,32 +29,32 @@ class SpecialPhrasesImporterStatistics():
         self.tables_ignored = 0
         self.invalids = 0
 
-    def notify_one_phrase_invalid(self):
+    def notify_one_phrase_invalid(self) -> None:
         """
         Add +1 to the count of invalid entries
         fetched from the wiki.
         """
         self.invalids += 1
 
-    def notify_one_table_created(self):
+    def notify_one_table_created(self) -> None:
         """
         Add +1 to the count of created tables.
         """
         self.tables_created += 1
 
-    def notify_one_table_deleted(self):
+    def notify_one_table_deleted(self) -> None:
         """
         Add +1 to the count of deleted tables.
         """
         self.tables_deleted += 1
 
-    def notify_one_table_ignored(self):
+    def notify_one_table_ignored(self) -> None:
         """
         Add +1 to the count of ignored tables.
         """
         self.tables_ignored += 1
 
-    def notify_import_done(self):
+    def notify_import_done(self) -> None:
         """
         Print stats for the whole import process
         and reset all values.
@@ -9,6 +9,7 @@
 
 The class allows to load phrases from a csv file.
 """
+from typing import Iterable
 import csv
 import os
 from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
@@ -18,12 +19,11 @@ class SPCsvLoader:
     """
     Handles loading of special phrases from external csv file.
     """
-    def __init__(self, csv_path):
-        super().__init__()
+    def __init__(self, csv_path: str) -> None:
         self.csv_path = csv_path
 
-    def generate_phrases(self):
+    def generate_phrases(self) -> Iterable[SpecialPhrase]:
         """ Open and parse the given csv file.
             Create the corresponding SpecialPhrases.
         """
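`generate_phrases()` above is a generator, yet it is annotated with the `Iterable` interface it exposes rather than the full `Generator` type, which keeps callers decoupled from the implementation. A sketch (the column name is hypothetical):

```python
import csv
from typing import Iterable

def read_phrases(csv_path: str) -> Iterable[str]:
    # A generator function may be typed by the interface it provides.
    with open(csv_path, newline='') as fd:
        for row in csv.DictReader(fd):
            yield row['phrase']
```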
@@ -35,7 +35,7 @@ class SPCsvLoader:
                 yield SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
 
 
-    def _check_csv_validity(self):
+    def _check_csv_validity(self) -> None:
         """
         Check that the csv file has the right extension.
         """
@@ -13,19 +13,36 @@
 The phrases already present in the database which are not
 valids anymore are removed.
 """
+from typing import Iterable, Tuple, Mapping, Sequence, Optional, Set
 import logging
 import re
 
 from psycopg2.sql import Identifier, SQL
 
+from nominatim.config import Configuration
+from nominatim.db.connection import Connection
 from nominatim.tools.special_phrases.importer_statistics import SpecialPhrasesImporterStatistics
+from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
+from nominatim.tokenizer.base import AbstractTokenizer
+from nominatim.typing import Protocol
 
 LOG = logging.getLogger()
 
-def _classtype_table(phrase_class, phrase_type):
+def _classtype_table(phrase_class: str, phrase_type: str) -> str:
     """ Return the name of the table for the given class and type.
     """
     return f'place_classtype_{phrase_class}_{phrase_type}'
 
 
+class SpecialPhraseLoader(Protocol):
+    """ Protocol for classes implementing a loader for special phrases.
+    """
+
+    def generate_phrases(self) -> Iterable[SpecialPhrase]:
+        """ Generates all special phrase terms this loader can produce.
+        """
+
+
 class SPImporter():
     # pylint: disable-msg=too-many-instance-attributes
     """
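The new `SpecialPhraseLoader` protocol gives the loaders a structural type: any class with a matching `generate_phrases()` method satisfies it, no inheritance required. The commit imports `Protocol` from `nominatim.typing`, presumably so older Python versions can fall back to `typing_extensions`; the sketch below assumes Python 3.8+ `typing.Protocol` and uses toy classes:

```python
from typing import Iterable, List, Protocol  # Protocol needs Python 3.8+

class Phrase:
    def __init__(self, label: str) -> None:
        self.label = label

class PhraseLoader(Protocol):
    def generate_phrases(self) -> Iterable[Phrase]: ...

class ListLoader:
    """ Satisfies PhraseLoader structurally, without subclassing it. """
    def __init__(self, labels: List[str]) -> None:
        self.labels = labels

    def generate_phrases(self) -> Iterable[Phrase]:
        return (Phrase(label) for label in self.labels)

def count_phrases(loader: PhraseLoader) -> int:
    return sum(1 for _ in loader.generate_phrases())

print(count_phrases(ListLoader(['pub', 'restaurant'])))
```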
@@ -33,21 +50,22 @@ class SPImporter():
 
         Take a sp loader which load the phrases from an external source.
     """
-    def __init__(self, config, db_connection, sp_loader) -> None:
+    def __init__(self, config: Configuration, conn: Connection,
+                 sp_loader: SpecialPhraseLoader) -> None:
         self.config = config
-        self.db_connection = db_connection
+        self.db_connection = conn
         self.sp_loader = sp_loader
         self.statistics_handler = SpecialPhrasesImporterStatistics()
         self.black_list, self.white_list = self._load_white_and_black_lists()
         self.sanity_check_pattern = re.compile(r'^\w+$')
         # This set will contain all existing phrases to be added.
         # It contains tuples with the following format: (lable, class, type, operator)
-        self.word_phrases = set()
+        self.word_phrases: Set[Tuple[str, str, str, str]] = set()
         # This set will contain all existing place_classtype tables which doesn't match any
         # special phrases class/type on the wiki.
-        self.table_phrases_to_delete = set()
+        self.table_phrases_to_delete: Set[str] = set()
 
-    def import_phrases(self, tokenizer, should_replace):
+    def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool) -> None:
         """
         Iterate through all SpecialPhrases extracted from the
         loader and import them into the database.
@@ -67,7 +85,7 @@ class SPImporter():
             if result:
                 class_type_pairs.add(result)
 
-        self._create_place_classtype_table_and_indexes(class_type_pairs)
+        self._create_classtype_table_and_indexes(class_type_pairs)
         if should_replace:
             self._remove_non_existent_tables_from_db()
         self.db_connection.commit()
@@ -79,7 +97,7 @@ class SPImporter():
         self.statistics_handler.notify_import_done()
 
 
-    def _fetch_existing_place_classtype_tables(self):
+    def _fetch_existing_place_classtype_tables(self) -> None:
         """
         Fetch existing place_classtype tables.
         Fill the table_phrases_to_delete set of the class.
@@ -95,7 +113,8 @@ class SPImporter():
         for row in db_cursor:
             self.table_phrases_to_delete.add(row[0])
 
-    def _load_white_and_black_lists(self):
+    def _load_white_and_black_lists(self) \
+            -> Tuple[Mapping[str, Sequence[str]], Mapping[str, Sequence[str]]]:
         """
         Load white and black lists from phrases-settings.json.
         """
@@ -103,7 +122,7 @@ class SPImporter():
 
         return settings['blackList'], settings['whiteList']
 
-    def _check_sanity(self, phrase):
+    def _check_sanity(self, phrase: SpecialPhrase) -> bool:
         """
         Check sanity of given inputs in case somebody added garbage in the wiki.
         If a bad class/type is detected the system will exit with an error.
@@ -117,7 +136,7 @@ class SPImporter():
             return False
         return True
 
-    def _process_phrase(self, phrase):
+    def _process_phrase(self, phrase: SpecialPhrase) -> Optional[Tuple[str, str]]:
         """
         Processes the given phrase by checking black and white list
         and sanity.
@@ -145,7 +164,8 @@ class SPImporter():
         return (phrase.p_class, phrase.p_type)
 
 
-    def _create_place_classtype_table_and_indexes(self, class_type_pairs):
+    def _create_classtype_table_and_indexes(self,
+                                            class_type_pairs: Iterable[Tuple[str, str]]) -> None:
         """
         Create table place_classtype for each given pair.
         Also create indexes on place_id and centroid.
@@ -188,7 +208,8 @@ class SPImporter():
             db_cursor.execute("DROP INDEX idx_placex_classtype")
 
 
-    def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
+    def _create_place_classtype_table(self, sql_tablespace: str,
+                                      phrase_class: str, phrase_type: str) -> None:
         """
         Create table place_classtype of the given phrase_class/phrase_type
         if doesn't exit.
@@ -204,7 +225,8 @@ class SPImporter():
                            (phrase_class, phrase_type))
 
 
-    def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
+    def _create_place_classtype_indexes(self, sql_tablespace: str,
+                                        phrase_class: str, phrase_type: str) -> None:
         """
         Create indexes on centroid and place_id for the place_classtype table.
         """
@@ -227,7 +249,7 @@ class SPImporter():
                                    SQL(sql_tablespace)))
 
 
-    def _grant_access_to_webuser(self, phrase_class, phrase_type):
+    def _grant_access_to_webuser(self, phrase_class: str, phrase_type: str) -> None:
         """
         Grant access on read to the table place_classtype for the webuser.
         """
@@ -237,7 +259,7 @@ class SPImporter():
                 .format(Identifier(table_name),
                         Identifier(self.config.DATABASE_WEBUSER)))
 
-    def _remove_non_existent_tables_from_db(self):
+    def _remove_non_existent_tables_from_db(self) -> None:
|
||||||
"""
|
"""
|
||||||
Remove special phrases which doesn't exist on the wiki anymore.
|
Remove special phrases which doesn't exist on the wiki anymore.
|
||||||
Delete the place_classtype tables.
|
Delete the place_classtype tables.
|
||||||
|
|||||||
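The recurring pattern in this file is giving previously bare `set()` attributes an explicit element type, which `mypy --strict` requires before it will check code using them. A minimal sketch of that pattern; the class and attribute names here are illustrative, not taken from the patch:

```python
from typing import Set, Tuple

class PhraseRegistry:
    def __init__(self) -> None:
        # Without the annotation, mypy --strict reports
        # "Need type annotation" for a bare `set()` assignment.
        self.word_phrases: Set[Tuple[str, str, str, str]] = set()
        self.tables_to_delete: Set[str] = set()

    def add(self, label: str, p_class: str, p_type: str, operator: str) -> None:
        self.word_phrases.add((label, p_class, p_type, operator))

reg = PhraseRegistry()
reg.add('Pharmacy', 'amenity', 'pharmacy', '-')
assert ('Pharmacy', 'amenity', 'pharmacy', '-') in reg.word_phrases
```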
@@ -7,14 +7,17 @@
 """
     Module containing the SPWikiLoader class.
 """
+from typing import Iterable
 import re
 import logging

+from nominatim.config import Configuration
 from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
 from nominatim.tools.exec_utils import get_url

 LOG = logging.getLogger()

-def _get_wiki_content(lang):
+def _get_wiki_content(lang: str) -> str:
     """
         Request and return the wiki page's content
         corresponding to special phrases for a given lang.
@@ -30,8 +33,7 @@ class SPWikiLoader:
     """
         Handles loading of special phrases from the wiki.
     """
-    def __init__(self, config):
-        super().__init__()
+    def __init__(self, config: Configuration) -> None:
         self.config = config
         # Compile the regex here to increase performances.
         self.occurence_pattern = re.compile(
@@ -39,10 +41,15 @@ class SPWikiLoader:
         )
         # Hack around a bug where building=yes was imported with quotes into the wiki
         self.type_fix_pattern = re.compile(r'\"|&quot;')
-        self._load_languages()
+
+        self.languages = self.config.get_str_list('LANGUAGES') or \
+            ['af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
+             'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
+             'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
+             'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']


-    def generate_phrases(self):
+    def generate_phrases(self) -> Iterable[SpecialPhrase]:
         """ Download the wiki pages for the configured languages
             and extract the phrases from the page.
         """
@@ -58,19 +65,3 @@ class SPWikiLoader:
                                 match[1],
                                 self.type_fix_pattern.sub('', match[2]),
                                 match[3])
-
-
-    def _load_languages(self):
-        """
-            Get list of all languages from env config file
-            or default if there is no languages configured.
-            The system will extract special phrases only from all specified languages.
-        """
-        if self.config.LANGUAGES:
-            self.languages = self.config.get_str_list('LANGUAGES')
-        else:
-            self.languages = [
-                'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
-                'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
-                'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
-                'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
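The removed `_load_languages()` helper collapses into a single `configured or default` expression, since an unset and an empty language list should both fall back to the defaults. A small sketch of the idiom, with a hypothetical `get_languages()` standing in for `config.get_str_list('LANGUAGES')`:

```python
from typing import List, Optional

DEFAULT_LANGUAGES = ['de', 'en', 'es', 'fr']

def get_languages(raw: Optional[str]) -> Optional[List[str]]:
    # Returns None when the variable is unset or empty.
    if not raw:
        return None
    return [lang.strip() for lang in raw.split(',')]

# `or` falls through to the default both for None and for an empty
# list, which is exactly what the old if/else implemented.
languages = get_languages(None) or DEFAULT_LANGUAGES
assert languages == DEFAULT_LANGUAGES

languages = get_languages('en,fr') or DEFAULT_LANGUAGES
assert languages == ['en', 'fr']
```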
@@ -10,20 +10,21 @@
     This class is a model used to transfer a special phrase through
     the process of load and importation.
 """
+from typing import Any

 class SpecialPhrase:
     """
         Model representing a special phrase.
     """
-    def __init__(self, p_label, p_class, p_type, p_operator):
+    def __init__(self, p_label: str, p_class: str, p_type: str, p_operator: str) -> None:
         self.p_label = p_label.strip()
         self.p_class = p_class.strip()
-        # Hack around a bug where building=yes was imported with quotes into the wiki
         self.p_type = p_type.strip()
         # Needed if some operator in the wiki are not written in english
         p_operator = p_operator.strip().lower()
         self.p_operator = '-' if p_operator not in ('near', 'in') else p_operator

-    def __eq__(self, other):
+    def __eq__(self, other: Any) -> bool:
         if not isinstance(other, SpecialPhrase):
             return False

@@ -32,5 +33,5 @@ class SpecialPhrase:
                and self.p_type == other.p_type \
                and self.p_operator == other.p_operator

-    def __hash__(self):
+    def __hash__(self) -> int:
         return hash((self.p_label, self.p_class, self.p_type, self.p_operator))
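Under `mypy --strict`, `__eq__` has to keep the `(self, other: Any) -> bool` shape of `object.__eq__`; the `isinstance` check then narrows `other` for the attribute comparisons. A self-contained sketch of the same pattern on a simplified stand-in class:

```python
from typing import Any

class Phrase:
    def __init__(self, label: str, kind: str) -> None:
        self.label = label
        self.kind = kind

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, Phrase):
            return False
        # After the isinstance() check, mypy knows `other` is a Phrase.
        return self.label == other.label and self.kind == other.kind

    def __hash__(self) -> int:
        # Equal objects must hash equally, so hash the same fields.
        return hash((self.label, self.kind))

assert Phrase('bar', 'amenity') == Phrase('bar', 'amenity')
assert len({Phrase('bar', 'amenity'), Phrase('bar', 'amenity')}) == 1
```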
@@ -7,6 +7,7 @@
 """
 Functions for importing tiger data and handling tarbar and directory files
 """
+from typing import Any, TextIO, List, Union, cast
 import csv
 import io
 import logging
@@ -15,11 +16,13 @@ import tarfile

 from psycopg2.extras import Json

+from nominatim.config import Configuration
 from nominatim.db.connection import connect
 from nominatim.db.async_connection import WorkerPool
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.errors import UsageError
 from nominatim.data.place_info import PlaceInfo
+from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer

 LOG = logging.getLogger()

@@ -28,9 +31,9 @@ class TigerInput:
         either be in a directory or gzipped together in a tar file.
     """

-    def __init__(self, data_dir):
+    def __init__(self, data_dir: str) -> None:
         self.tar_handle = None
-        self.files = []
+        self.files: List[Union[str, tarfile.TarInfo]] = []

         if data_dir.endswith('.tar.gz'):
             try:
@@ -50,33 +53,36 @@ class TigerInput:
             LOG.warning("Tiger data import selected but no files found at %s", data_dir)


-    def __enter__(self):
+    def __enter__(self) -> 'TigerInput':
         return self


-    def __exit__(self, exc_type, exc_val, exc_tb):
+    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
         if self.tar_handle:
             self.tar_handle.close()
             self.tar_handle = None


-    def next_file(self):
+    def next_file(self) -> TextIO:
         """ Return a file handle to the next file to be processed.
             Raises an IndexError if there is no file left.
         """
         fname = self.files.pop(0)

         if self.tar_handle is not None:
-            return io.TextIOWrapper(self.tar_handle.extractfile(fname))
+            extracted = self.tar_handle.extractfile(fname)
+            assert extracted is not None
+            return io.TextIOWrapper(extracted)

-        return open(fname, encoding='utf-8')
+        return open(cast(str, fname), encoding='utf-8')


-    def __len__(self):
+    def __len__(self) -> int:
         return len(self.files)


-def handle_threaded_sql_statements(pool, fd, analyzer):
+def handle_threaded_sql_statements(pool: WorkerPool, fd: TextIO,
+                                   analyzer: AbstractAnalyzer) -> None:
    """ Handles sql statement with multiplexing
    """
    lines = 0
@@ -101,14 +107,15 @@ def handle_threaded_sql_statements(pool, fd, analyzer):
            lines = 0


-def add_tiger_data(data_dir, config, threads, tokenizer):
+def add_tiger_data(data_dir: str, config: Configuration, threads: int,
+                   tokenizer: AbstractTokenizer) -> int:
    """ Import tiger data from directory or tar file `data dir`.
    """
    dsn = config.get_libpq_dsn()

    with TigerInput(data_dir) as tar:
        if not tar:
-            return
+            return 1

        with connect(dsn) as conn:
            sql = SQLPreprocessor(conn, config)
@@ -130,3 +137,5 @@ def add_tiger_data(data_dir, config, threads, tokenizer):
    with connect(dsn) as conn:
        sql = SQLPreprocessor(conn, config)
        sql.run_sql_file(conn, 'tiger_import_finish.sql')
+
+    return 0
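The `assert extracted is not None` added to `next_file()` is the standard way to narrow an Optional result: typeshed declares `TarFile.extractfile()` as returning `Optional[IO[bytes]]`, since directory members yield `None`. A sketch of the same narrowing outside the class, with an illustrative helper name:

```python
import io
import tarfile

def read_member(tar: tarfile.TarFile, name: str) -> str:
    extracted = tar.extractfile(name)
    # The assert narrows Optional[IO[bytes]] to IO[bytes] for mypy
    # and fails loudly if a directory member slips through.
    assert extracted is not None
    return io.TextIOWrapper(extracted).read()
```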
nominatim/typing.py (new file, 52 lines)
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Type definitions for typing annotations.
+
+Complex type definitions are moved here, to keep the source files readable.
+"""
+from typing import Any, Union, Mapping, TypeVar, Sequence, TYPE_CHECKING
+
+# Generics varaible names do not confirm to naming styles, ignore globally here.
+# pylint: disable=invalid-name,abstract-method,multiple-statements
+# pylint: disable=missing-class-docstring,useless-import-alias
+
+if TYPE_CHECKING:
+    import psycopg2.sql
+    import psycopg2.extensions
+    import psycopg2.extras
+    import os
+
+StrPath = Union[str, 'os.PathLike[str]']
+
+SysEnv = Mapping[str, str]
+
+# psycopg2-related types
+
+Query = Union[str, bytes, 'psycopg2.sql.Composable']
+
+T_ResultKey = TypeVar('T_ResultKey', int, str)
+
+class DictCursorResult(Mapping[str, Any]):
+    def __getitem__(self, x: Union[int, str]) -> Any: ...
+
+DictCursorResults = Sequence[DictCursorResult]
+
+T_cursor = TypeVar('T_cursor', bound='psycopg2.extensions.cursor')
+
+# The following typing features require typing_extensions to work
+# on all supported Python versions.
+# Only require this for type checking but not for normal operations.
+
+if TYPE_CHECKING:
+    from typing_extensions import (Protocol as Protocol,
+                                   Final as Final,
+                                   TypedDict as TypedDict)
+else:
+    Protocol = object
+    Final = 'Final'
+    TypedDict = dict
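The conditional import at the bottom of `nominatim/typing.py` keeps `typing_extensions` a type-check-only dependency: mypy sees the real `Protocol`/`Final`/`TypedDict`, while at runtime the names resolve to cheap dummies. A minimal sketch of how such a fallback is consumed; the `SupportsClose` protocol here is illustrative:

```python
import io
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from typing_extensions import Protocol
else:
    Protocol = object  # runtime stand-in, never inspected by mypy

class SupportsClose(Protocol):
    def close(self) -> None: ...

def shutdown(resource: 'SupportsClose') -> None:
    # Structural typing: anything with a close() method qualifies.
    resource.close()

shutdown(io.StringIO())  # StringIO satisfies the protocol
```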
@@ -7,6 +7,7 @@
 """
 Functions for computation of centroids.
 """
+from typing import Tuple, Any
 from collections.abc import Collection

 class PointsCentroid:
@@ -17,12 +18,12 @@ class PointsCentroid:
         (i.e. in OSM style).
     """

-    def __init__(self):
+    def __init__(self) -> None:
         self.sum_x = 0
         self.sum_y = 0
         self.count = 0

-    def centroid(self):
+    def centroid(self) -> Tuple[float, float]:
         """ Return the centroid of all points collected so far.
         """
         if self.count == 0:
@@ -32,11 +33,11 @@ class PointsCentroid:
                 float(self.sum_y/self.count)/10000000)


-    def __len__(self):
+    def __len__(self) -> int:
         return self.count


-    def __iadd__(self, other):
+    def __iadd__(self, other: Any) -> 'PointsCentroid':
         if isinstance(other, Collection) and len(other) == 2:
             if all(isinstance(p, (float, int)) for p in other):
                 x, y = other
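`__iadd__` returns the quoted type `'PointsCentroid'` because the class name does not exist yet while the class body is being evaluated; the string form is a forward reference that mypy resolves later. The same pattern on a toy accumulator:

```python
from typing import Any

class Accumulator:
    def __init__(self) -> None:
        self.total = 0

    def __iadd__(self, other: Any) -> 'Accumulator':
        if not isinstance(other, int):
            raise ValueError("can only add integers")
        self.total += other
        return self  # in-place operators must return the object itself

acc = Accumulator()
acc += 5
acc += 2
assert acc.total == 7
```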
@@ -7,6 +7,7 @@
 """
 Version information for Nominatim.
 """
+from typing import Optional, Tuple

 # Version information: major, minor, patch level, database patch level
 #
@@ -33,11 +34,11 @@ POSTGIS_REQUIRED_VERSION = (2, 2)
 # on every execution of 'make'.
 # cmake/tool-installed.tmpl is used to build the binary 'nominatim'. Inside
 # there is a call to set the variable value below.
-GIT_COMMIT_HASH = None
+GIT_COMMIT_HASH : Optional[str] = None


 # pylint: disable=consider-using-f-string
-def version_str(version=NOMINATIM_VERSION):
+def version_str(version:Tuple[int, int, int, int] = NOMINATIM_VERSION) -> str:
     """
     Return a human-readable string of the version.
     """
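A module-level global that starts out as `None` and is filled in later (here by the build system) needs an explicit `Optional[...]` annotation, or mypy infers its type as `None` and rejects every later assignment. A sketch of the pattern, with an illustrative version tuple and format string that are not the project's actual ones:

```python
from typing import Optional, Tuple

GIT_COMMIT_HASH: Optional[str] = None  # set to a str by the build system

def version_str(version: Tuple[int, int, int, int] = (4, 1, 0, 0)) -> str:
    # The typed default keeps existing no-argument call sites working.
    return '{}.{}.{}-{}'.format(*version)

assert version_str() == '4.1.0-0'
```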
@@ -15,7 +15,7 @@ sys.path.insert(1, str((Path(__file__) / '..' / '..' / '..' / '..').resolve()))

 from nominatim import cli
 from nominatim.config import Configuration
-from nominatim.db.connection import _Connection
+from nominatim.db.connection import Connection
 from nominatim.tools import refresh
 from nominatim.tokenizer import factory as tokenizer_factory
 from steps.utils import run_script
@@ -61,7 +61,7 @@ class NominatimEnvironment:
             dbargs['user'] = self.db_user
         if self.db_pass:
             dbargs['password'] = self.db_pass
-        conn = psycopg2.connect(connection_factory=_Connection, **dbargs)
+        conn = psycopg2.connect(connection_factory=Connection, **dbargs)
         return conn

     def next_code_coverage_file(self):
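The test harness keeps using psycopg2's `connection_factory` hook; only the class name changed with the renaming of `_Connection` to `Connection`. A hedged sketch of that hook with a stand-in subclass (the `table_exists` helper is illustrative, and a live database would be needed to actually connect):

```python
import psycopg2
import psycopg2.extensions

class Connection(psycopg2.extensions.connection):
    # Stand-in for nominatim.db.connection.Connection: helper methods
    # live on the connection object that psycopg2 hands back.
    def table_exists(self, table: str) -> bool:
        with self.cursor() as cur:
            cur.execute('SELECT to_regclass(%s)', (table,))
            return cur.fetchone()[0] is not None

# psycopg2 instantiates the given class for the new connection:
# conn = psycopg2.connect('dbname=nominatim', connection_factory=Connection)
```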
@@ -82,32 +82,32 @@ def test_create_split_regex_empty_delimiter():
 def test_create_kind_filter_no_params(inp):
     filt = SanitizerConfig().get_filter_kind()

-    assert filt(PlaceName('something', inp, ''))
+    assert filt(inp)


 @pytest.mark.parametrize('kind', ('de', 'name:de', 'ende'))
 def test_create_kind_filter_custom_regex_positive(kind):
     filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()

-    assert filt(PlaceName('something', kind, ''))
+    assert filt(kind)


 @pytest.mark.parametrize('kind', ('de ', '123', '', 'bedece'))
 def test_create_kind_filter_custom_regex_negative(kind):
     filt = SanitizerConfig({'filter-kind': '.*de'}).get_filter_kind()

-    assert not filt(PlaceName('something', kind, ''))
+    assert not filt(kind)


 @pytest.mark.parametrize('kind', ('name', 'fr', 'name:fr', 'frfr', '34'))
 def test_create_kind_filter_many_positive(kind):
     filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()

-    assert filt(PlaceName('something', kind, ''))
+    assert filt(kind)


 @pytest.mark.parametrize('kind', ('name:de', 'fridge', 'a34', '.*', '\\'))
 def test_create_kind_filter_many_negative(kind):
     filt = SanitizerConfig({'filter-kind': ['.*fr', 'name', r'\d+']}).get_filter_kind()

-    assert not filt(PlaceName('something', kind, ''))
+    assert not filt(kind)
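These tests track an API change: the callable returned by `get_filter_kind()` now takes the kind string directly instead of a full `PlaceName`. A sketch of what such a regex-based filter factory might look like (the factory itself is a guess for illustration, not the project's code):

```python
import re
from typing import Callable, Sequence, Union

def make_kind_filter(patterns: Union[str, Sequence[str]]) -> Callable[[str], bool]:
    if isinstance(patterns, str):
        patterns = [patterns]
    compiled = [re.compile(p) for p in patterns]
    # The returned callable matches the whole kind string against any pattern.
    return lambda kind: any(regex.fullmatch(kind) for regex in compiled)

filt = make_kind_filter(['.*fr', 'name', r'\d+'])
assert filt('name:fr') and filt('34')
assert not filt('fridge')
```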
@@ -39,17 +39,17 @@ def test_drop_tables(temp_db_conn, temp_db_cursor, table_factory):
         assert not temp_db_cursor.table_exists(table)

 def test_drop_flatnode_file_no_file():
-    freeze.drop_flatnode_file('')
+    freeze.drop_flatnode_file(None)


 def test_drop_flatnode_file_file_already_gone(tmp_path):
-    freeze.drop_flatnode_file(str(tmp_path / 'something.store'))
+    freeze.drop_flatnode_file(tmp_path / 'something.store')


 def test_drop_flatnode_file_delte(tmp_path):
     flatfile = tmp_path / 'flatnode.store'
     flatfile.write_text('Some content')

-    freeze.drop_flatnode_file(str(flatfile))
+    freeze.drop_flatnode_file(flatfile)

     assert not flatfile.exists()
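The freeze tests now pass `Path` objects or `None` instead of plain strings, matching a `drop_flatnode_file(path: Optional[Path])` style signature. A small sketch of a helper with that shape (illustrative, not the project's implementation):

```python
from pathlib import Path
from typing import Optional

def drop_file(path: Optional[Path]) -> None:
    # None means "no flatnode file configured"; a missing file is fine too.
    if path is not None and path.exists():
        path.unlink()

drop_file(None)                      # no-op
drop_file(Path('/tmp/gone.store'))   # also a no-op if the file is absent
```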
@@ -128,7 +128,7 @@ def test_create_place_classtype_table_and_indexes(
     """
     pairs = set([('class1', 'type1'), ('class2', 'type2')])

-    sp_importer._create_place_classtype_table_and_indexes(pairs)
+    sp_importer._create_classtype_table_and_indexes(pairs)

     for pair in pairs:
         assert check_table_exist(temp_db_conn, pair[0], pair[1])