split code into submodules

Sarah Hoffmann
2024-05-16 11:55:17 +02:00
parent 0fb4fe8e4d
commit 6e89310a92
137 changed files with 757 additions and 716 deletions

src/nominatim_db/cli.py Normal file

@@ -0,0 +1,228 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Command-line interface to the Nominatim functions for import, update,
database administration and querying.
"""
from typing import Optional, Any
import importlib
import logging
import os
import sys
import argparse
from pathlib import Path
from nominatim_core.config import Configuration
from nominatim_core.errors import UsageError
from .tools.exec_utils import run_php_server
from . import clicmd
from . import version
from .clicmd.args import NominatimArgs, Subcommand
LOG = logging.getLogger()
class CommandlineParser:
""" Wraps some of the common functions for parsing the command line
and setting up subcommands.
"""
def __init__(self, prog: str, description: Optional[str]):
self.parser = argparse.ArgumentParser(
prog=prog,
description=description,
formatter_class=argparse.RawDescriptionHelpFormatter)
self.subs = self.parser.add_subparsers(title='available commands',
dest='subcommand')
# Global arguments that only work if no sub-command given
self.parser.add_argument('--version', action='store_true',
help='Print Nominatim version and exit')
# Arguments added to every sub-command
self.default_args = argparse.ArgumentParser(add_help=False)
group = self.default_args.add_argument_group('Default arguments')
group.add_argument('-h', '--help', action='help',
help='Show this help message and exit')
group.add_argument('-q', '--quiet', action='store_const', const=0,
dest='verbose', default=1,
help='Print only error messages')
group.add_argument('-v', '--verbose', action='count', default=1,
                           help='Increase verbosity of output')
group.add_argument('--project-dir', metavar='DIR', default='.',
                           help='Base directory of the Nominatim installation (default: .)')
group.add_argument('-j', '--threads', metavar='NUM', type=int,
help='Number of parallel threads to use')
def nominatim_version_text(self) -> str:
""" Program name and version number as string
"""
text = f'Nominatim version {version.NOMINATIM_VERSION!s}'
if version.GIT_COMMIT_HASH is not None:
text += f' ({version.GIT_COMMIT_HASH})'
return text
def add_subcommand(self, name: str, cmd: Subcommand) -> None:
""" Add a subcommand to the parser. The subcommand must be a class
with a function add_args() that adds the parameters for the
subcommand and a run() function that executes the command.
"""
assert cmd.__doc__ is not None
parser = self.subs.add_parser(name, parents=[self.default_args],
help=cmd.__doc__.split('\n', 1)[0],
description=cmd.__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter,
add_help=False)
parser.set_defaults(command=cmd)
cmd.add_args(parser)
def run(self, **kwargs: Any) -> int:
""" Parse the command line arguments of the program and execute the
appropriate subcommand.
"""
args = NominatimArgs()
try:
self.parser.parse_args(args=kwargs.get('cli_args'), namespace=args)
except SystemExit:
return 1
if args.version:
print(self.nominatim_version_text())
return 0
if args.subcommand is None:
self.parser.print_help()
return 1
args.project_dir = Path(args.project_dir).resolve()
if 'cli_args' not in kwargs:
logging.basicConfig(stream=sys.stderr,
format='%(asctime)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
level=max(4 - args.verbose, 1) * 10)
args.config = Configuration(args.project_dir,
environ=kwargs.get('environ', os.environ))
args.config.set_libdirs(module=kwargs['module_dir'],
osm2pgsql=kwargs['osm2pgsql_path'])
log = logging.getLogger()
log.warning('Using project directory: %s', str(args.project_dir))
try:
return args.command.run(args)
except UsageError as exception:
if log.isEnabledFor(logging.DEBUG):
raise # use Python's exception printing
log.fatal('FATAL: %s', exception)
# If we get here, then execution has failed in some way.
return 1
# Subcommand classes
#
# Each class needs to implement two functions: add_args() adds the CLI parameters
# for the subfunction, run() executes the subcommand.
#
# The class documentation doubles as the help text for the command. The
# first line is also used in the summary when calling the program without
# a subcommand.
#
# No need to document the functions each time.
# pylint: disable=C0111
class AdminServe:
"""\
Start a simple web server for serving the API.
This command starts a built-in webserver to serve the website
from the current project directory. This webserver is only suitable
for testing and development. Do not use it in production setups!
There are different webservers available. The default 'php' engine
runs the classic PHP frontend. The other engines are Python servers
which run the new Python frontend code. This is highly experimental
at the moment and may not include the full API.
    By default, the webserver can be accessed at: http://127.0.0.1:8088
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Server arguments')
group.add_argument('--server', default='127.0.0.1:8088',
help='The address the server will listen to.')
group.add_argument('--engine', default='falcon',
choices=('php', 'falcon', 'starlette'),
help='Webserver framework to run. (default: falcon)')
def run(self, args: NominatimArgs) -> int:
if args.engine == 'php':
if args.config.lib_dir.php is None:
raise UsageError("PHP frontend not configured.")
run_php_server(args.server, args.project_dir / 'website')
else:
import uvicorn # pylint: disable=import-outside-toplevel
server_info = args.server.split(':', 1)
host = server_info[0]
if len(server_info) > 1:
if not server_info[1].isdigit():
raise UsageError('Invalid format for --server parameter. Use <host>:<port>')
port = int(server_info[1])
else:
port = 8088
            server_module = importlib.import_module(f'nominatim_api.server.{args.engine}.server')
app = server_module.get_application(args.project_dir)
uvicorn.run(app, host=host, port=port)
return 0
def get_set_parser() -> CommandlineParser:
"""\
Initializes the parser and adds various subcommands for
nominatim cli.
"""
parser = CommandlineParser('nominatim', nominatim.__doc__)
parser.add_subcommand('import', clicmd.SetupAll())
parser.add_subcommand('freeze', clicmd.SetupFreeze())
parser.add_subcommand('replication', clicmd.UpdateReplication())
parser.add_subcommand('special-phrases', clicmd.ImportSpecialPhrases())
parser.add_subcommand('add-data', clicmd.UpdateAddData())
parser.add_subcommand('index', clicmd.UpdateIndex())
parser.add_subcommand('refresh', clicmd.UpdateRefresh())
parser.add_subcommand('admin', clicmd.AdminFuncs())
parser.add_subcommand('export', clicmd.QueryExport())
parser.add_subcommand('convert', clicmd.ConvertDB())
parser.add_subcommand('serve', AdminServe())
parser.add_subcommand('search', clicmd.APISearch())
parser.add_subcommand('reverse', clicmd.APIReverse())
parser.add_subcommand('lookup', clicmd.APILookup())
parser.add_subcommand('details', clicmd.APIDetails())
parser.add_subcommand('status', clicmd.APIStatus())
return parser
def nominatim(**kwargs: Any) -> int:
"""\
Command-line tools for importing, updating, administrating and
querying the Nominatim database.
"""
return get_set_parser().run(**kwargs)
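
A usage sketch (not part of the commit): run() above reads cli_args, environ, module_dir and osm2pgsql_path from its keyword arguments, so the CLI can be driven from Python. All paths below are hypothetical placeholders.

# Hypothetical driver for the nominatim() entry point above; the library
# paths are placeholders for whatever the installation provides.
import os

from nominatim_db.cli import nominatim

exit_code = nominatim(cli_args=['--version'],
                      environ=os.environ,
                      module_dir='/usr/local/lib/nominatim/module',
                      osm2pgsql_path='/usr/local/bin/osm2pgsql')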

src/nominatim_db/clicmd/__init__.py Normal file

@@ -0,0 +1,28 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2023 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Subcommand definitions for the command-line tool.
"""
# mypy and pylint disagree about the style of explicit exports,
# see https://github.com/PyCQA/pylint/issues/6006.
# pylint: disable=useless-import-alias
from .setup import SetupAll as SetupAll
from .replication import UpdateReplication as UpdateReplication
from .api import (APISearch as APISearch,
APIReverse as APIReverse,
APILookup as APILookup,
APIDetails as APIDetails,
APIStatus as APIStatus)
from .index import UpdateIndex as UpdateIndex
from .refresh import UpdateRefresh as UpdateRefresh
from .add_data import UpdateAddData as UpdateAddData
from .admin import AdminFuncs as AdminFuncs
from .freeze import SetupFreeze as SetupFreeze
from .special_phrases import ImportSpecialPhrases as ImportSpecialPhrases
from .export import QueryExport as QueryExport
from .convert import ConvertDB as ConvertDB

src/nominatim_db/clicmd/add_data.py Normal file

@@ -0,0 +1,101 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'add-data' subcommand.
"""
from typing import cast
import argparse
import logging
import psutil
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid potentially unused imports.
# pylint: disable=E0012,C0415
LOG = logging.getLogger()
class UpdateAddData:
"""\
Add additional data from a file or an online source.
    This command allows you to add or update the search data in the database.
    The data can come either from an OSM file, or single OSM objects can be
    downloaded directly from the OSM API. This function only loads the data
    into the database. Afterwards it still needs to be integrated into the
    search index. Use the `nominatim index` command for that.
The command can also be used to add external non-OSM data to the
database. At the moment the only supported format is TIGER housenumber
data. See the online documentation at
https://nominatim.org/release-docs/latest/admin/Import/#installing-tiger-housenumber-data-for-the-us
for more information.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group_name = parser.add_argument_group('Source')
group1 = group_name.add_mutually_exclusive_group(required=True)
group1.add_argument('--file', metavar='FILE',
help='Import data from an OSM file or diff file')
group1.add_argument('--diff', metavar='FILE',
help='Import data from an OSM diff file (deprecated: use --file)')
group1.add_argument('--node', metavar='ID', type=int,
help='Import a single node from the API')
group1.add_argument('--way', metavar='ID', type=int,
help='Import a single way from the API')
group1.add_argument('--relation', metavar='ID', type=int,
help='Import a single relation from the API')
group1.add_argument('--tiger-data', metavar='DIR',
help='Add housenumbers from the US TIGER census database')
group2 = parser.add_argument_group('Extra arguments')
group2.add_argument('--use-main-api', action='store_true',
help='Use OSM API instead of Overpass to download objects')
group2.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
help='Size of cache to be used by osm2pgsql (in MB)')
group2.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
help='Set timeout for file downloads')
def run(self, args: NominatimArgs) -> int:
from ..tokenizer import factory as tokenizer_factory
from ..tools import tiger_data, add_osm_data
if args.tiger_data:
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
return tiger_data.add_tiger_data(args.tiger_data,
args.config,
args.threads or psutil.cpu_count() or 1,
tokenizer)
osm2pgsql_params = args.osm2pgsql_options(default_cache=1000, default_threads=1)
if args.file or args.diff:
return add_osm_data.add_data_from_file(args.config.get_libpq_dsn(),
cast(str, args.file or args.diff),
osm2pgsql_params)
if args.node:
return add_osm_data.add_osm_object(args.config.get_libpq_dsn(),
'node', args.node,
args.use_main_api,
osm2pgsql_params)
if args.way:
return add_osm_data.add_osm_object(args.config.get_libpq_dsn(),
'way', args.way,
args.use_main_api,
osm2pgsql_params)
if args.relation:
return add_osm_data.add_osm_object(args.config.get_libpq_dsn(),
'relation', args.relation,
args.use_main_api,
osm2pgsql_params)
return 0
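
A sketch of the two-step workflow the docstring describes, reusing the nominatim() entry point from cli.py above; the object ID and library paths are made-up placeholders.

# Hypothetical sketch: load one OSM node, then integrate it into the
# search index with a second call, as the docstring above requires.
from nominatim_db.cli import nominatim

libs = dict(module_dir='/path/to/module', osm2pgsql_path='/path/to/osm2pgsql')

nominatim(cli_args=['add-data', '--node', '123456'], **libs)  # load only
nominatim(cli_args=['index'], **libs)                         # build search index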

src/nominatim_db/clicmd/admin.py Normal file

@@ -0,0 +1,123 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'admin' subcommand.
"""
import logging
import argparse
import random
import nominatim_api as napi
from nominatim_core.db.connection import connect
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid potentially unused imports.
# pylint: disable=E0012,C0415
LOG = logging.getLogger()
class AdminFuncs:
"""\
Analyse and maintain the database.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Admin tasks')
objs = group.add_mutually_exclusive_group(required=True)
objs.add_argument('--warm', action='store_true',
help='Warm database caches for search and reverse queries')
objs.add_argument('--check-database', action='store_true',
help='Check that the database is complete and operational')
objs.add_argument('--migrate', action='store_true',
help='Migrate the database to a new software version')
objs.add_argument('--analyse-indexing', action='store_true',
help='Print performance analysis of the indexing process')
objs.add_argument('--collect-os-info', action="store_true",
help="Generate a report about the host system information")
objs.add_argument('--clean-deleted', action='store', metavar='AGE',
help='Clean up deleted relations')
group = parser.add_argument_group('Arguments for cache warming')
group.add_argument('--search-only', action='store_const', dest='target',
const='search',
help="Only pre-warm tables for search queries")
group.add_argument('--reverse-only', action='store_const', dest='target',
const='reverse',
help="Only pre-warm tables for reverse queries")
        group = parser.add_argument_group('Arguments for index analysis')
mgroup = group.add_mutually_exclusive_group()
mgroup.add_argument('--osm-id', type=str,
help='Analyse indexing of the given OSM object')
mgroup.add_argument('--place-id', type=int,
help='Analyse indexing of the given Nominatim object')
def run(self, args: NominatimArgs) -> int:
# pylint: disable=too-many-return-statements
if args.warm:
return self._warm(args)
if args.check_database:
LOG.warning('Checking database')
from ..tools import check_database
return check_database.check_database(args.config)
if args.analyse_indexing:
LOG.warning('Analysing performance of indexing function')
from ..tools import admin
admin.analyse_indexing(args.config, osm_id=args.osm_id, place_id=args.place_id)
return 0
if args.migrate:
LOG.warning('Checking for necessary database migrations')
from ..tools import migration
return migration.migrate(args.config, args)
if args.collect_os_info:
LOG.warning("Reporting System Information")
from ..tools import collect_os_info
collect_os_info.report_system_information(args.config)
return 0
if args.clean_deleted:
LOG.warning('Cleaning up deleted relations')
from ..tools import admin
admin.clean_deleted_relations(args.config, age=args.clean_deleted)
return 0
return 1
def _warm(self, args: NominatimArgs) -> int:
LOG.warning('Warming database caches')
api = napi.NominatimAPI(args.project_dir)
try:
if args.target != 'search':
for _ in range(1000):
api.reverse((random.uniform(-90, 90), random.uniform(-180, 180)),
address_details=True)
if args.target != 'reverse':
from ..tokenizer import factory as tokenizer_factory
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
with connect(args.config.get_libpq_dsn()) as conn:
if conn.table_exists('search_name'):
words = tokenizer.most_frequent_words(conn, 1000)
else:
words = []
for word in words:
api.search(word)
finally:
api.close()
return 0
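
A standalone sketch of the warming idea used by _warm() above, assuming a project directory with an already imported database; the path is a placeholder.

# Minimal cache-warming loop modelled on _warm() above.
import random

import nominatim_api as napi

api = napi.NominatimAPI('/srv/nominatim-project')   # placeholder path
try:
    for _ in range(100):
        # Same call pattern as _warm(): random coordinate, full address.
        api.reverse((random.uniform(-90, 90), random.uniform(-180, 180)),
                    address_details=True)
finally:
    api.close()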

src/nominatim_db/clicmd/api.py Normal file

@@ -0,0 +1,374 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Subcommand definitions for API calls from the command line.
"""
from typing import Dict, Any
import argparse
import logging
import json
import sys
import nominatim_api as napi
import nominatim_api.v1 as api_output
from nominatim_api.v1.helpers import zoom_to_rank, deduplicate_results
from nominatim_api.v1.format import dispatch as formatting
import nominatim_api.logging as loglib
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
LOG = logging.getLogger()
STRUCTURED_QUERY = (
('amenity', 'name and/or type of POI'),
('street', 'housenumber and street'),
('city', 'city, town or village'),
('county', 'county'),
('state', 'state'),
('country', 'country'),
('postalcode', 'postcode')
)
EXTRADATA_PARAMS = (
('addressdetails', 'Include a breakdown of the address into elements'),
('extratags', ("Include additional information if available "
"(e.g. wikipedia link, opening hours)")),
('namedetails', 'Include a list of alternative names')
)
def _add_api_output_arguments(parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Output arguments')
group.add_argument('--format', default='jsonv2',
choices=formatting.list_formats(napi.SearchResults) + ['debug'],
help='Format of result')
for name, desc in EXTRADATA_PARAMS:
group.add_argument('--' + name, action='store_true', help=desc)
group.add_argument('--lang', '--accept-language', metavar='LANGS',
help='Preferred language order for presenting search results')
group.add_argument('--polygon-output',
choices=['geojson', 'kml', 'svg', 'text'],
                       help='Output geometry of results as GeoJSON, KML, SVG or WKT')
group.add_argument('--polygon-threshold', type=float, default = 0.0,
metavar='TOLERANCE',
help=("Simplify output geometry."
"Parameter is difference tolerance in degrees."))
class APISearch:
"""\
Execute a search query.
This command works exactly the same as if calling the /search endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Search/
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Query arguments')
group.add_argument('--query',
help='Free-form query string')
for name, desc in STRUCTURED_QUERY:
group.add_argument('--' + name, help='Structured query: ' + desc)
_add_api_output_arguments(parser)
group = parser.add_argument_group('Result limitation')
group.add_argument('--countrycodes', metavar='CC,..',
help='Limit search results to one or more countries')
group.add_argument('--exclude_place_ids', metavar='ID,..',
                           help='List of search objects to be excluded')
group.add_argument('--limit', type=int, default=10,
help='Limit the number of returned results')
group.add_argument('--viewbox', metavar='X1,Y1,X2,Y2',
help='Preferred area to find search results')
group.add_argument('--bounded', action='store_true',
help='Strictly restrict results to viewbox area')
group = parser.add_argument_group('Other arguments')
group.add_argument('--no-dedupe', action='store_false', dest='dedupe',
help='Do not remove duplicates from the result list')
def run(self, args: NominatimArgs) -> int:
if args.format == 'debug':
loglib.set_log_output('text')
api = napi.NominatimAPI(args.project_dir)
params: Dict[str, Any] = {'max_results': args.limit + min(args.limit, 10),
'address_details': True, # needed for display name
'geometry_output': args.get_geometry_output(),
'geometry_simplification': args.polygon_threshold,
'countries': args.countrycodes,
'excluded': args.exclude_place_ids,
'viewbox': args.viewbox,
'bounded_viewbox': args.bounded,
'locales': args.get_locales(api.config.DEFAULT_LANGUAGE)
}
if args.query:
results = api.search(args.query, **params)
else:
results = api.search_address(amenity=args.amenity,
street=args.street,
city=args.city,
county=args.county,
state=args.state,
postalcode=args.postalcode,
country=args.country,
**params)
if args.dedupe and len(results) > 1:
results = deduplicate_results(results, args.limit)
if args.format == 'debug':
print(loglib.get_and_disable())
return 0
output = api_output.format_result(
results,
args.format,
{'extratags': args.extratags,
'namedetails': args.namedetails,
'addressdetails': args.addressdetails})
if args.format != 'xml':
# reformat the result, so it is pretty-printed
json.dump(json.loads(output), sys.stdout, indent=4, ensure_ascii=False)
else:
sys.stdout.write(output)
sys.stdout.write('\n')
return 0
class APIReverse:
"""\
Execute API reverse query.
This command works exactly the same as if calling the /reverse endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Reverse/
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Query arguments')
group.add_argument('--lat', type=float, required=True,
help='Latitude of coordinate to look up (in WGS84)')
group.add_argument('--lon', type=float, required=True,
help='Longitude of coordinate to look up (in WGS84)')
group.add_argument('--zoom', type=int,
help='Level of detail required for the address')
group.add_argument('--layer', metavar='LAYER',
choices=[n.name.lower() for n in napi.DataLayer if n.name],
action='append', required=False, dest='layers',
                           help='Limit results to the given layer(s) (may be repeated)')
_add_api_output_arguments(parser)
def run(self, args: NominatimArgs) -> int:
if args.format == 'debug':
loglib.set_log_output('text')
api = napi.NominatimAPI(args.project_dir)
result = api.reverse(napi.Point(args.lon, args.lat),
max_rank=zoom_to_rank(args.zoom or 18),
layers=args.get_layers(napi.DataLayer.ADDRESS | napi.DataLayer.POI),
address_details=True, # needed for display name
geometry_output=args.get_geometry_output(),
geometry_simplification=args.polygon_threshold,
locales=args.get_locales(api.config.DEFAULT_LANGUAGE))
if args.format == 'debug':
print(loglib.get_and_disable())
return 0
if result:
output = api_output.format_result(
napi.ReverseResults([result]),
args.format,
{'extratags': args.extratags,
'namedetails': args.namedetails,
'addressdetails': args.addressdetails})
if args.format != 'xml':
# reformat the result, so it is pretty-printed
json.dump(json.loads(output), sys.stdout, indent=4, ensure_ascii=False)
else:
sys.stdout.write(output)
sys.stdout.write('\n')
return 0
LOG.error("Unable to geocode.")
return 42
class APILookup:
"""\
Execute API lookup query.
This command works exactly the same as if calling the /lookup endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Lookup/
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Query arguments')
group.add_argument('--id', metavar='OSMID',
action='append', required=True, dest='ids',
help='OSM id to lookup in format <NRW><id> (may be repeated)')
_add_api_output_arguments(parser)
    def run(self, args: NominatimArgs) -> int:
        if args.format == 'debug':
            loglib.set_log_output('text')
        api = napi.NominatimAPI(args.project_dir)
        places = [napi.OsmID(o[0], int(o[1:])) for o in args.ids]
        results = api.lookup(places,
                             address_details=True, # needed for display name
                             geometry_output=args.get_geometry_output(),
                             geometry_simplification=args.polygon_threshold or 0.0,
                             locales=args.get_locales(api.config.DEFAULT_LANGUAGE))
        if args.format == 'debug':
            print(loglib.get_and_disable())
            return 0
output = api_output.format_result(
results,
args.format,
{'extratags': args.extratags,
'namedetails': args.namedetails,
'addressdetails': args.addressdetails})
if args.format != 'xml':
# reformat the result, so it is pretty-printed
json.dump(json.loads(output), sys.stdout, indent=4, ensure_ascii=False)
else:
sys.stdout.write(output)
sys.stdout.write('\n')
return 0
class APIDetails:
"""\
Execute API details query.
This command works exactly the same as if calling the /details endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Details/
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Query arguments')
objs = group.add_mutually_exclusive_group(required=True)
objs.add_argument('--node', '-n', type=int,
help="Look up the OSM node with the given ID.")
objs.add_argument('--way', '-w', type=int,
help="Look up the OSM way with the given ID.")
objs.add_argument('--relation', '-r', type=int,
help="Look up the OSM relation with the given ID.")
objs.add_argument('--place_id', '-p', type=int,
help='Database internal identifier of the OSM object to look up')
group.add_argument('--class', dest='object_class',
help=("Class type to disambiguated multiple entries "
"of the same object."))
group = parser.add_argument_group('Output arguments')
group.add_argument('--addressdetails', action='store_true',
help='Include a breakdown of the address into elements')
group.add_argument('--keywords', action='store_true',
help='Include a list of name keywords and address keywords')
group.add_argument('--linkedplaces', action='store_true',
                           help='Include details of places that are linked with this one')
group.add_argument('--hierarchy', action='store_true',
help='Include details of places lower in the address hierarchy')
group.add_argument('--group_hierarchy', action='store_true',
help='Group the places by type')
group.add_argument('--polygon_geojson', action='store_true',
help='Include geometry of result')
group.add_argument('--lang', '--accept-language', metavar='LANGS',
help='Preferred language order for presenting search results')
def run(self, args: NominatimArgs) -> int:
place: napi.PlaceRef
if args.node:
place = napi.OsmID('N', args.node, args.object_class)
elif args.way:
place = napi.OsmID('W', args.way, args.object_class)
elif args.relation:
place = napi.OsmID('R', args.relation, args.object_class)
else:
assert args.place_id is not None
place = napi.PlaceID(args.place_id)
api = napi.NominatimAPI(args.project_dir)
locales = args.get_locales(api.config.DEFAULT_LANGUAGE)
result = api.details(place,
address_details=args.addressdetails,
linked_places=args.linkedplaces,
parented_places=args.hierarchy,
keywords=args.keywords,
geometry_output=napi.GeometryFormat.GEOJSON
if args.polygon_geojson
else napi.GeometryFormat.NONE,
locales=locales)
if result:
output = api_output.format_result(
result,
'json',
{'locales': locales,
'group_hierarchy': args.group_hierarchy})
# reformat the result, so it is pretty-printed
json.dump(json.loads(output), sys.stdout, indent=4, ensure_ascii=False)
sys.stdout.write('\n')
return 0
LOG.error("Object not found in database.")
return 42
class APIStatus:
"""
Execute API status query.
This command works exactly the same as if calling the /status endpoint on
the web API. See the online documentation for more details on the
various parameters:
https://nominatim.org/release-docs/latest/api/Status/
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
formats = api_output.list_formats(napi.StatusResult)
group = parser.add_argument_group('API parameters')
group.add_argument('--format', default=formats[0], choices=formats,
help='Format of result')
def run(self, args: NominatimArgs) -> int:
status = napi.NominatimAPI(args.project_dir).status()
print(api_output.format_result(status, args.format, {}))
return 0
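
The dispatch in APISearch.run() above boils down to: a free-form --query goes through search(), the structured fields through search_address(). A minimal sketch; the project path and query values are placeholders.

# Sketch of the free-form vs. structured dispatch from APISearch.run().
import nominatim_api as napi

api = napi.NominatimAPI('/srv/nominatim-project')   # placeholder path
try:
    free_form = api.search('Birkenweg, Potsdam', address_details=True)
    structured = api.search_address(street='Birkenweg', city='Potsdam',
                                    address_details=True)
finally:
    api.close()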

src/nominatim_db/clicmd/args.py Normal file

@@ -0,0 +1,260 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Provides custom functions over command-line arguments.
"""
from typing import Optional, List, Dict, Any, Sequence, Tuple
import argparse
import logging
from functools import reduce
from pathlib import Path
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from nominatim_core.typing import Protocol
import nominatim_api as napi
LOG = logging.getLogger()
class Subcommand(Protocol):
"""
Interface to be implemented by classes implementing a CLI subcommand.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
"""
Fill the given parser for the subcommand with the appropriate
parameters.
"""
def run(self, args: 'NominatimArgs') -> int:
"""
Run the subcommand with the given parsed arguments.
"""
class NominatimArgs:
""" Customized namespace class for the nominatim command line tool
to receive the command-line arguments.
"""
# Basic environment set by root program.
config: Configuration
project_dir: Path
# Global switches
version: bool
subcommand: Optional[str]
command: Subcommand
# Shared parameters
osm2pgsql_cache: Optional[int]
socket_timeout: int
# Arguments added to all subcommands.
verbose: int
threads: Optional[int]
# Arguments to 'add-data'
file: Optional[str]
diff: Optional[str]
node: Optional[int]
way: Optional[int]
relation: Optional[int]
tiger_data: Optional[str]
use_main_api: bool
# Arguments to 'admin'
warm: bool
check_database: bool
migrate: bool
collect_os_info: bool
clean_deleted: str
analyse_indexing: bool
target: Optional[str]
osm_id: Optional[str]
place_id: Optional[int]
# Arguments to 'import'
osm_file: List[str]
continue_at: Optional[str]
reverse_only: bool
no_partitions: bool
no_updates: bool
offline: bool
ignore_errors: bool
index_noanalyse: bool
prepare_database: bool
# Arguments to 'index'
boundaries_only: bool
no_boundaries: bool
minrank: int
maxrank: int
# Arguments to 'export'
output_type: str
output_format: str
output_all_postcodes: bool
language: Optional[str]
restrict_to_country: Optional[str]
# Arguments to 'convert'
output: Path
# Arguments to 'refresh'
postcodes: bool
word_tokens: bool
word_counts: bool
address_levels: bool
functions: bool
wiki_data: bool
secondary_importance: bool
importance: bool
website: bool
diffs: bool
enable_debug_statements: bool
data_object: Sequence[Tuple[str, int]]
data_area: Sequence[Tuple[str, int]]
# Arguments to 'replication'
init: bool
update_functions: bool
check_for_updates: bool
once: bool
catch_up: bool
do_index: bool
# Arguments to 'serve'
server: str
engine: str
    # Arguments to 'special-phrases'
import_from_wiki: bool
import_from_csv: Optional[str]
no_replace: bool
# Arguments to all query functions
format: str
addressdetails: bool
extratags: bool
namedetails: bool
lang: Optional[str]
polygon_output: Optional[str]
polygon_threshold: Optional[float]
# Arguments to 'search'
query: Optional[str]
amenity: Optional[str]
street: Optional[str]
city: Optional[str]
county: Optional[str]
state: Optional[str]
country: Optional[str]
postalcode: Optional[str]
countrycodes: Optional[str]
exclude_place_ids: Optional[str]
limit: int
viewbox: Optional[str]
bounded: bool
dedupe: bool
# Arguments to 'reverse'
lat: float
lon: float
zoom: Optional[int]
layers: Optional[Sequence[str]]
# Arguments to 'lookup'
ids: Sequence[str]
# Arguments to 'details'
object_class: Optional[str]
linkedplaces: bool
hierarchy: bool
keywords: bool
polygon_geojson: bool
group_hierarchy: bool
def osm2pgsql_options(self, default_cache: int,
default_threads: int) -> Dict[str, Any]:
""" Return the standard osm2pgsql options that can be derived
from the command line arguments. The resulting dict can be
further customized and then used in `run_osm2pgsql()`.
"""
return dict(osm2pgsql=self.config.OSM2PGSQL_BINARY or self.config.lib_dir.osm2pgsql,
osm2pgsql_cache=self.osm2pgsql_cache or default_cache,
osm2pgsql_style=self.config.get_import_style_file(),
osm2pgsql_style_path=self.config.config_dir,
threads=self.threads or default_threads,
dsn=self.config.get_libpq_dsn(),
flatnode_file=str(self.config.get_path('FLATNODE_FILE') or ''),
tablespaces=dict(slim_data=self.config.TABLESPACE_OSM_DATA,
slim_index=self.config.TABLESPACE_OSM_INDEX,
main_data=self.config.TABLESPACE_PLACE_DATA,
main_index=self.config.TABLESPACE_PLACE_INDEX
)
)
def get_osm_file_list(self) -> Optional[List[Path]]:
""" Return the --osm-file argument as a list of Paths or None
if no argument was given. The function also checks if the files
exist and raises a UsageError if one cannot be found.
"""
if not self.osm_file:
return None
files = [Path(f) for f in self.osm_file]
for fname in files:
if not fname.is_file():
LOG.fatal("OSM file '%s' does not exist.", fname)
raise UsageError('Cannot access file.')
return files
def get_geometry_output(self) -> napi.GeometryFormat:
""" Get the requested geometry output format in a API-compatible
format.
"""
if not self.polygon_output:
return napi.GeometryFormat.NONE
if self.polygon_output == 'geojson':
return napi.GeometryFormat.GEOJSON
if self.polygon_output == 'kml':
return napi.GeometryFormat.KML
if self.polygon_output == 'svg':
return napi.GeometryFormat.SVG
if self.polygon_output == 'text':
return napi.GeometryFormat.TEXT
try:
return napi.GeometryFormat[self.polygon_output.upper()]
except KeyError as exp:
raise UsageError(f"Unknown polygon output format '{self.polygon_output}'.") from exp
def get_locales(self, default: Optional[str]) -> napi.Locales:
""" Get the locales from the language parameter.
"""
if self.lang:
return napi.Locales.from_accept_languages(self.lang)
if default:
return napi.Locales.from_accept_languages(default)
return napi.Locales()
def get_layers(self, default: napi.DataLayer) -> Optional[napi.DataLayer]:
""" Get the list of selected layers as a DataLayer enum.
"""
if not self.layers:
return default
return reduce(napi.DataLayer.__or__,
(napi.DataLayer[s.upper()] for s in self.layers))
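
get_layers() above folds repeated --layer values into a single bitmask; a small sketch of that reduction, with the input list standing in for values collected by argparse.

# Sketch of the DataLayer flag folding performed by get_layers() above.
from functools import reduce

import nominatim_api as napi

selected = ['address', 'poi']   # as collected from repeated --layer options
layers = reduce(napi.DataLayer.__or__,
                (napi.DataLayer[s.upper()] for s in selected))
assert layers == napi.DataLayer.ADDRESS | napi.DataLayer.POI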

src/nominatim_db/clicmd/convert.py Normal file

@@ -0,0 +1,95 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'convert' subcommand.
"""
from typing import Set, Any, Union, Optional, Sequence
import argparse
import asyncio
from pathlib import Path
from nominatim_core.errors import UsageError
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid potentially unused imports.
# pylint: disable=E0012,C0415
class WithAction(argparse.Action):
""" Special action that saves a list of flags, given on the command-line
as `--with-foo` or `--without-foo`.
"""
def __init__(self, option_strings: Sequence[str], dest: Any,
default: bool = True, **kwargs: Any) -> None:
if 'nargs' in kwargs:
raise ValueError("nargs not allowed.")
if option_strings is None:
raise ValueError("Positional parameter not allowed.")
self.dest_set = kwargs.pop('dest_set')
full_option_strings = []
for opt in option_strings:
if not opt.startswith('--'):
raise ValueError("short-form options not allowed")
if default:
self.dest_set.add(opt[2:])
full_option_strings.append(f"--with-{opt[2:]}")
full_option_strings.append(f"--without-{opt[2:]}")
super().__init__(full_option_strings, argparse.SUPPRESS, nargs=0, **kwargs)
def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace,
values: Union[str, Sequence[Any], None],
option_string: Optional[str] = None) -> None:
assert option_string
if option_string.startswith('--with-'):
self.dest_set.add(option_string[7:])
if option_string.startswith('--without-'):
self.dest_set.discard(option_string[10:])
class ConvertDB:
""" Convert an existing database into a different format. (EXPERIMENTAL)
Dump a read-only version of the database in a different format.
At the moment only a SQLite database suitable for reverse lookup
can be created.
"""
def __init__(self) -> None:
self.options: Set[str] = set()
def add_args(self, parser: argparse.ArgumentParser) -> None:
parser.add_argument('--format', default='sqlite',
choices=('sqlite', ),
help='Format of the output database (must be sqlite currently)')
parser.add_argument('--output', '-o', required=True, type=Path,
help='File to write the database to.')
        group = parser.add_argument_group('Switches to define database layout '
                                          '(currently no effect)')
group.add_argument('--reverse', action=WithAction, dest_set=self.options, default=True,
help='Enable/disable support for reverse and lookup API'
' (default: enabled)')
        group.add_argument('--search', action=WithAction, dest_set=self.options, default=False,
                           help='Enable/disable support for search API (default: disabled)')
group.add_argument('--details', action=WithAction, dest_set=self.options, default=True,
help='Enable/disable support for details API (default: enabled)')
def run(self, args: NominatimArgs) -> int:
if args.output.exists():
raise UsageError(f"File '{args.output}' already exists. Refusing to overwrite.")
if args.format == 'sqlite':
from ..tools import convert_sqlite
asyncio.run(convert_sqlite.convert(args.project_dir, args.output, self.options))
return 0
return 1
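
WithAction above turns each logical option into a --with-X/--without-X pair that adds or removes X from a shared set; a behaviour sketch reusing the WithAction class defined above.

# Behaviour sketch for WithAction: the default seeds the set when the
# argument is registered, --without-* removes the entry again.
import argparse

options: set = set()
parser = argparse.ArgumentParser()
parser.add_argument('--reverse', action=WithAction, dest_set=options,
                    default=True, help='toggle reverse support')

parser.parse_args([])                     # options == {'reverse'}
parser.parse_args(['--without-reverse'])  # options == set()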

src/nominatim_db/clicmd/export.py Normal file

@@ -0,0 +1,200 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'export' subcommand.
"""
from typing import Optional, List, cast
import logging
import argparse
import asyncio
import csv
import sys
import sqlalchemy as sa
import nominatim_api as napi
from nominatim_api.results import create_from_placex_row, ReverseResult, add_result_details
from nominatim_api.types import LookupDetails
from nominatim_core.errors import UsageError
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid potentially unused imports.
# pylint: disable=E0012,C0415
# Needed for SQLAlchemy
# pylint: disable=singleton-comparison
LOG = logging.getLogger()
RANK_RANGE_MAP = {
'country': (4, 4),
'state': (5, 9),
'county': (10, 12),
'city': (13, 16),
'suburb': (17, 21),
'street': (26, 26),
'path': (27, 27)
}
RANK_TO_OUTPUT_MAP = {
4: 'country',
5: 'state', 6: 'state', 7: 'state', 8: 'state', 9: 'state',
10: 'county', 11: 'county', 12: 'county',
13: 'city', 14: 'city', 15: 'city', 16: 'city',
17: 'suburb', 18: 'suburb', 19: 'suburb', 20: 'suburb', 21: 'suburb',
26: 'street', 27: 'path'}
class QueryExport:
"""\
    Export places from the database as a CSV file.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Output arguments')
group.add_argument('--output-type', default='street',
choices=('country', 'state', 'county',
'city', 'suburb', 'street', 'path'),
help='Type of places to output (default: street)')
group.add_argument('--output-format',
default='street;suburb;city;county;state;country',
help=("Semicolon-separated list of address types "
"(see --output-type). Additionally accepts:"
"placeid,postcode"))
group.add_argument('--language',
help=("Preferred language for output "
"(use local name, if omitted)"))
group = parser.add_argument_group('Filter arguments')
group.add_argument('--restrict-to-country', metavar='COUNTRY_CODE',
help='Export only objects within country')
group.add_argument('--restrict-to-osm-node', metavar='ID', type=int,
dest='node',
help='Export only children of this OSM node')
group.add_argument('--restrict-to-osm-way', metavar='ID', type=int,
dest='way',
help='Export only children of this OSM way')
group.add_argument('--restrict-to-osm-relation', metavar='ID', type=int,
dest='relation',
help='Export only children of this OSM relation')
def run(self, args: NominatimArgs) -> int:
return asyncio.run(export(args))
async def export(args: NominatimArgs) -> int:
""" The actual export as a asynchronous function.
"""
api = napi.NominatimAPIAsync(args.project_dir)
try:
output_range = RANK_RANGE_MAP[args.output_type]
writer = init_csv_writer(args.output_format)
async with api.begin() as conn, api.begin() as detail_conn:
t = conn.t.placex
sql = sa.select(t.c.place_id, t.c.parent_place_id,
t.c.osm_type, t.c.osm_id, t.c.name,
t.c.class_, t.c.type, t.c.admin_level,
t.c.address, t.c.extratags,
t.c.housenumber, t.c.postcode, t.c.country_code,
t.c.importance, t.c.wikipedia, t.c.indexed_date,
t.c.rank_address, t.c.rank_search,
t.c.centroid)\
.where(t.c.linked_place_id == None)\
.where(t.c.rank_address.between(*output_range))
parent_place_id = await get_parent_id(conn, args.node, args.way, args.relation)
if parent_place_id:
taddr = conn.t.addressline
sql = sql.join(taddr, taddr.c.place_id == t.c.place_id)\
.where(taddr.c.address_place_id == parent_place_id)\
.where(taddr.c.isaddress)
if args.restrict_to_country:
sql = sql.where(t.c.country_code == args.restrict_to_country.lower())
results = []
for row in await conn.execute(sql):
result = create_from_placex_row(row, ReverseResult)
if result is not None:
results.append(result)
if len(results) == 1000:
await dump_results(detail_conn, results, writer, args.language)
results = []
if results:
await dump_results(detail_conn, results, writer, args.language)
finally:
await api.close()
return 0
def init_csv_writer(output_format: str) -> 'csv.DictWriter[str]':
fields = output_format.split(';')
writer = csv.DictWriter(sys.stdout, fieldnames=fields, extrasaction='ignore')
writer.writeheader()
return writer
async def dump_results(conn: napi.SearchConnection,
results: List[ReverseResult],
writer: 'csv.DictWriter[str]',
lang: Optional[str]) -> None:
locale = napi.Locales([lang] if lang else None)
await add_result_details(conn, results,
LookupDetails(address_details=True, locales=locale))
for result in results:
data = {'placeid': result.place_id,
'postcode': result.postcode}
for line in (result.address_rows or []):
if line.isaddress and line.local_name:
if line.category[1] == 'postcode':
data['postcode'] = line.local_name
elif line.rank_address in RANK_TO_OUTPUT_MAP:
data[RANK_TO_OUTPUT_MAP[line.rank_address]] = line.local_name
writer.writerow(data)
async def get_parent_id(conn: napi.SearchConnection, node_id: Optional[int],
way_id: Optional[int],
relation_id: Optional[int]) -> Optional[int]:
""" Get the place ID for the given OSM object.
"""
if node_id is not None:
osm_type, osm_id = 'N', node_id
elif way_id is not None:
osm_type, osm_id = 'W', way_id
elif relation_id is not None:
osm_type, osm_id = 'R', relation_id
else:
return None
t = conn.t.placex
sql = sa.select(t.c.place_id).limit(1)\
.where(t.c.osm_type == osm_type)\
.where(t.c.osm_id == osm_id)\
.where(t.c.rank_address > 0)\
.order_by(t.c.rank_address)
for result in await conn.execute(sql):
return cast(int, result[0])
raise UsageError(f'Cannot find a place {osm_type}{osm_id}.')
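
The two rank maps above are complementary: RANK_RANGE_MAP turns --output-type into the rank window for the main query, and RANK_TO_OUTPUT_MAP folds each address row back into a CSV column. A quick consistency sketch using the maps defined above:

# Sketch showing the two rank maps above agree for the 'city' output type.
low, high = RANK_RANGE_MAP['city']     # (13, 16)
assert all(RANK_TO_OUTPUT_MAP[rank] == 'city'
           for rank in range(low, high + 1))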

src/nominatim_db/clicmd/freeze.py Normal file

@@ -0,0 +1,43 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'freeze' subcommand.
"""
import argparse
from nominatim_core.db.connection import connect
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid potentially unused imports.
# pylint: disable=E0012,C0415
class SetupFreeze:
"""\
Make database read-only.
    About half of the data in the Nominatim database is kept only to be able to
keep the data up-to-date with new changes made in OpenStreetMap. This
command drops all this data and only keeps the part needed for geocoding
itself.
This command has the same effect as the `--no-updates` option for imports.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
pass # No options
def run(self, args: NominatimArgs) -> int:
from ..tools import freeze
with connect(args.config.get_libpq_dsn()) as conn:
freeze.drop_update_tables(conn)
freeze.drop_flatnode_file(args.config.get_path('FLATNODE_FILE'))
return 0

src/nominatim_db/clicmd/index.py Normal file

@@ -0,0 +1,66 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'index' subcommand.
"""
import argparse
import psutil
from nominatim_core.db import status
from nominatim_core.db.connection import connect
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid potentially unused imports.
# pylint: disable=E0012,C0415
class UpdateIndex:
"""\
Reindex all new and modified data.
Indexing is the process of computing the address and search terms for
the places in the database. Every time data is added or changed, indexing
needs to be run. Imports and replication updates automatically take care
    of indexing. For other cases, this command allows running indexing manually.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Filter arguments')
group.add_argument('--boundaries-only', action='store_true',
help="""Index only administrative boundaries.""")
group.add_argument('--no-boundaries', action='store_true',
help="""Index everything except administrative boundaries.""")
group.add_argument('--minrank', '-r', type=int, metavar='RANK', default=0,
help='Minimum/starting rank')
group.add_argument('--maxrank', '-R', type=int, metavar='RANK', default=30,
help='Maximum/finishing rank')
def run(self, args: NominatimArgs) -> int:
from ..indexer.indexer import Indexer
from ..tokenizer import factory as tokenizer_factory
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
args.threads or psutil.cpu_count() or 1)
if not args.no_boundaries:
indexer.index_boundaries(args.minrank, args.maxrank)
if not args.boundaries_only:
indexer.index_by_rank(args.minrank, args.maxrank)
indexer.index_postcodes()
if not args.no_boundaries and not args.boundaries_only \
and args.minrank == 0 and args.maxrank == 30:
with connect(args.config.get_libpq_dsn()) as conn:
status.set_indexed(conn, True)
return 0
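
Note that status.set_indexed() is only called for a full default run (ranks 0 to 30, no boundary filter); a filtered run such as the hypothetical sketch below leaves the fully-indexed flag untouched. Library paths are placeholders.

# Hypothetical filtered indexing run via the CLI entry point from cli.py.
from nominatim_db.cli import nominatim

nominatim(cli_args=['index', '--boundaries-only', '--minrank', '4', '--maxrank', '25'],
          module_dir='/path/to/module', osm2pgsql_path='/path/to/osm2pgsql')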

src/nominatim_db/clicmd/refresh.py Normal file

@@ -0,0 +1,187 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of 'refresh' subcommand.
"""
from typing import Tuple, Optional
import argparse
import logging
from pathlib import Path
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect
from ..tokenizer.base import AbstractTokenizer
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid potentially unused imports.
# pylint: disable=E0012,C0415
LOG = logging.getLogger()
def _parse_osm_object(obj: str) -> Tuple[str, int]:
""" Parse the given argument into a tuple of OSM type and ID.
Raises an ArgumentError if the format is not recognized.
"""
if len(obj) < 2 or obj[0].lower() not in 'nrw' or not obj[1:].isdigit():
raise argparse.ArgumentTypeError("Cannot parse OSM ID. Expect format: [N|W|R]<id>.")
return (obj[0].upper(), int(obj[1:]))
class UpdateRefresh:
"""\
Recompute auxiliary data used by the indexing process.
    This sub-command updates various static data and functions in the database.
    It usually needs to be run after changing various aspects of the
    configuration. The configuration documentation will mention the exact
    command to use in such cases.
    Warning: the 'refresh' command must not be run in parallel with other update
    commands like 'replication' or 'add-data'.
"""
def __init__(self) -> None:
self.tokenizer: Optional[AbstractTokenizer] = None
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Data arguments')
group.add_argument('--postcodes', action='store_true',
help='Update postcode centroid table')
group.add_argument('--word-tokens', action='store_true',
help='Clean up search terms')
group.add_argument('--word-counts', action='store_true',
help='Compute frequency of full-word search terms')
group.add_argument('--address-levels', action='store_true',
help='Reimport address level configuration')
group.add_argument('--functions', action='store_true',
help='Update the PL/pgSQL functions in the database')
group.add_argument('--wiki-data', action='store_true',
help='Update Wikipedia/data importance numbers')
group.add_argument('--secondary-importance', action='store_true',
help='Update secondary importance raster data')
group.add_argument('--importance', action='store_true',
help='Recompute place importances (expensive!)')
group.add_argument('--website', action='store_true',
help='Refresh the directory that serves the scripts for the web API')
group.add_argument('--data-object', action='append',
type=_parse_osm_object, metavar='OBJECT',
help='Mark the given OSM object as requiring an update'
' (format: [NWR]<id>)')
group.add_argument('--data-area', action='append',
type=_parse_osm_object, metavar='OBJECT',
help='Mark the area around the given OSM object as requiring an update'
' (format: [NWR]<id>)')
group = parser.add_argument_group('Arguments for function refresh')
group.add_argument('--no-diff-updates', action='store_false', dest='diffs',
help='Do not enable code for propagating updates')
group.add_argument('--enable-debug-statements', action='store_true',
help='Enable debug warning statements in functions')
def run(self, args: NominatimArgs) -> int: #pylint: disable=too-many-branches, too-many-statements
from ..tools import refresh, postcodes
from ..indexer.indexer import Indexer
need_function_refresh = args.functions
if args.postcodes:
if postcodes.can_compute(args.config.get_libpq_dsn()):
LOG.warning("Update postcodes centroid")
tokenizer = self._get_tokenizer(args.config)
postcodes.update_postcodes(args.config.get_libpq_dsn(),
args.project_dir, tokenizer)
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
args.threads or 1)
indexer.index_postcodes()
else:
LOG.error("The place table doesn't exist. "
"Postcode updates on a frozen database is not possible.")
if args.word_tokens:
LOG.warning('Updating word tokens')
tokenizer = self._get_tokenizer(args.config)
tokenizer.update_word_tokens()
if args.word_counts:
LOG.warning('Recompute word statistics')
self._get_tokenizer(args.config).update_statistics(args.config,
threads=args.threads or 1)
if args.address_levels:
LOG.warning('Updating address levels')
with connect(args.config.get_libpq_dsn()) as conn:
refresh.load_address_levels_from_config(conn, args.config)
# Attention: must come BEFORE functions
if args.secondary_importance:
with connect(args.config.get_libpq_dsn()) as conn:
# If the table did not exist before, then the importance code
# needs to be enabled.
if not conn.table_exists('secondary_importance'):
args.functions = True
LOG.warning('Import secondary importance raster data from %s', args.project_dir)
if refresh.import_secondary_importance(args.config.get_libpq_dsn(),
args.project_dir) > 0:
LOG.fatal('FATAL: Cannot update secondary importance raster data')
return 1
need_function_refresh = True
if args.wiki_data:
data_path = Path(args.config.WIKIPEDIA_DATA_PATH
or args.project_dir)
LOG.warning('Import wikipedia article importance from %s', data_path)
if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
data_path) > 0:
LOG.fatal('FATAL: Wikipedia importance file not found in %s', data_path)
return 1
need_function_refresh = True
if need_function_refresh:
LOG.warning('Create functions')
with connect(args.config.get_libpq_dsn()) as conn:
refresh.create_functions(conn, args.config,
args.diffs, args.enable_debug_statements)
self._get_tokenizer(args.config).update_sql_functions(args.config)
# Attention: importance MUST come after wiki data import and after functions.
if args.importance:
LOG.warning('Update importance values for database')
with connect(args.config.get_libpq_dsn()) as conn:
refresh.recompute_importance(conn)
if args.website:
webdir = args.project_dir / 'website'
LOG.warning('Setting up website directory at %s', webdir)
# This is a little bit hacky: call the tokenizer setup, so that
# the tokenizer directory gets repopulated as well, in case it
# wasn't there yet.
self._get_tokenizer(args.config)
with connect(args.config.get_libpq_dsn()) as conn:
refresh.setup_website(webdir, args.config, conn)
if args.data_object or args.data_area:
with connect(args.config.get_libpq_dsn()) as conn:
for obj in args.data_object or []:
refresh.invalidate_osm_object(*obj, conn, recursive=False)
for obj in args.data_area or []:
refresh.invalidate_osm_object(*obj, conn, recursive=True)
conn.commit()
return 0
def _get_tokenizer(self, config: Configuration) -> AbstractTokenizer:
if self.tokenizer is None:
from ..tokenizer import factory as tokenizer_factory
self.tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
return self.tokenizer
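
A behaviour sketch of _parse_osm_object() above, which backs the --data-object/--data-area argparse type hooks:

# _parse_osm_object() accepts a case-insensitive N/R/W prefix plus an ID.
import argparse

assert _parse_osm_object('N123') == ('N', 123)
assert _parse_osm_object('w42') == ('W', 42)

try:
    _parse_osm_object('X99')          # unknown type prefix
except argparse.ArgumentTypeError:
    pass                              # rejected with a parse error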

src/nominatim_db/clicmd/replication.py Normal file

@@ -0,0 +1,200 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'replication' sub-command.
"""
from typing import Optional
import argparse
import datetime as dt
import logging
import socket
import time
from nominatim_core.db import status
from nominatim_core.db.connection import connect
from nominatim_core.errors import UsageError
from .args import NominatimArgs
LOG = logging.getLogger()
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to make pyosmium optional for replication only.
# pylint: disable=C0415
class UpdateReplication:
"""\
Update the database using an online replication service.
    An OSM replication service is an online service that provides regular
    updates (OSM diff files) for the planet or an extract of it. The OSMF
    provides the primary replication service for the full planet at
    https://planet.osm.org/replication/ but there are other providers of
    extracts of OSM data who provide such a service as well.
    This sub-command allows you to set up such a replication service and
    download and import updates at regular intervals. You need to call '--init' once to
set up the process or whenever you change the replication configuration
parameters. Without any arguments, the sub-command will go into a loop and
continuously apply updates as they become available. Giving `--once` just
downloads and imports the next batch of updates.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Arguments for initialisation')
group.add_argument('--init', action='store_true',
help='Initialise the update process')
group.add_argument('--no-update-functions', dest='update_functions',
action='store_false',
help="Do not update the trigger function to "
"support differential updates (EXPERT)")
group = parser.add_argument_group('Arguments for updates')
group.add_argument('--check-for-updates', action='store_true',
help='Check if new updates are available and exit')
group.add_argument('--once', action='store_true',
help="Download and apply updates only once. When "
"not set, updates are continuously applied")
group.add_argument('--catch-up', action='store_true',
help="Download and apply updates until no new "
"data is available on the server")
group.add_argument('--no-index', action='store_false', dest='do_index',
help=("Do not index the new data. Only usable "
"together with --once"))
group.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
help='Size of cache to be used by osm2pgsql (in MB)')
group = parser.add_argument_group('Download parameters')
group.add_argument('--socket-timeout', dest='socket_timeout', type=int, default=60,
help='Set timeout for file downloads')
def _init_replication(self, args: NominatimArgs) -> int:
from ..tools import replication, refresh
LOG.warning("Initialising replication updates")
with connect(args.config.get_libpq_dsn()) as conn:
replication.init_replication(conn, base_url=args.config.REPLICATION_URL,
socket_timeout=args.socket_timeout)
if args.update_functions:
LOG.warning("Create functions")
refresh.create_functions(conn, args.config, True, False)
return 0
def _check_for_updates(self, args: NominatimArgs) -> int:
from ..tools import replication
with connect(args.config.get_libpq_dsn()) as conn:
return replication.check_for_updates(conn, base_url=args.config.REPLICATION_URL,
socket_timeout=args.socket_timeout)
def _report_update(self, batchdate: dt.datetime,
start_import: dt.datetime,
start_index: Optional[dt.datetime]) -> None:
def round_time(delta: dt.timedelta) -> dt.timedelta:
return dt.timedelta(seconds=int(delta.total_seconds()))
end = dt.datetime.now(dt.timezone.utc)
LOG.warning("Update completed. Import: %s. %sTotal: %s. Remaining backlog: %s.",
round_time((start_index or end) - start_import),
f"Indexing: {round_time(end - start_index)} " if start_index else '',
round_time(end - start_import),
round_time(end - batchdate))
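# Illustrative output (values invented): "Update completed. Import: 0:00:12.
# Indexing: 0:01:03 Total: 0:01:15. Remaining backlog: 0:35:40."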
def _compute_update_interval(self, args: NominatimArgs) -> int:
if args.catch_up:
return 0
update_interval = args.config.get_int('REPLICATION_UPDATE_INTERVAL')
# Sanity check to not overwhelm the Geofabrik servers.
if 'download.geofabrik.de' in args.config.REPLICATION_URL\
and update_interval < 86400:
LOG.fatal("Update interval too low for download.geofabrik.de.\n"
"Please check install documentation "
"(https://nominatim.org/release-docs/latest/admin/Import-and-Update#"
"setting-up-the-update-process).")
raise UsageError("Invalid replication update interval setting.")
return update_interval
def _update(self, args: NominatimArgs) -> None:
# pylint: disable=too-many-locals
from ..tools import replication
from ..indexer.indexer import Indexer
from ..tokenizer import factory as tokenizer_factory
update_interval = self._compute_update_interval(args)
params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
params.update(base_url=args.config.REPLICATION_URL,
update_interval=update_interval,
import_file=args.project_dir / 'osmosischange.osc',
max_diff_size=args.config.get_int('REPLICATION_MAX_DIFF'),
indexed_only=not args.once)
if not args.once:
if not args.do_index:
LOG.fatal("Indexing cannot be disabled when running updates continuously.")
raise UsageError("Bad argument '--no-index'.")
recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL')
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer, args.threads or 1)
dsn = args.config.get_libpq_dsn()
while True:
start = dt.datetime.now(dt.timezone.utc)
state = replication.update(dsn, params, socket_timeout=args.socket_timeout)
with connect(dsn) as conn:
if state is not replication.UpdateState.NO_CHANGES:
status.log_status(conn, start, 'import')
batchdate, _, _ = status.get_status(conn)
conn.commit()
if state is not replication.UpdateState.NO_CHANGES and args.do_index:
index_start = dt.datetime.now(dt.timezone.utc)
indexer.index_full(analyse=False)
with connect(dsn) as conn:
status.set_indexed(conn, True)
status.log_status(conn, index_start, 'index')
conn.commit()
else:
index_start = None
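# Index any remaining backlog when a catch-up run has drained all diffs or
# when updates are infrequent (more than 40 minutes apart), so indexing
# always happens right away.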
if (state is replication.UpdateState.NO_CHANGES and args.catch_up) \
or update_interval > 40*60:
while indexer.has_pending():
indexer.index_full(analyse=False)
if LOG.isEnabledFor(logging.WARNING):
assert batchdate is not None
self._report_update(batchdate, start, index_start)
if args.once or (args.catch_up and state is replication.UpdateState.NO_CHANGES):
break
if state is replication.UpdateState.NO_CHANGES:
LOG.warning("No new changes. Sleeping for %d sec.", recheck_interval)
time.sleep(recheck_interval)
def run(self, args: NominatimArgs) -> int:
socket.setdefaulttimeout(args.socket_timeout)
if args.init:
return self._init_replication(args)
if args.check_for_updates:
return self._check_for_updates(args)
self._update(args)
return 0

View File

@@ -0,0 +1,229 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'import' subcommand.
"""
from typing import Optional
import argparse
import logging
from pathlib import Path
import psutil
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect
from nominatim_core.db import status, properties
from ..tokenizer.base import AbstractTokenizer
from ..version import NOMINATIM_VERSION
from .args import NominatimArgs
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid loading modules that may go unused.
# pylint: disable=C0415
LOG = logging.getLogger()
class SetupAll:
"""\
Create a new Nominatim database from an OSM file.
This sub-command sets up a new Nominatim database from scratch starting
with creating a new database in PostgreSQL. The user running this command
needs superuser rights on the database.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group1 = parser.add_argument_group('Required arguments')
group1.add_argument('--osm-file', metavar='FILE', action='append',
help='OSM file to be imported'
' (repeat for importing multiple files)',
default=None)
group1.add_argument('--continue', dest='continue_at',
choices=['import-from-file', 'load-data', 'indexing', 'db-postprocess'],
help='Continue an import that was interrupted',
default=None)
group2 = parser.add_argument_group('Optional arguments')
group2.add_argument('--osm2pgsql-cache', metavar='SIZE', type=int,
help='Size of cache to be used by osm2pgsql (in MB)')
group2.add_argument('--reverse-only', action='store_true',
help='Do not create tables and indexes for searching')
group2.add_argument('--no-partitions', action='store_true',
help=("Do not partition search indices "
"(speeds up import of single country extracts)"))
group2.add_argument('--no-updates', action='store_true',
help="Do not keep tables that are only needed for "
"updating the database later")
group2.add_argument('--offline', action='store_true',
help="Do not attempt to load any additional data from the internet")
group3 = parser.add_argument_group('Expert options')
group3.add_argument('--ignore-errors', action='store_true',
help='Continue import even when errors in SQL are present')
group3.add_argument('--index-noanalyse', action='store_true',
help='Do not perform analyse operations during index (expert only)')
group3.add_argument('--prepare-database', action='store_true',
help='Create the database but do not import any data')
def run(self, args: NominatimArgs) -> int: # pylint: disable=too-many-statements, too-many-branches
from ..data import country_info
from ..tools import database_import, refresh, postcodes, freeze
from ..indexer.indexer import Indexer
num_threads = args.threads or psutil.cpu_count() or 1
country_info.setup_country_config(args.config)
if args.osm_file is None and args.continue_at is None and not args.prepare_database:
raise UsageError("No input files (use --osm-file).")
if args.osm_file is not None and args.continue_at not in ('import-from-file', None):
raise UsageError(f"Cannot use --continue {args.continue_at} and --osm-file together.")
if args.continue_at is not None and args.prepare_database:
raise UsageError(
"Cannot use --continue and --prepare-database together."
)
if args.prepare_database or args.continue_at is None:
LOG.warning('Creating database')
database_import.setup_database_skeleton(args.config.get_libpq_dsn(),
rouser=args.config.DATABASE_WEBUSER)
if args.prepare_database:
return 0
if args.continue_at in (None, 'import-from-file'):
files = args.get_osm_file_list()
if not files:
raise UsageError("No input files (use --osm-file).")
if args.continue_at in ('import-from-file', None):
# Check if the correct plugins are installed
database_import.check_existing_database_plugins(args.config.get_libpq_dsn())
LOG.warning('Setting up country tables')
country_info.setup_country_tables(args.config.get_libpq_dsn(),
args.config.lib_dir.data,
args.no_partitions)
LOG.warning('Importing OSM data file')
database_import.import_osm_data(files,
args.osm2pgsql_options(0, 1),
drop=args.no_updates,
ignore_errors=args.ignore_errors)
LOG.warning('Importing wikipedia importance data')
data_path = Path(args.config.WIKIPEDIA_DATA_PATH or args.project_dir)
if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
data_path) > 0:
LOG.error('Wikipedia importance dump file not found. '
'Calculating importance values of locations will not '
'use Wikipedia importance data.')
LOG.warning('Importing secondary importance raster data')
if refresh.import_secondary_importance(args.config.get_libpq_dsn(),
args.project_dir) != 0:
LOG.error('Secondary importance file not imported. '
'Falling back to default ranking.')
self._setup_tables(args.config, args.reverse_only)
if args.continue_at in ('import-from-file', 'load-data', None):
LOG.warning('Initialise tables')
with connect(args.config.get_libpq_dsn()) as conn:
database_import.truncate_data_tables(conn)
LOG.warning('Load data into placex table')
database_import.load_data(args.config.get_libpq_dsn(), num_threads)
LOG.warning("Setting up tokenizer")
tokenizer = self._get_tokenizer(args.continue_at, args.config)
if args.continue_at in ('import-from-file', 'load-data', None):
LOG.warning('Calculate postcodes')
postcodes.update_postcodes(args.config.get_libpq_dsn(),
args.project_dir, tokenizer)
if args.continue_at in \
('import-from-file', 'load-data', 'indexing', None):
LOG.warning('Indexing places')
indexer = Indexer(args.config.get_libpq_dsn(), tokenizer, num_threads)
indexer.index_full(analyse=not args.index_noanalyse)
LOG.warning('Post-process tables')
with connect(args.config.get_libpq_dsn()) as conn:
database_import.create_search_indices(conn, args.config,
drop=args.no_updates,
threads=num_threads)
LOG.warning('Create search index for default country names.')
country_info.create_country_names(conn, tokenizer,
args.config.get_str_list('LANGUAGES'))
if args.no_updates:
freeze.drop_update_tables(conn)
tokenizer.finalize_import(args.config)
LOG.warning('Recompute word counts')
tokenizer.update_statistics(args.config, threads=num_threads)
webdir = args.project_dir / 'website'
LOG.warning('Setup website at %s', webdir)
with connect(args.config.get_libpq_dsn()) as conn:
refresh.setup_website(webdir, args.config, conn)
self._finalize_database(args.config.get_libpq_dsn(), args.offline)
return 0
def _setup_tables(self, config: Configuration, reverse_only: bool) -> None:
""" Set up the basic database layout: tables, indexes and functions.
"""
from ..tools import database_import, refresh
with connect(config.get_libpq_dsn()) as conn:
LOG.warning('Create functions (1st pass)')
refresh.create_functions(conn, config, False, False)
LOG.warning('Create tables')
database_import.create_tables(conn, config, reverse_only=reverse_only)
refresh.load_address_levels_from_config(conn, config)
LOG.warning('Create functions (2nd pass)')
refresh.create_functions(conn, config, False, False)
LOG.warning('Create table triggers')
database_import.create_table_triggers(conn, config)
LOG.warning('Create partition tables')
database_import.create_partition_tables(conn, config)
LOG.warning('Create functions (3rd pass)')
refresh.create_functions(conn, config, False, False)
def _get_tokenizer(self, continue_at: Optional[str],
config: Configuration) -> AbstractTokenizer:
""" Set up a new tokenizer or load an already initialised one.
"""
from ..tokenizer import factory as tokenizer_factory
if continue_at in ('import-from-file', 'load-data', None):
# (re)initialise the tokenizer data
return tokenizer_factory.create_tokenizer(config)
# just load the tokenizer
return tokenizer_factory.get_tokenizer_for_db(config)
def _finalize_database(self, dsn: str, offline: bool) -> None:
""" Determine the database date and set the status accordingly.
"""
with connect(dsn) as conn:
properties.set_property(conn, 'database_version', str(NOMINATIM_VERSION))
try:
dbdate = status.compute_database_date(conn, offline)
status.set_status(conn, dbdate)
LOG.info('Database is at %s.', dbdate)
except Exception as exc: # pylint: disable=broad-except
LOG.error('Cannot determine date of database: %s', exc)
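As a hedged aid to tracing `run()` above (not part of the module), the mapping below summarizes which phases still execute for each `--continue` entry point, derived from the gating tuples in the code:
```
# Hedged summary of the resume logic in SetupAll.run(); 'create-db' also
# runs for --prepare-database. Illustration only, names are invented.
RESUME_PHASES = {
    None:               ['create-db', 'import-osm', 'load-data', 'indexing', 'post-process'],
    'import-from-file': ['import-osm', 'load-data', 'indexing', 'post-process'],
    'load-data':        ['load-data', 'indexing', 'post-process'],
    'indexing':         ['indexing', 'post-process'],
    'db-postprocess':   ['post-process'],
}
```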

View File

@@ -0,0 +1,93 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Implementation of the 'special-phrases' command.
"""
import argparse
import logging
from pathlib import Path
from nominatim_core.errors import UsageError
from nominatim_core.db.connection import connect
from ..tools.special_phrases.sp_importer import SPImporter, SpecialPhraseLoader
from ..tools.special_phrases.sp_wiki_loader import SPWikiLoader
from ..tools.special_phrases.sp_csv_loader import SPCsvLoader
from .args import NominatimArgs
LOG = logging.getLogger()
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
# Using non-top-level imports to avoid loading modules that may go unused.
# pylint: disable=E0012,C0415
class ImportSpecialPhrases:
"""\
Import special phrases.
Special phrases are search terms that narrow down the type of object
that should be searched. For example, you might want to search for
'Hotels in Barcelona'. The OSM wiki has a selection of special phrases
in many languages, which can be imported with this command.
You can also provide your own phrases in a CSV file. The file needs to have
the following five columns:
* phrase - the term expected for searching
* class - the OSM tag key of the object type
* type - the OSM tag value of the object type
* operator - the kind of search to be done (one of: in, near, name, -)
* plural - whether the term is a plural or not (Y/N)
An example file can be found in the Nominatim sources at
'test/testdb/full_en_phrases_test.csv'.
The import can be further configured to ignore specific key/value pairs.
This is particularly useful when importing phrases from the wiki. The
default configuration excludes some very common tags like building=yes.
The configuration can be customized by putting a file `phrase-settings.json`
with custom rules into the project directory or by using the `--config`
option to point to another configuration file.
"""
def add_args(self, parser: argparse.ArgumentParser) -> None:
group = parser.add_argument_group('Input arguments')
group.add_argument('--import-from-wiki', action='store_true',
help='Import special phrases from the OSM wiki to the database')
group.add_argument('--import-from-csv', metavar='FILE',
help='Import special phrases from a CSV file')
group.add_argument('--no-replace', action='store_true',
help='Keep the old phrases and only add the new ones')
def run(self, args: NominatimArgs) -> int:
if args.import_from_wiki:
self.start_import(args, SPWikiLoader(args.config))
if args.import_from_csv:
if not Path(args.import_from_csv).is_file():
LOG.fatal("CSV file '%s' does not exist.", args.import_from_csv)
raise UsageError('Cannot access file.')
self.start_import(args, SPCsvLoader(args.import_from_csv))
return 0
def start_import(self, args: NominatimArgs, loader: SpecialPhraseLoader) -> None:
"""
Create the SPImporter object containing the right
sp loader and then start the import of special phrases.
"""
from ..tokenizer import factory as tokenizer_factory
tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
should_replace = not args.no_replace
with connect(args.config.get_libpq_dsn()) as db_connection:
SPImporter(
args.config, db_connection, loader
).import_phrases(tokenizer, should_replace)
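As a hedged illustration of the CSV format described in the docstring above (file name and phrases invented; a header row is assumed):
```
import csv

rows = [
    ('Hotel', 'tourism', 'hotel', 'in', 'N'),
    ('Hotels', 'tourism', 'hotel', 'in', 'Y'),
    ('Restaurant near', 'amenity', 'restaurant', 'near', 'N'),
]
with open('my_phrases.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(('phrase', 'class', 'type', 'operator', 'plural'))
    writer.writerows(rows)
```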

View File

@@ -0,0 +1,175 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for importing and managing static country information.
"""
from typing import Dict, Any, Iterable, Tuple, Optional, Container, overload
from pathlib import Path
import psycopg2.extras
from nominatim_core.db import utils as db_utils
from nominatim_core.db.connection import connect, Connection
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from ..tokenizer.base import AbstractTokenizer
def _flatten_name_list(names: Any) -> Dict[str, str]:
if names is None:
return {}
if not isinstance(names, dict):
raise UsageError("Expected key-value list for names in country_settings.py")
flat = {}
for prefix, remain in names.items():
if isinstance(remain, str):
flat[prefix] = remain
elif not isinstance(remain, dict):
raise UsageError("Entries in names must be key-value lists.")
else:
for suffix, name in remain.items():
if suffix == 'default':
flat[prefix] = name
else:
flat[f'{prefix}:{suffix}'] = name
return flat
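# Illustration (made-up input): {'name': {'default': 'España', 'en': 'Spain'},
# 'int_name': 'Spain'} flattens to
# {'name': 'España', 'name:en': 'Spain', 'int_name': 'Spain'}.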
class _CountryInfo:
""" Caches country-specific properties from the configuration file.
"""
def __init__(self) -> None:
self._info: Dict[str, Dict[str, Any]] = {}
def load(self, config: Configuration) -> None:
""" Load the country properties from the configuration files,
if they are not loaded yet.
"""
if not self._info:
self._info = config.load_sub_configuration('country_settings.yaml')
for prop in self._info.values():
# Convert languages into a list for simpler handling.
if 'languages' not in prop:
prop['languages'] = []
elif not isinstance(prop['languages'], list):
prop['languages'] = [x.strip()
for x in prop['languages'].split(',')]
prop['names'] = _flatten_name_list(prop.get('names'))
def items(self) -> Iterable[Tuple[str, Dict[str, Any]]]:
""" Return tuples of (country_code, property dict) as iterable.
"""
return self._info.items()
def get(self, country_code: str) -> Dict[str, Any]:
""" Get country information for the country with the given country code.
"""
return self._info.get(country_code, {})
_COUNTRY_INFO = _CountryInfo()
def setup_country_config(config: Configuration) -> None:
""" Load country properties from the configuration file.
Needs to be called before using any other functions in this
file.
"""
_COUNTRY_INFO.load(config)
@overload
def iterate() -> Iterable[Tuple[str, Dict[str, Any]]]:
...
@overload
def iterate(prop: str) -> Iterable[Tuple[str, Any]]:
...
def iterate(prop: Optional[str] = None) -> Iterable[Tuple[str, Dict[str, Any]]]:
""" Iterate over country code and properties.
When `prop` is None, all countries are returned with their complete
set of properties.
If `prop` is given, then only countries are returned where the
given property is set. The second item of the tuple contains only
the content of the given property.
"""
if prop is None:
return _COUNTRY_INFO.items()
return ((c, p[prop]) for c, p in _COUNTRY_INFO.items() if prop in p)
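# Example: iterate('postcode') yields only countries that define a 'postcode'
# property, paired with just that property's value (see PostcodeFormatter).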
def setup_country_tables(dsn: str, sql_dir: Path, ignore_partitions: bool = False) -> None:
""" Create and populate the tables with basic static data that provides
the background for geocoding. Data is assumed to not yet exist.
"""
db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz')
params = []
for ccode, props in _COUNTRY_INFO.items():
if ccode is not None and props is not None:
if ignore_partitions:
partition = 0
else:
partition = props.get('partition', 0)
lang = props['languages'][0] if len(
props['languages']) == 1 else None
params.append((ccode, props['names'], lang, partition))
with connect(dsn) as conn:
with conn.cursor() as cur:
psycopg2.extras.register_hstore(cur)
cur.execute(
""" CREATE TABLE public.country_name (
country_code character varying(2),
name public.hstore,
derived_name public.hstore,
country_default_language_code text,
partition integer
); """)
cur.execute_values(
""" INSERT INTO public.country_name
(country_code, name, country_default_language_code, partition) VALUES %s
""", params)
conn.commit()
def create_country_names(conn: Connection, tokenizer: AbstractTokenizer,
languages: Optional[Container[str]] = None) -> None:
""" Add default country names to search index. `languages` is a comma-
separated list of language codes as used in OSM. If `languages` is not
empty then only name translations for the given languages are added
to the index.
"""
def _include_key(key: str) -> bool:
return ':' not in key or not languages or \
key[key.index(':') + 1:] in languages
with conn.cursor() as cur:
psycopg2.extras.register_hstore(cur)
cur.execute("""SELECT country_code, name FROM country_name
WHERE country_code is not null""")
with tokenizer.name_analyzer() as analyzer:
for code, name in cur:
names = {'countrycode': code}
# country names (only in languages as provided)
if name:
names.update({k : v for k, v in name.items() if _include_key(k)})
analyzer.add_country_names(code, names)
conn.commit()

View File

@@ -0,0 +1,86 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Wrapper around place information the indexer gets from the database and hands to
the tokenizer.
"""
from typing import Optional, Mapping, Any, Tuple
class PlaceInfo:
""" This data class contains all information the tokenizer can access
about a place.
"""
def __init__(self, info: Mapping[str, Any]) -> None:
self._info = info
@property
def name(self) -> Optional[Mapping[str, str]]:
""" A dictionary with the names of the place. Keys and values represent
the full key and value of the corresponding OSM tag. Which tags
are saved as names is determined by the import style.
The property may be None if the place has no names.
"""
return self._info.get('name')
@property
def address(self) -> Optional[Mapping[str, str]]:
""" A dictionary with the address elements of the place. They key
usually corresponds to the suffix part of the key of an OSM
'addr:*' or 'isin:*' tag. There are also some special keys like
`country` or `country_code` which merge OSM keys that contain
the same information. See [Import Styles][1] for details.
The property may be None if the place has no address information.
[1]: ../customize/Import-Styles.md
"""
return self._info.get('address')
@property
def country_code(self) -> Optional[str]:
""" The country code of the country the place is in. Guaranteed
to be a two-letter lower-case string. If the place is not inside
any country, the property is set to None.
"""
return self._info.get('country_code')
@property
def rank_address(self) -> int:
""" The [rank address][1] before any rank correction is applied.
[1]: ../customize/Ranking.md#address-rank
"""
return self._info.get('rank_address', 0)
@property
def centroid(self) -> Optional[Tuple[float, float]]:
""" A center point of the place in WGS84. May be None when the
geometry of the place is unknown.
"""
x, y = self._info.get('centroid_x'), self._info.get('centroid_y')
return None if x is None or y is None else (x, y)
def is_a(self, key: str, value: str) -> bool:
""" Set to True when the place's primary tag corresponds to the given
key and value.
"""
return self._info.get('class') == key and self._info.get('type') == value
def is_country(self) -> bool:
""" Set to True when the place is a valid country boundary.
"""
return self.rank_address == 4 \
and self.is_a('boundary', 'administrative') \
and self.country_code is not None
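A hedged usage sketch (field names from the properties above, values invented), assuming the class is importable:
```
info = PlaceInfo({'name': {'name': 'Berlin'},
                  'class': 'boundary', 'type': 'administrative',
                  'rank_address': 4, 'country_code': 'de',
                  'centroid_x': 13.4, 'centroid_y': 52.5})
assert info.is_a('boundary', 'administrative')
assert info.is_country()
assert info.centroid == (13.4, 52.5)
```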

View File

@@ -0,0 +1,78 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Data class for a single name of a place.
"""
from typing import Optional, Dict, Mapping
class PlaceName:
""" Each name and address part of a place is encapsulated in an object of
this class. It saves not only the name proper but also describes the
kind of name with two properties:
* `kind` describes the name of the OSM key used without any suffixes
(i.e. the part after the colon removed)
* `suffix` contains the suffix of the OSM tag, if any. The suffix
is the part of the key after the first colon.
In addition to that, a name may have arbitrary additional attributes.
How attributes are used depends on the sanitizers and token analysers.
The exception is the 'analyzer' attribute. This attribute determines
which token analysis module will be used to finalize the treatment of
names.
"""
def __init__(self, name: str, kind: str, suffix: Optional[str]):
self.name = name
self.kind = kind
self.suffix = suffix
self.attr: Dict[str, str] = {}
def __repr__(self) -> str:
return f"PlaceName(name={self.name!r},kind={self.kind!r},suffix={self.suffix!r})"
def clone(self, name: Optional[str] = None,
kind: Optional[str] = None,
suffix: Optional[str] = None,
attr: Optional[Mapping[str, str]] = None) -> 'PlaceName':
""" Create a deep copy of the place name, optionally with the
given parameters replaced. In the attribute list only the given
keys are updated. The list is not replaced completely.
In particular, the function cannot be used to remove an
attribute from a place name.
"""
newobj = PlaceName(name or self.name,
kind or self.kind,
suffix or self.suffix)
newobj.attr.update(self.attr)
if attr:
newobj.attr.update(attr)
return newobj
def set_attr(self, key: str, value: str) -> None:
""" Add the given property to the name. If the property was already
set, then the value is overwritten.
"""
self.attr[key] = value
def get_attr(self, key: str, default: Optional[str] = None) -> Optional[str]:
""" Return the given property or the value of 'default' if it
is not set.
"""
return self.attr.get(key, default)
def has_attr(self, key: str) -> bool:
""" Check if the given attribute is set.
"""
return key in self.attr
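A brief hedged sketch (names invented) showing how clone() merges rather than replaces attributes:
```
name = PlaceName('Hauptstraße', 'name', None)
name.set_attr('analyzer', 'street')
copy = name.clone(suffix='de', attr={'variant': 'ascii'})
assert copy.get_attr('analyzer') == 'street'   # inherited from the original
assert copy.get_attr('variant') == 'ascii'     # newly added
assert not name.has_attr('variant')            # the original is untouched
```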

View File

@@ -0,0 +1,114 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for formatting postcodes according to their country-specific
format.
"""
from typing import Any, Mapping, Optional, Set, Match
import re
from nominatim_core.errors import UsageError
from . import country_info
class CountryPostcodeMatcher:
""" Matches and formats a postcode according to a format definition
of the given country.
"""
def __init__(self, country_code: str, config: Mapping[str, Any]) -> None:
if 'pattern' not in config:
raise UsageError("Field 'pattern' required for 'postcode' "
f"for country '{country_code}'")
pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?({pc_pattern})\\s*')
self.pattern = re.compile(pc_pattern)
self.output = config.get('output', r'\g<0>')
def match(self, postcode: str) -> Optional[Match[str]]:
""" Match the given postcode against the postcode pattern for this
matcher. Returns a `re.Match` object if the match was successful
and None otherwise.
"""
# Upper-case, strip spaces and leading country code.
normalized = self.norm_pattern.fullmatch(postcode.upper())
if normalized:
return self.pattern.fullmatch(normalized.group(1))
return None
def normalize(self, match: Match[str]) -> str:
""" Return the default format of the postcode for the given match.
`match` must be a `re.Match` object previously returned by
`match()`
"""
return match.expand(self.output)
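# Illustration (assumed configuration): for country 'de' with
# {'pattern': 'ddddd'}, the pattern expands to '[0-9][0-9][0-9][0-9][0-9]' and
# match('DE-12345') strips the country prefix, matching the plain '12345'.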
class PostcodeFormatter:
""" Container for different postcode formats of the world and
access functions.
"""
def __init__(self) -> None:
# Objects without a country code can't have a postcode by definition.
self.country_without_postcode: Set[Optional[str]] = {None}
self.country_matcher = {}
self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'})
for ccode, prop in country_info.iterate('postcode'):
if prop is False:
self.country_without_postcode.add(ccode)
elif isinstance(prop, dict):
self.country_matcher[ccode] = CountryPostcodeMatcher(ccode, prop)
else:
raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
def set_default_pattern(self, pattern: str) -> None:
""" Set the postcode match pattern to use, when a country does not
have a specific pattern.
"""
self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
def get_matcher(self, country_code: Optional[str]) -> Optional[CountryPostcodeMatcher]:
""" Return the CountryPostcodeMatcher for the given country.
Returns None if the country doesn't have a postcode and the
default matcher if there is no specific matcher configured for
the country.
"""
if country_code in self.country_without_postcode:
return None
assert country_code is not None
return self.country_matcher.get(country_code, self.default_matcher)
def match(self, country_code: Optional[str], postcode: str) -> Optional[Match[str]]:
""" Match the given postcode against the postcode pattern for this
matcher. Returns a `re.Match` object if the country has a pattern
and the match was successful or None if the match failed.
"""
if country_code in self.country_without_postcode:
return None
assert country_code is not None
return self.country_matcher.get(country_code, self.default_matcher).match(postcode)
def normalize(self, country_code: str, match: Match[str]) -> str:
""" Return the default format of the postcode for the given match.
`match` must be a `re.Match` object previously returned by
`match()`
"""
return self.country_matcher.get(country_code, self.default_matcher).normalize(match)

View File

@@ -0,0 +1,242 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Main work horse for indexing (computing addresses) the database.
"""
from typing import Optional, Any, cast
import logging
import time
import psycopg2.extras
from nominatim_core.typing import DictCursorResults
from nominatim_core.db.async_connection import DBConnection, WorkerPool
from nominatim_core.db.connection import connect, Connection, Cursor
from ..tokenizer.base import AbstractTokenizer
from .progress import ProgressLogger
from . import runners
LOG = logging.getLogger()
class PlaceFetcher:
""" Asynchronous connection that fetches place details for processing.
"""
def __init__(self, dsn: str, setup_conn: Connection) -> None:
self.wait_time = 0.0
self.current_ids: Optional[DictCursorResults] = None
self.conn: Optional[DBConnection] = DBConnection(dsn,
cursor_factory=psycopg2.extras.DictCursor)
with setup_conn.cursor() as cur:
# The hstore OIDs need to be fetched manually because register_hstore
# cannot fetch them on the asynchronous connection used below.
hstore_oid = cur.scalar("SELECT 'hstore'::regtype::oid")
hstore_array_oid = cur.scalar("SELECT 'hstore[]'::regtype::oid")
psycopg2.extras.register_hstore(self.conn.conn, oid=hstore_oid,
array_oid=hstore_array_oid)
def close(self) -> None:
""" Close the underlying asynchronous connection.
"""
if self.conn:
self.conn.close()
self.conn = None
def fetch_next_batch(self, cur: Cursor, runner: runners.Runner) -> bool:
""" Send a request for the next batch of places.
If details for the places are required, they will be fetched
asynchronously.
Returns true if there is still data available.
"""
ids = cast(Optional[DictCursorResults], cur.fetchmany(100))
if not ids:
self.current_ids = None
return False
assert self.conn is not None
self.current_ids = runner.get_place_details(self.conn, ids)
return True
def get_batch(self) -> DictCursorResults:
""" Get the next batch of data, previously requested with
`fetch_next_batch`.
"""
assert self.conn is not None
assert self.conn.cursor is not None
if self.current_ids is not None and not self.current_ids:
tstart = time.time()
self.conn.wait()
self.wait_time += time.time() - tstart
self.current_ids = cast(Optional[DictCursorResults],
self.conn.cursor.fetchall())
return self.current_ids if self.current_ids is not None else []
def __enter__(self) -> 'PlaceFetcher':
return self
def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
assert self.conn is not None
self.conn.wait()
self.close()
class Indexer:
""" Main indexing routine.
"""
def __init__(self, dsn: str, tokenizer: AbstractTokenizer, num_threads: int):
self.dsn = dsn
self.tokenizer = tokenizer
self.num_threads = num_threads
def has_pending(self) -> bool:
""" Check if any data still needs indexing.
This function must only be used after the import has finished.
Otherwise it will be very expensive.
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
cur.execute("SELECT 'a' FROM placex WHERE indexed_status > 0 LIMIT 1")
return cur.rowcount > 0
def index_full(self, analyse: bool = True) -> None:
""" Index the complete database. This will first index boundaries
followed by all other objects. When `analyse` is True, then the
database will be analysed at the appropriate places to
ensure that database statistics are updated.
"""
with connect(self.dsn) as conn:
conn.autocommit = True
def _analyze() -> None:
if analyse:
with conn.cursor() as cur:
cur.execute('ANALYZE')
if self.index_by_rank(0, 4) > 0:
_analyze()
if self.index_boundaries(0, 30) > 100:
_analyze()
if self.index_by_rank(5, 25) > 100:
_analyze()
if self.index_by_rank(26, 30) > 1000:
_analyze()
if self.index_postcodes() > 100:
_analyze()
def index_boundaries(self, minrank: int, maxrank: int) -> int:
""" Index only administrative boundaries within the given rank range.
"""
total = 0
LOG.warning("Starting indexing boundaries using %s threads",
self.num_threads)
with self.tokenizer.name_analyzer() as analyzer:
for rank in range(max(minrank, 4), min(maxrank, 26)):
total += self._index(runners.BoundaryRunner(rank, analyzer))
return total
def index_by_rank(self, minrank: int, maxrank: int) -> int:
""" Index all entries of placex in the given rank range (inclusive)
in order of their address rank.
When rank 30 is requested then also interpolations and
places with address rank 0 will be indexed.
"""
total = 0
maxrank = min(maxrank, 30)
LOG.warning("Starting indexing rank (%i to %i) using %i threads",
minrank, maxrank, self.num_threads)
with self.tokenizer.name_analyzer() as analyzer:
for rank in range(max(1, minrank), maxrank + 1):
total += self._index(runners.RankRunner(rank, analyzer), 20 if rank == 30 else 1)
if maxrank == 30:
total += self._index(runners.RankRunner(0, analyzer))
total += self._index(runners.InterpolationRunner(analyzer), 20)
return total
def index_postcodes(self) -> int:
"""Index the entries of the location_postcode table.
"""
LOG.warning("Starting indexing postcodes using %s threads", self.num_threads)
return self._index(runners.PostcodeRunner(), 20)
def update_status_table(self) -> None:
""" Update the status in the status table to 'indexed'.
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
cur.execute('UPDATE import_status SET indexed = true')
conn.commit()
def _index(self, runner: runners.Runner, batch: int = 1) -> int:
""" Index a single rank or table. `runner` describes the SQL to use
for indexing. `batch` describes the number of objects that
should be processed with a single SQL statement
"""
LOG.warning("Starting %s (using batch size %s)", runner.name(), batch)
with connect(self.dsn) as conn:
psycopg2.extras.register_hstore(conn)
with conn.cursor() as cur:
total_tuples = cur.scalar(runner.sql_count_objects())
LOG.debug("Total number of rows: %i", total_tuples)
conn.commit()
progress = ProgressLogger(runner.name(), total_tuples)
if total_tuples > 0:
with conn.cursor(name='places') as cur:
cur.execute(runner.sql_get_objects())
with PlaceFetcher(self.dsn, conn) as fetcher:
with WorkerPool(self.dsn, self.num_threads) as pool:
has_more = fetcher.fetch_next_batch(cur, runner)
while has_more:
places = fetcher.get_batch()
# asynchronously get the next batch
has_more = fetcher.fetch_next_batch(cur, runner)
# And insert the current batch
for idx in range(0, len(places), batch):
part = places[idx:idx + batch]
LOG.debug("Processing places: %s", str(part))
runner.index_places(pool.next_free_worker(), part)
progress.add(len(part))
LOG.info("Wait time: fetcher: %.2fs, pool: %.2fs",
fetcher.wait_time, pool.wait_time)
conn.commit()
return progress.done()
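A hedged usage sketch of the Indexer API above; `dsn` and `tokenizer` are assumed to come from the configuration and the tokenizer factory, and the thread count is illustrative:
```
indexer = Indexer(dsn, tokenizer, num_threads=4)
indexer.index_full(analyse=True)
# After applying updates, only the remaining backlog needs processing.
if indexer.has_pending():
    indexer.index_full(analyse=False)
```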

View File

@@ -0,0 +1,74 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helpers for progress logging.
"""
import logging
from datetime import datetime
LOG = logging.getLogger()
INITIAL_PROGRESS = 10
class ProgressLogger:
""" Tracks and prints progress for the indexing process.
`name` is the name of the indexing step being tracked.
`total` sets up the total number of items that need processing.
`log_interval` denotes the interval in seconds at which progress
should be reported.
"""
def __init__(self, name: str, total: int, log_interval: int = 1) -> None:
self.name = name
self.total_places = total
self.done_places = 0
self.rank_start_time = datetime.now()
self.log_interval = log_interval
self.next_info = INITIAL_PROGRESS if LOG.isEnabledFor(logging.WARNING) else total + 1
def add(self, num: int = 1) -> None:
""" Mark `num` places as processed. Print a log message if the
logging is at least info and the log interval has passed.
"""
self.done_places += num
if self.done_places < self.next_info:
return
now = datetime.now()
done_time = (now - self.rank_start_time).total_seconds()
if done_time < 2:
self.next_info = self.done_places + INITIAL_PROGRESS
return
places_per_sec = self.done_places / done_time
eta = (self.total_places - self.done_places) / places_per_sec
LOG.warning("Done %d in %d @ %.3f per second - %s ETA (seconds): %.2f",
self.done_places, int(done_time),
places_per_sec, self.name, eta)
self.next_info += int(places_per_sec) * self.log_interval
def done(self) -> int:
""" Print final statistics about the progress.
"""
rank_end_time = datetime.now()
if rank_end_time == self.rank_start_time:
diff_seconds = 0.0
places_per_sec = float(self.done_places)
else:
diff_seconds = (rank_end_time - self.rank_start_time).total_seconds()
places_per_sec = self.done_places / diff_seconds
LOG.warning("Done %d/%d in %d @ %.3f per second - FINISHED %s\n",
self.done_places, self.total_places, int(diff_seconds),
places_per_sec, self.name)
return self.done_places
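A minimal hedged sketch of the intended call pattern; the work items and processing step are invented:
```
progress = ProgressLogger('rank 26', total=50000)
for batch in work_batches:        # hypothetical iterable of place batches
    process(batch)                # hypothetical indexing step
    progress.add(len(batch))
processed = progress.done()       # prints the final statistics
```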

View File

@@ -0,0 +1,196 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Mix-ins that provide the actual commands for the indexer for various indexing
tasks.
"""
from typing import Any, List
import functools
from psycopg2 import sql as pysql
import psycopg2.extras
from nominatim_core.typing import Query, DictCursorResult, DictCursorResults, Protocol
from nominatim_core.db.async_connection import DBConnection
from ..data.place_info import PlaceInfo
from ..tokenizer.base import AbstractAnalyzer
# pylint: disable=C0111
def _mk_valuelist(template: str, num: int) -> pysql.Composed:
return pysql.SQL(',').join([pysql.SQL(template)] * num)
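# Example: _mk_valuelist("(%s, %s)", 3) composes the SQL fragment
# "(%s, %s),(%s, %s),(%s, %s)" for a multi-row VALUES clause.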
def _analyze_place(place: DictCursorResult, analyzer: AbstractAnalyzer) -> psycopg2.extras.Json:
return psycopg2.extras.Json(analyzer.process_place(PlaceInfo(place)))
class Runner(Protocol):
def name(self) -> str: ...
def sql_count_objects(self) -> Query: ...
def sql_get_objects(self) -> Query: ...
def get_place_details(self, worker: DBConnection,
ids: DictCursorResults) -> DictCursorResults: ...
def index_places(self, worker: DBConnection, places: DictCursorResults) -> None: ...
class AbstractPlacexRunner:
""" Returns SQL commands for indexing of the placex table.
"""
SELECT_SQL = pysql.SQL('SELECT place_id FROM placex ')
UPDATE_LINE = "(%s, %s::hstore, %s::hstore, %s::int, %s::jsonb)"
def __init__(self, rank: int, analyzer: AbstractAnalyzer) -> None:
self.rank = rank
self.analyzer = analyzer
@functools.lru_cache(maxsize=1)
def _index_sql(self, num_places: int) -> pysql.Composed:
return pysql.SQL(
""" UPDATE placex
SET indexed_status = 0, address = v.addr, token_info = v.ti,
name = v.name, linked_place_id = v.linked_place_id
FROM (VALUES {}) as v(id, name, addr, linked_place_id, ti)
WHERE place_id = v.id
""").format(_mk_valuelist(AbstractPlacexRunner.UPDATE_LINE, num_places))
def get_place_details(self, worker: DBConnection, ids: DictCursorResults) -> DictCursorResults:
worker.perform("""SELECT place_id, extra.*
FROM placex, LATERAL placex_indexing_prepare(placex) as extra
WHERE place_id IN %s""",
(tuple((p[0] for p in ids)), ))
return []
def index_places(self, worker: DBConnection, places: DictCursorResults) -> None:
values: List[Any] = []
for place in places:
for field in ('place_id', 'name', 'address', 'linked_place_id'):
values.append(place[field])
values.append(_analyze_place(place, self.analyzer))
worker.perform(self._index_sql(len(places)), values)
class RankRunner(AbstractPlacexRunner):
""" Returns SQL commands for indexing one rank within the placex table.
"""
def name(self) -> str:
return f"rank {self.rank}"
def sql_count_objects(self) -> pysql.Composed:
return pysql.SQL("""SELECT count(*) FROM placex
WHERE rank_address = {} and indexed_status > 0
""").format(pysql.Literal(self.rank))
def sql_get_objects(self) -> pysql.Composed:
return self.SELECT_SQL + pysql.SQL(
"""WHERE indexed_status > 0 and rank_address = {}
ORDER BY geometry_sector
""").format(pysql.Literal(self.rank))
class BoundaryRunner(AbstractPlacexRunner):
""" Returns SQL commands for indexing the administrative boundaries
of a certain rank.
"""
def name(self) -> str:
return f"boundaries rank {self.rank}"
def sql_count_objects(self) -> pysql.Composed:
return pysql.SQL("""SELECT count(*) FROM placex
WHERE indexed_status > 0
AND rank_search = {}
AND class = 'boundary' and type = 'administrative'
""").format(pysql.Literal(self.rank))
def sql_get_objects(self) -> pysql.Composed:
return self.SELECT_SQL + pysql.SQL(
"""WHERE indexed_status > 0 and rank_search = {}
and class = 'boundary' and type = 'administrative'
ORDER BY partition, admin_level
""").format(pysql.Literal(self.rank))
class InterpolationRunner:
""" Returns SQL commands for indexing the address interpolation table
location_property_osmline.
"""
def __init__(self, analyzer: AbstractAnalyzer) -> None:
self.analyzer = analyzer
def name(self) -> str:
return "interpolation lines (location_property_osmline)"
def sql_count_objects(self) -> str:
return """SELECT count(*) FROM location_property_osmline
WHERE indexed_status > 0"""
def sql_get_objects(self) -> str:
return """SELECT place_id
FROM location_property_osmline
WHERE indexed_status > 0
ORDER BY geometry_sector"""
def get_place_details(self, worker: DBConnection, ids: DictCursorResults) -> DictCursorResults:
worker.perform("""SELECT place_id, get_interpolation_address(address, osm_id) as address
FROM location_property_osmline WHERE place_id IN %s""",
(tuple((p[0] for p in ids)), ))
return []
@functools.lru_cache(maxsize=1)
def _index_sql(self, num_places: int) -> pysql.Composed:
return pysql.SQL("""UPDATE location_property_osmline
SET indexed_status = 0, address = v.addr, token_info = v.ti
FROM (VALUES {}) as v(id, addr, ti)
WHERE place_id = v.id
""").format(_mk_valuelist("(%s, %s::hstore, %s::jsonb)", num_places))
def index_places(self, worker: DBConnection, places: DictCursorResults) -> None:
values: List[Any] = []
for place in places:
values.extend((place[x] for x in ('place_id', 'address')))
values.append(_analyze_place(place, self.analyzer))
worker.perform(self._index_sql(len(places)), values)
class PostcodeRunner(Runner):
""" Provides the SQL commands for indexing the location_postcode table.
"""
def name(self) -> str:
return "postcodes (location_postcode)"
def sql_count_objects(self) -> str:
return 'SELECT count(*) FROM location_postcode WHERE indexed_status > 0'
def sql_get_objects(self) -> str:
return """SELECT place_id FROM location_postcode
WHERE indexed_status > 0
ORDER BY country_code, postcode"""
def get_place_details(self, worker: DBConnection, ids: DictCursorResults) -> DictCursorResults:
return ids
def index_places(self, worker: DBConnection, places: DictCursorResults) -> None:
worker.perform(pysql.SQL("""UPDATE location_postcode SET indexed_status = 0
WHERE place_id IN ({})""")
.format(pysql.SQL(',').join((pysql.Literal(i[0]) for i in places))))

View File

@@ -0,0 +1,253 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Abstract class definitions for tokenizers. These base classes are here
mainly for documentation purposes.
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional, Iterable
from pathlib import Path
from nominatim_core.typing import Protocol
from nominatim_core.config import Configuration
from nominatim_core.db.connection import Connection
from ..data.place_info import PlaceInfo
class AbstractAnalyzer(ABC):
""" The analyzer provides the functions for analysing names and building
the token database.
Analyzers are instantiated on a per-thread basis. Access to global data
structures must be synchronised accordingly.
"""
def __enter__(self) -> 'AbstractAnalyzer':
return self
def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
self.close()
@abstractmethod
def close(self) -> None:
""" Free all resources used by the analyzer.
"""
@abstractmethod
def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
""" Return token information for the given list of words.
The function is used for testing and debugging only
and does not need to be particularly efficient.
Arguments:
words: A list of words to look up the tokens for.
If a word starts with # it is assumed to be a full name,
otherwise it is a partial term.
Returns:
The function returns the list of all tuples that could be
found for the given words. Each list entry is a tuple of
(original word, word token, word id).
"""
@abstractmethod
def normalize_postcode(self, postcode: str) -> str:
""" Convert the postcode to its standardized form.
This function must yield exactly the same result as the SQL function
`token_normalized_postcode()`.
Arguments:
postcode: The postcode to be normalized.
Returns:
The given postcode after normalization.
"""
@abstractmethod
def update_postcodes_from_db(self) -> None:
""" Update the tokenizer's postcode tokens from the current content
of the `location_postcode` table.
"""
@abstractmethod
def update_special_phrases(self,
phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None:
""" Update the tokenizer's special phrase tokens from the given
list of special phrases.
Arguments:
phrases: The new list of special phrases. Each entry is
a tuple of (phrase, class, type, operator).
should_replace: If true, replace the current list of phrases.
When false, just add the given phrases to the
ones that already exist.
"""
@abstractmethod
def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
""" Add the given names to the tokenizer's list of country tokens.
Arguments:
country_code: two-letter country code for the country the names
refer to.
names: Dictionary of name type to name.
"""
@abstractmethod
def process_place(self, place: PlaceInfo) -> Any:
""" Extract tokens for the given place and compute the
information to be handed to the PL/pgSQL processor for building
the search index.
Arguments:
place: Place information retrieved from the database.
Returns:
A JSON-serialisable structure that will be handed into
the database via the `token_info` field.
"""
class AbstractTokenizer(ABC):
""" The tokenizer instance is the central instance of the tokenizer in
the system. There will only be a single instance of the tokenizer
active at any time.
"""
@abstractmethod
def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
""" Set up a new tokenizer for the database.
The function should copy all necessary data into the project
directory or save it in the property table to make sure that
the tokenizer remains stable over updates.
Arguments:
config: Read-only object with configuration options.
init_db: When set to False, then initialisation of database
tables should be skipped. This option is only required for
migration purposes and can be safely ignored by custom
tokenizers.
"""
@abstractmethod
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from an existing database setup.
The function should load all previously saved configuration from
the project directory and/or the property table.
Arguments:
config: Read-only object with configuration options.
"""
@abstractmethod
def finalize_import(self, config: Configuration) -> None:
""" This function is called at the very end of an import when all
data has been imported and indexed. The tokenizer may create
at this point any additional indexes and data structures needed
during query time.
Arguments:
config: Read-only object with configuration options.
"""
@abstractmethod
def update_sql_functions(self, config: Configuration) -> None:
""" Update the SQL part of the tokenizer. This function is called
automatically on migrations or may be called explicitly by the
user through the `nominatim refresh --functions` command.
The tokenizer must only update the code of the tokenizer. The
data structures or data itself must not be changed by this function.
Arguments:
config: Read-only object with configuration options.
"""
@abstractmethod
def check_database(self, config: Configuration) -> Optional[str]:
""" Check that the database is set up correctly and ready for being
queried.
Arguments:
config: Read-only object with configuration options.
Returns:
If an issue was found, return an error message with the
description of the issue as well as hints for the user on
how to resolve the issue. If everything is okay, return `None`.
"""
@abstractmethod
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
""" Recompute any tokenizer statistics necessary for efficient lookup.
This function is meant to be called from time to time by the user
to improve performance. However, the tokenizer must not depend on
it to be called in order to work.
"""
@abstractmethod
def update_word_tokens(self) -> None:
""" Do house-keeping on the tokenizers internal data structures.
Remove unused word tokens, resort data etc.
"""
@abstractmethod
def name_analyzer(self) -> AbstractAnalyzer:
""" Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should
be used accordingly:
```
with tokenizer.name_analyzer() as analyzer:
analyzer.tokenize()
```
When used outside the with construct, the caller must ensure that
close() is called before the analyzer is discarded.
"""
@abstractmethod
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
""" Return a list of the most frequent full words in the database.
Arguments:
conn: Open connection to the database which may be used to
retrieve the words.
num: Maximum number of words to return.
"""
class TokenizerModule(Protocol):
""" Interface that must be exported by modules that implement their
own tokenizer.
"""
def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
""" Factory for new tokenizers.
"""

View File

@@ -0,0 +1,102 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for creating a tokenizer or initialising the right one for an
existing database.
A tokenizer is something that is bound to the lifetime of a database. It
can be chosen and configured before the initial import but then needs to
be used consistently when querying and updating the database.
This module provides the functions to create and configure a new tokenizer
as well as instantiating the appropriate tokenizer for updating an existing
database.
A tokenizer usually also includes PHP code for querying. The appropriate PHP
normalizer module is installed when the tokenizer is created.
"""
from typing import Optional
import logging
import importlib
from pathlib import Path
from nominatim_core.errors import UsageError
from nominatim_core.db import properties
from nominatim_core.db.connection import connect
from nominatim_core.config import Configuration
from ..tokenizer.base import AbstractTokenizer, TokenizerModule
LOG = logging.getLogger()
def _import_tokenizer(name: str) -> TokenizerModule:
""" Load the tokenizer.py module from project directory.
"""
src_file = Path(__file__).parent / (name + '_tokenizer.py')
if not src_file.is_file():
LOG.fatal("No tokenizer named '%s' available. "
"Check the setting of NOMINATIM_TOKENIZER.", name)
raise UsageError('Tokenizer not found')
return importlib.import_module('nominatim_db.tokenizer.' + name + '_tokenizer')
def create_tokenizer(config: Configuration, init_db: bool = True,
module_name: Optional[str] = None) -> AbstractTokenizer:
""" Create a new tokenizer as defined by the given configuration.
The tokenizer data and code are copied into the 'tokenizer' directory
of the project directory and the tokenizer is loaded from its new location.
"""
if module_name is None:
module_name = config.TOKENIZER
# Create the directory for the tokenizer data
assert config.project_dir is not None
basedir = config.project_dir / 'tokenizer'
if not basedir.exists():
basedir.mkdir()
elif not basedir.is_dir():
LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
raise UsageError("Tokenizer setup failed.")
# Import and initialize the tokenizer.
tokenizer_module = _import_tokenizer(module_name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
tokenizer.init_new_db(config, init_db=init_db)
with connect(config.get_libpq_dsn()) as conn:
properties.set_property(conn, 'tokenizer', module_name)
return tokenizer
def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
""" Instantiate a tokenizer for an existing database.
The function looks up the appropriate tokenizer in the database
and initialises it.
"""
assert config.project_dir is not None
basedir = config.project_dir / 'tokenizer'
if not basedir.is_dir():
# Directory will be repopulated by tokenizer below.
basedir.mkdir()
with connect(config.get_libpq_dsn()) as conn:
name = properties.get_property(conn, 'tokenizer')
if name is None:
LOG.fatal("Tokenizer was not set up properly. Database property missing.")
raise UsageError('Cannot initialize tokenizer.')
tokenizer_module = _import_tokenizer(name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
tokenizer.init_from_project(config)
return tokenizer
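A hedged usage sketch of the two entry points, assuming a fully initialised Configuration object `config` and the package layout introduced by this commit:
```
from nominatim_db.tokenizer import factory as tokenizer_factory

tokenizer = tokenizer_factory.create_tokenizer(config)      # during import
# Later invocations read the stored tokenizer name from the database:
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
```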

View File

@@ -0,0 +1,196 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper class to create ICU rules from a configuration file.
"""
from typing import Mapping, Any, Dict, Optional
import io
import json
import logging
from icu import Transliterator
from nominatim_core.config import flatten_config_list, Configuration
from nominatim_core.db.properties import set_property, get_property
from nominatim_core.db.connection import Connection
from nominatim_core.errors import UsageError
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .token_analysis.base import AnalysisModule, Analyzer
from ..data import country_info
LOG = logging.getLogger()
DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
def _get_section(rules: Mapping[str, Any], section: str) -> Any:
""" Get the section named 'section' from the rules. If the section does
not exist, raise a usage error with a meaningful message.
"""
if section not in rules:
LOG.fatal("Section '%s' not found in tokenizer config.", section)
raise UsageError("Syntax error in tokenizer configuration file.")
return rules[section]
class ICURuleLoader:
""" Compiler for ICU rules from a tokenizer configuration file.
"""
def __init__(self, config: Configuration) -> None:
self.config = config
rules = config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG')
# Make sure country information is available to analyzers and sanitizers.
country_info.setup_country_config(config)
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
self.analysis_rules = _get_section(rules, 'token-analysis')
self._setup_analysis()
# Load optional sanitizer rule set.
self.sanitizer_rules = rules.get('sanitizers', [])
def load_config_from_db(self, conn: Connection) -> None:
""" Get previously saved parts of the configuration from the
database.
"""
rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
if rules is not None:
self.normalization_rules = rules
rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
if rules is not None:
self.transliteration_rules = rules
rules = get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)
if rules:
self.analysis_rules = json.loads(rules)
else:
self.analysis_rules = []
self._setup_analysis()
def save_config_to_db(self, conn: Connection) -> None:
""" Save the part of the configuration that cannot be changed into
the database.
"""
set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
def make_sanitizer(self) -> PlaceSanitizer:
""" Create a place sanitizer from the configured rules.
"""
return PlaceSanitizer(self.sanitizer_rules, self.config)
def make_token_analysis(self) -> ICUTokenAnalysis:
""" Create a token analyser from the reviouly loaded rules.
"""
return ICUTokenAnalysis(self.normalization_rules,
self.transliteration_rules, self.analysis)
def get_search_rules(self) -> str:
""" Return the ICU rules to be used during search.
The rules combine normalization and transliteration.
"""
# First apply the normalization rules.
rules = io.StringIO()
rules.write(self.normalization_rules)
# Then add transliteration.
rules.write(self.transliteration_rules)
return rules.getvalue()
def get_normalization_rules(self) -> str:
""" Return rules for normalisation of a term.
"""
return self.normalization_rules
def get_transliteration_rules(self) -> str:
""" Return the rules for converting a string into its asciii representation.
"""
return self.transliteration_rules
def _setup_analysis(self) -> None:
""" Process the rules used for creating the various token analyzers.
"""
self.analysis: Dict[Optional[str], TokenAnalyzerRule] = {}
if not isinstance(self.analysis_rules, list):
raise UsageError("Configuration section 'token-analysis' must be a list.")
norm = Transliterator.createFromRules("rule_loader_normalization",
self.normalization_rules)
trans = Transliterator.createFromRules("rule_loader_transliteration",
self.transliteration_rules)
for section in self.analysis_rules:
name = section.get('id', None)
if name in self.analysis:
if name is None:
LOG.fatal("ICU tokenizer configuration has two default token analyzers.")
else:
LOG.fatal("ICU tokenizer configuration has two token "
"analyzers with id '%s'.", name)
raise UsageError("Syntax error in ICU tokenizer config.")
self.analysis[name] = TokenAnalyzerRule(section, norm, trans,
self.config)
@staticmethod
def _cfg_to_icu_rules(rules: Mapping[str, Any], section: str) -> str:
""" Load an ICU ruleset from the given section. If the section is a
simple string, it is interpreted as a file name and the rules are
loaded verbatim from the given file. The filename is expected to be
relative to the tokenizer rule file. If the section is a list, then
each entry is assumed to be a rule. All rules are concatenated and returned.
"""
content = _get_section(rules, section)
if content is None:
return ''
return ';'.join(flatten_config_list(content, section)) + ';'
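# Illustration (not in the original file) of the list form: a section like
#   normalization:
#     - ":: lower ()"
#     - ":: NFD ()"
# yields the single ';'-separated, ';'-terminated ruleset
#   ":: lower ();:: NFD ();"
# which is the form the ICU rule compiler expects.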
class TokenAnalyzerRule:
""" Factory for a single analysis module. The class saves the configuration
and creates a new token analyzer on request.
"""
def __init__(self, rules: Mapping[str, Any],
normalizer: Any, transliterator: Any,
config: Configuration) -> None:
analyzer_name = _get_section(rules, 'analyzer')
if not analyzer_name or not isinstance(analyzer_name, str):
raise UsageError("'analyzer' parameter needs to be simple string")
self._analysis_mod: AnalysisModule = \
config.load_plugin_module(analyzer_name, 'nominatim.tokenizer.token_analysis')
self.config = self._analysis_mod.configure(rules, normalizer,
transliterator)
def create(self, normalizer: Any, transliterator: Any) -> Analyzer:
""" Create a new analyser instance for the given rule.
"""
return self._analysis_mod.create(normalizer, transliterator, self.config)
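
A usage sketch (not part of the commit) of how the loader's pieces fit
together during an import; `config` is assumed to be an initialised
Configuration and `conn` an open connection from nominatim_core.db.connection:

```
loader = ICURuleLoader(config)
sanitizer = loader.make_sanitizer()        # applies the 'sanitizers' rules
analysis = loader.make_token_analysis()    # builds the ICU transliterators
loader.save_config_to_db(conn)             # freeze the rules for this database
```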

View File

@@ -0,0 +1,43 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Container class collecting all components required to transform an OSM name
into a Nominatim token.
"""
from typing import Mapping, Optional, TYPE_CHECKING
from icu import Transliterator
from .token_analysis.base import Analyzer
if TYPE_CHECKING:
from typing import Any
from .icu_rule_loader import TokenAnalyzerRule # pylint: disable=cyclic-import
class ICUTokenAnalysis:
""" Container class collecting the transliterators and token analysis
modules for a single Analyser instance.
"""
def __init__(self, norm_rules: str, trans_rules: str,
analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule']):
self.normalizer = Transliterator.createFromRules("icu_normalization",
norm_rules)
trans_rules += ";[:Space:]+ > ' '"
self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
trans_rules)
self.search = Transliterator.createFromRules("icu_search",
norm_rules + trans_rules)
self.analysis = {name: arules.create(self.normalizer, self.to_ascii)
for name, arules in analysis_rules.items()}
def get_analyzer(self, name: Optional[str]) -> Analyzer:
""" Return the given named analyzer. If no analyzer with that
name exists, return the default analyzer.
"""
return self.analysis.get(name) or self.analysis[None]
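
For illustration only (not part of the commit), here is what the
transliterators built above do; the rule strings are stand-ins, not the
shipped Nominatim rules:

```
from icu import Transliterator

norm = Transliterator.createFromRules("demo_norm", ":: lower ();")
to_ascii = Transliterator.createFromRules(
    "demo_ascii", ":: Latin-ASCII ();[:Space:]+ > ' '")

name = "Königstraße  12"
normalized = norm.transliterate(name)        # 'königstraße  12'
print(to_ascii.transliterate(normalized))    # 'konigstrasse 12'
```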

View File

@@ -0,0 +1,952 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
Dict, Set, Iterable
import itertools
import json
import logging
from pathlib import Path
from textwrap import dedent
from nominatim_core.db.connection import connect, Connection, Cursor
from nominatim_core.config import Configuration
from nominatim_core.db.utils import CopyBuffer
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from ..data.place_name import PlaceName
from .icu_rule_loader import ICURuleLoader
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .base import AbstractAnalyzer, AbstractTokenizer
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
LOG = logging.getLogger()
WORD_TYPES = (('country_names', 'C'),
('postcodes', 'P'),
('full_word', 'W'),
('housenumbers', 'H'))
def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
""" Create a new instance of the tokenizer provided by this module.
"""
return ICUTokenizer(dsn, data_dir)
class ICUTokenizer(AbstractTokenizer):
""" This tokenizer uses libICU to convert names and queries to ASCII.
Otherwise it uses the same algorithms and data structures as the
normalization routines in Nominatim 3.
"""
def __init__(self, dsn: str, data_dir: Path) -> None:
self.dsn = dsn
self.data_dir = data_dir
self.loader: Optional[ICURuleLoader] = None
def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
""" Set up a new tokenizer for the database.
This copies all necessary data into the project directory to make
sure the tokenizer remains stable even over updates.
"""
self.loader = ICURuleLoader(config)
self._install_php(config.lib_dir.php, overwrite=True)
self._save_config()
if init_db:
self.update_sql_functions(config)
self._setup_db_tables(config)
self._create_base_indices(config, 'word')
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from the project directory.
"""
self.loader = ICURuleLoader(config)
with connect(self.dsn) as conn:
self.loader.load_config_from_db(conn)
self._install_php(config.lib_dir.php, overwrite=False)
def finalize_import(self, config: Configuration) -> None:
""" Do any required postprocessing to make the tokenizer data ready
for use.
"""
self._create_lookup_indices(config, 'word')
def update_sql_functions(self, config: Configuration) -> None:
""" Reimport the SQL functions for this tokenizer.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
def check_database(self, config: Configuration) -> None:
""" Check that the tokenizer is set up correctly.
"""
# Will throw an error if there is an issue.
self.init_from_project(config)
def update_statistics(self, config: Configuration, threads: int = 2) -> None:
""" Recompute frequencies for all name words.
"""
with connect(self.dsn) as conn:
if not conn.table_exists('search_name'):
return
with conn.cursor() as cur:
cur.execute('ANALYSE search_name')
if threads > 1:
cur.execute('SET max_parallel_workers_per_gather TO %s',
(min(threads, 6),))
if conn.server_version_tuple() < (12, 0):
LOG.info('Computing word frequencies')
cur.drop_table('word_frequencies')
cur.drop_table('addressword_frequencies')
cur.execute("""CREATE TEMP TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute('CREATE INDEX ON word_frequencies(id)')
cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
SELECT unnest(nameaddress_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute('CREATE INDEX ON addressword_frequencies(id)')
cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
INOUT info JSONB)
AS $$
DECLARE rec RECORD;
BEGIN
IF info is null THEN
info = '{}'::jsonb;
END IF;
FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
LOOP
info = info || jsonb_build_object('count', rec.count);
END LOOP;
FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
LOOP
info = info || jsonb_build_object('addr_count', rec.count);
END LOOP;
IF info = '{}'::jsonb THEN
info = null;
END IF;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
""")
LOG.info('Update word table with recomputed frequencies')
cur.drop_table('tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
word_freq_update(word_id, info) as info
FROM word
""")
cur.drop_table('word_frequencies')
cur.drop_table('addressword_frequencies')
else:
LOG.info('Computing word frequencies')
cur.drop_table('word_frequencies')
cur.execute("""
CREATE TEMP TABLE word_frequencies AS
WITH word_freq AS MATERIALIZED (
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id),
addr_freq AS MATERIALIZED (
SELECT unnest(nameaddress_vector) as id, count(*)
FROM search_name GROUP BY id)
SELECT coalesce(a.id, w.id) as id,
(CASE WHEN w.count is null THEN '{}'::JSONB
ELSE jsonb_build_object('count', w.count) END
||
CASE WHEN a.count is null THEN '{}'::JSONB
ELSE jsonb_build_object('addr_count', a.count) END) as info
FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
""")
cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
cur.execute('ANALYSE word_frequencies')
LOG.info('Update word table with recomputed frequencies')
cur.drop_table('tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
(CASE WHEN wf.info is null THEN word.info
ELSE coalesce(word.info, '{}'::jsonb) || wf.info
END) as info
FROM word LEFT JOIN word_frequencies wf
ON word.word_id = wf.id
""")
cur.drop_table('word_frequencies')
with conn.cursor() as cur:
cur.execute('SET max_parallel_workers_per_gather TO 0')
sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn,
'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
conn.commit()
self._create_base_indices(config, 'tmp_word')
self._create_lookup_indices(config, 'tmp_word')
self._move_temporary_word_table('tmp_word')
def _cleanup_housenumbers(self) -> None:
""" Remove unused house numbers.
"""
with connect(self.dsn) as conn:
if not conn.table_exists('search_name'):
return
with conn.cursor(name="hnr_counter") as cur:
cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
FROM word
WHERE type = 'H'
AND NOT EXISTS(SELECT * FROM search_name
WHERE ARRAY[word.word_id] && name_vector)
AND (char_length(coalesce(word, word_token)) > 6
OR coalesce(word, word_token) not similar to '\\d+')
""")
candidates = {token: wid for wid, token in cur}
with conn.cursor(name="hnr_counter") as cur:
cur.execute("""SELECT housenumber FROM placex
WHERE housenumber is not null
AND (char_length(housenumber) > 6
OR housenumber not similar to '\\d+')
""")
for row in cur:
for hnr in row[0].split(';'):
candidates.pop(hnr, None)
LOG.info("There are %s outdated housenumbers.", len(candidates))
LOG.debug("Outdated housenumbers: %s", candidates.keys())
if candidates:
with conn.cursor() as cur:
cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
(list(candidates.values()), ))
conn.commit()
def update_word_tokens(self) -> None:
""" Remove unused tokens.
"""
LOG.warning("Cleaning up housenumber tokens.")
self._cleanup_housenumbers()
LOG.warning("Tokenizer house-keeping done.")
def name_analyzer(self) -> 'ICUNameAnalyzer':
""" Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should
be used accordingly:
```
with tokenizer.name_analyzer() as analyzer:
analyzer.tokenize()
```
When used outside a with construct, the caller must make sure to
call the close() function before discarding the analyzer.
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
assert self.loader is not None
return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
self.loader.make_token_analysis())
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
""" Return a list of the `num` most frequent full words
in the database.
"""
with conn.cursor() as cur:
cur.execute("""SELECT word, sum((info->>'count')::int) as count
FROM word WHERE type = 'W'
GROUP BY word
ORDER BY count DESC LIMIT %s""", (num,))
return list(s[0].split('@')[0] for s in cur)
def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
""" Install the php script for the tokenizer.
"""
if phpdir is not None:
assert self.loader is not None
php_file = self.data_dir / "tokenizer.php"
if not php_file.exists() or overwrite:
php_file.write_text(dedent(f"""\
<?php
@define('CONST_Max_Word_Frequency', 10000000);
@define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
@define('CONST_Transliteration', "{self.loader.get_search_rules()}");
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
def _save_config(self) -> None:
""" Save the configuration that needs to remain stable for the given
database as database properties.
"""
assert self.loader is not None
with connect(self.dsn) as conn:
self.loader.save_config_to_db(conn)
def _setup_db_tables(self, config: Configuration) -> None:
""" Set up the word table and fill it with pre-computed word
frequencies.
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
cur.drop_table('word')
sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn, """
CREATE TABLE word (
word_id INTEGER,
word_token text NOT NULL,
type text NOT NULL,
word text,
info jsonb
) {{db.tablespace.search_data}};
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
DROP SEQUENCE IF EXISTS seq_word;
CREATE SEQUENCE seq_word start 1;
GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
""")
conn.commit()
def _create_base_indices(self, config: Configuration, table_name: str) -> None:
""" Set up the word table and fill it with pre-computed word
frequencies.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn,
"""CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
USING BTREE (word_token) {{db.tablespace.search_index}}""",
table_name=table_name)
for name, ctype in WORD_TYPES:
sqlp.run_string(conn,
"""CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
USING BTREE (word) {{db.tablespace.address_index}}
WHERE type = '{{column_type}}'
""",
table_name=table_name, idx_name=name,
column_type=ctype)
conn.commit()
def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
""" Create additional indexes used when running the API.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
# Index required for details lookup.
sqlp.run_string(conn, """
CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
""",
table_name=table_name)
conn.commit()
def _move_temporary_word_table(self, old: str) -> None:
""" Rename all tables and indexes used by the tokenizer.
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
cur.drop_table('word')
cur.execute(f"ALTER TABLE {old} RENAME TO word")
for idx in ('word_token', 'word_id'):
cur.execute(f"""ALTER INDEX idx_{old}_{idx}
RENAME TO idx_word_{idx}""")
for name, _ in WORD_TYPES:
cur.execute(f"""ALTER INDEX idx_{old}_{name}
RENAME TO idx_word_{name}""")
conn.commit()
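# Summary for orientation (illustrative, not new code): the swap at the end
# of update_statistics() amounts to
#   1. fill tmp_word with recomputed frequencies (SQL above),
#   2. build the same indices as on the live table
#      (_create_base_indices / _create_lookup_indices),
#   3. DROP TABLE word; ALTER TABLE tmp_word RENAME TO word; and rename
#      idx_tmp_word_* to idx_word_* accordingly.
# Lookups keep working on the old table until the final rename.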
class ICUNameAnalyzer(AbstractAnalyzer):
""" The ICU analyzer uses the ICU library for splitting names.
Each instance opens a connection to the database to request the
normalization.
"""
def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
token_analysis: ICUTokenAnalysis) -> None:
self.conn: Optional[Connection] = connect(dsn).connection
self.conn.autocommit = True
self.sanitizer = sanitizer
self.token_analysis = token_analysis
self._cache = _TokenCache()
def close(self) -> None:
""" Free all resources used by the analyzer.
"""
if self.conn:
self.conn.close()
self.conn = None
def _search_normalized(self, name: str) -> str:
""" Return the search token transliteration of the given name.
"""
return cast(str, self.token_analysis.search.transliterate(name)).strip()
def _normalized(self, name: str) -> str:
""" Return the normalized version of the given name with all
non-relevant information removed.
"""
return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
""" Return token information for the given list of words.
If a word starts with '#' it is assumed to be a full name,
otherwise it is taken to be a partial name.
The function returns a list of tuples with
(original word, word token, word id).
The function is used for testing and debugging only
and not necessarily efficient.
"""
assert self.conn is not None
full_tokens = {}
partial_tokens = {}
for word in words:
if word.startswith('#'):
full_tokens[word] = self._search_normalized(word[1:])
else:
partial_tokens[word] = self._search_normalized(word)
with self.conn.cursor() as cur:
cur.execute("""SELECT word_token, word_id
FROM word WHERE word_token = ANY(%s) and type = 'W'
""", (list(full_tokens.values()),))
full_ids = {r[0]: r[1] for r in cur}
cur.execute("""SELECT word_token, word_id
FROM word WHERE word_token = ANY(%s) and type = 'w'""",
(list(partial_tokens.values()),))
part_ids = {r[0]: r[1] for r in cur}
return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
+ [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
def normalize_postcode(self, postcode: str) -> str:
""" Convert the postcode to a standardized form.
This function must yield exactly the same result as the SQL function
'token_normalized_postcode()'.
"""
return postcode.strip().upper()
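# Illustration (not in the original source): both sides of the contract
# must agree, e.g. normalize_postcode(' se10 8xy ') == 'SE10 8XY'.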
def update_postcodes_from_db(self) -> None:
""" Update postcode tokens in the word table from the location_postcode
table.
"""
assert self.conn is not None
analyzer = self.token_analysis.analysis.get('@postcode')
with self.conn.cursor() as cur:
# First get all postcode names currently in the word table.
cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
word_entries = set((entry[0] for entry in cur))
# Then compute the required postcode names from the postcode table.
needed_entries = set()
cur.execute("SELECT country_code, postcode FROM location_postcode")
for cc, postcode in cur:
info = PlaceInfo({'country_code': cc,
'class': 'place', 'type': 'postcode',
'address': {'postcode': postcode}})
address = self.sanitizer.process_names(info)[1]
for place in address:
if place.kind == 'postcode':
if analyzer is None:
postcode_name = place.name.strip().upper()
variant_base = None
else:
postcode_name = analyzer.get_canonical_id(place)
variant_base = place.get_attr("variant")
if variant_base:
needed_entries.add(f'{postcode_name}@{variant_base}')
else:
needed_entries.add(postcode_name)
break
# Now update the word table.
self._delete_unused_postcode_words(word_entries - needed_entries)
self._add_missing_postcode_words(needed_entries - word_entries)
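# Note (illustrative, not in the original): entries are stored as plain
# 'NAME' or as 'NAME@VARIANT' when the analyzer supplies a variant base,
# e.g. 'EC1A 1BB@ec1a 1bb'; _add_missing_postcode_words() below splits
# on '@' to recover both parts.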
def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
assert self.conn is not None
if tokens:
with self.conn.cursor() as cur:
cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
(list(tokens), ))
def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
assert self.conn is not None
if not tokens:
return
analyzer = self.token_analysis.analysis.get('@postcode')
terms = []
for postcode_name in tokens:
if '@' in postcode_name:
term, variant = postcode_name.split('@', 2)
term = self._search_normalized(term)
if analyzer is None:
variants = [term]
else:
variants = analyzer.compute_variants(variant)
if term not in variants:
variants.append(term)
else:
variants = [self._search_normalized(postcode_name)]
terms.append((postcode_name, variants))
if terms:
with self.conn.cursor() as cur:
cur.execute_values("""SELECT create_postcode_word(pc, var)
FROM (VALUES %s) AS v(pc, var)""",
terms)
def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None:
""" Replace the search index for special phrases with the new phrases.
If `should_replace` is True, then the previous set of phrases will be
completely replaced. Otherwise the phrases are added to the
already existing ones.
"""
assert self.conn is not None
norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
# Get the old phrases.
existing_phrases = set()
cur.execute("SELECT word, info FROM word WHERE type = 'S'")
for word, info in cur:
existing_phrases.add((word, info['class'], info['type'],
info.get('op') or '-'))
added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
if should_replace:
deleted = self._remove_special_phrases(cur, norm_phrases,
existing_phrases)
else:
deleted = 0
LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
len(norm_phrases), added, deleted)
def _add_special_phrases(self, cursor: Cursor,
new_phrases: Set[Tuple[str, str, str, str]],
existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
""" Add all phrases to the database that are not yet there.
"""
to_add = new_phrases - existing_phrases
added = 0
with CopyBuffer() as copystr:
for word, cls, typ, oper in to_add:
term = self._search_normalized(word)
if term:
copystr.add(term, 'S', word,
json.dumps({'class': cls, 'type': typ,
'op': oper if oper in ('in', 'near') else None}))
added += 1
copystr.copy_out(cursor, 'word',
columns=['word_token', 'type', 'word', 'info'])
return added
def _remove_special_phrases(self, cursor: Cursor,
new_phrases: Set[Tuple[str, str, str, str]],
existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
""" Remove all phrases from the database that are no longer in the
new phrase list.
"""
to_delete = existing_phrases - new_phrases
if to_delete:
cursor.execute_values(
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
WHERE type = 'S' and word = name
and info->>'class' = in_class and info->>'type' = in_type
and ((op = '-' and info->>'op' is null) or op = info->>'op')
""", to_delete)
return len(to_delete)
def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
""" Add default names for the given country to the search index.
"""
# Make sure any name preprocessing for country names applies.
info = PlaceInfo({'name': names, 'country_code': country_code,
'rank_address': 4, 'class': 'boundary',
'type': 'administrative'})
self._add_country_full_names(country_code,
self.sanitizer.process_names(info)[0],
internal=True)
def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
internal: bool = False) -> None:
""" Add names for the given country from an already sanitized
name list.
"""
assert self.conn is not None
word_tokens = set()
for name in names:
norm_name = self._search_normalized(name.name)
if norm_name:
word_tokens.add(norm_name)
with self.conn.cursor() as cur:
# Get existing names
cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
FROM word
WHERE type = 'C' and word = %s""",
(country_code, ))
# internal/external names
existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
for word in cur:
existing_tokens[word[1]].add(word[0])
# Delete names that no longer exist.
gone_tokens = existing_tokens[internal] - word_tokens
if internal:
gone_tokens.update(existing_tokens[False] & word_tokens)
if gone_tokens:
cur.execute("""DELETE FROM word
USING unnest(%s) as token
WHERE type = 'C' and word = %s
and word_token = token""",
(list(gone_tokens), country_code))
# Only add those names that are not yet in the list.
new_tokens = word_tokens - existing_tokens[True]
if not internal:
new_tokens -= existing_tokens[False]
if new_tokens:
if internal:
sql = """INSERT INTO word (word_token, type, word, info)
(SELECT token, 'C', %s, '{"internal": "yes"}'
FROM unnest(%s) as token)
"""
else:
sql = """INSERT INTO word (word_token, type, word)
(SELECT token, 'C', %s
FROM unnest(%s) as token)
"""
cur.execute(sql, (country_code, list(new_tokens)))
def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
""" Determine tokenizer information about the given place.
Returns a JSON-serializable structure that will be handed into
the database via the token_info field.
"""
token_info = _TokenInfo()
names, address = self.sanitizer.process_names(place)
if names:
token_info.set_names(*self._compute_name_tokens(names))
if place.is_country():
assert place.country_code is not None
self._add_country_full_names(place.country_code, names)
if address:
self._process_place_address(token_info, address)
return token_info.to_dict()
def _process_place_address(self, token_info: '_TokenInfo',
address: Sequence[PlaceName]) -> None:
for item in address:
if item.kind == 'postcode':
token_info.set_postcode(self._add_postcode(item))
elif item.kind == 'housenumber':
token_info.add_housenumber(*self._compute_housenumber_token(item))
elif item.kind == 'street':
token_info.add_street(self._retrieve_full_tokens(item.name))
elif item.kind == 'place':
if not item.suffix:
token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
elif not item.kind.startswith('_') and not item.suffix and \
item.kind not in ('country', 'full', 'inclusion'):
token_info.add_address_term(item.kind,
itertools.chain(*self._compute_name_tokens([item])))
def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
""" Normalize the housenumber and return the word token and the
canonical form.
"""
assert self.conn is not None
analyzer = self.token_analysis.analysis.get('@housenumber')
result: Tuple[Optional[int], Optional[str]] = (None, None)
if analyzer is None:
# When no custom analyzer is set, simply normalize and transliterate
norm_name = self._search_normalized(hnr.name)
if norm_name:
result = self._cache.housenumbers.get(norm_name, result)
if result[0] is None:
with self.conn.cursor() as cur:
hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
result = hid, norm_name
self._cache.housenumbers[norm_name] = result
else:
# Otherwise use the analyzer to determine the canonical name.
# Per convention we use the first variant as the 'lookup name', the
# name that gets saved in the housenumber field of the place.
word_id = analyzer.get_canonical_id(hnr)
if word_id:
result = self._cache.housenumbers.get(word_id, result)
if result[0] is None:
variants = analyzer.compute_variants(word_id)
if variants:
with self.conn.cursor() as cur:
hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
(word_id, list(variants)))
result = hid, variants[0]
self._cache.housenumbers[word_id] = result
return result
def _retrieve_full_tokens(self, name: str) -> List[int]:
""" Get the full name token for the given name, if it exists.
The tokens are only retrieved for the standard analyzer.
"""
assert self.conn is not None
norm_name = self._search_normalized(name)
# return cached if possible
if norm_name in self._cache.fulls:
return self._cache.fulls[norm_name]
with self.conn.cursor() as cur:
cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
(norm_name, ))
full = [row[0] for row in cur]
self._cache.fulls[norm_name] = full
return full
def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
""" Computes the full name and partial name tokens for the given
dictionary of names.
"""
assert self.conn is not None
full_tokens: Set[int] = set()
partial_tokens: Set[int] = set()
for name in names:
analyzer_id = name.get_attr('analyzer')
analyzer = self.token_analysis.get_analyzer(analyzer_id)
word_id = analyzer.get_canonical_id(name)
if analyzer_id is None:
token_id = word_id
else:
token_id = f'{word_id}@{analyzer_id}'
full, part = self._cache.names.get(token_id, (None, None))
if full is None:
variants = analyzer.compute_variants(word_id)
if not variants:
continue
with self.conn.cursor() as cur:
cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
(token_id, variants))
full, part = cast(Tuple[int, List[int]], cur.fetchone())
self._cache.names[token_id] = (full, part)
assert part is not None
full_tokens.add(full)
partial_tokens.update(part)
return full_tokens, partial_tokens
def _add_postcode(self, item: PlaceName) -> Optional[str]:
""" Make sure the normalized postcode is present in the word table.
"""
assert self.conn is not None
analyzer = self.token_analysis.analysis.get('@postcode')
if analyzer is None:
postcode_name = item.name.strip().upper()
variant_base = None
else:
postcode_name = analyzer.get_canonical_id(item)
variant_base = item.get_attr("variant")
if variant_base:
postcode = f'{postcode_name}@{variant_base}'
else:
postcode = postcode_name
if postcode not in self._cache.postcodes:
term = self._search_normalized(postcode_name)
if not term:
return None
variants = {term}
if analyzer is not None and variant_base:
variants.update(analyzer.compute_variants(variant_base))
with self.conn.cursor() as cur:
cur.execute("SELECT create_postcode_word(%s, %s)",
(postcode, list(variants)))
self._cache.postcodes.add(postcode)
return postcode_name
class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
def __init__(self) -> None:
self.names: Optional[str] = None
self.housenumbers: Set[str] = set()
self.housenumber_tokens: Set[int] = set()
self.street_tokens: Optional[Set[int]] = None
self.place_tokens: Set[int] = set()
self.address_tokens: Dict[str, str] = {}
self.postcode: Optional[str] = None
def _mk_array(self, tokens: Iterable[Any]) -> str:
return f"{{{','.join((str(s) for s in tokens))}}}"
def to_dict(self) -> Dict[str, Any]:
""" Return the token information in database importable format.
"""
out: Dict[str, Any] = {}
if self.names:
out['names'] = self.names
if self.housenumbers:
out['hnr'] = ';'.join(self.housenumbers)
out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
if self.street_tokens is not None:
out['street'] = self._mk_array(self.street_tokens)
if self.place_tokens:
out['place'] = self._mk_array(self.place_tokens)
if self.address_tokens:
out['addr'] = self.address_tokens
if self.postcode:
out['postcode'] = self.postcode
return out
def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
""" Adds token information for the normalised names.
"""
self.names = self._mk_array(itertools.chain(fulls, partials))
def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
""" Extract housenumber information from a list of normalised
housenumbers.
"""
if token:
assert hnr is not None
self.housenumbers.add(hnr)
self.housenumber_tokens.add(token)
def add_street(self, tokens: Iterable[int]) -> None:
""" Add addr:street match terms.
"""
if self.street_tokens is None:
self.street_tokens = set()
self.street_tokens.update(tokens)
def add_place(self, tokens: Iterable[int]) -> None:
""" Add addr:place search and match terms.
"""
self.place_tokens.update(tokens)
def add_address_term(self, key: str, partials: Iterable[int]) -> None:
""" Add additional address terms.
"""
array = self._mk_array(partials)
if len(array) > 2:
self.address_tokens[key] = array
def set_postcode(self, postcode: Optional[str]) -> None:
""" Set the postcode to the given one.
"""
self.postcode = postcode
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
This cache is not thread-safe and needs to be instantiated per
analyzer.
"""
def __init__(self) -> None:
self.names: Dict[str, Tuple[int, List[int]]] = {}
self.partials: Dict[str, int] = {}
self.fulls: Dict[str, List[int]] = {}
self.postcodes: Set[str] = set()
self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}
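
An end-to-end sketch (not part of the commit) of how the pieces above are
typically used; `dsn`, `project_dir` and `config` are assumed to be set up
for an already imported database:

```
tokenizer = create(dsn, project_dir / 'tokenizer')
tokenizer.init_from_project(config)
with tokenizer.name_analyzer() as analyzer:
    info = analyzer.process_place(PlaceInfo({
        'name': {'name': 'Main Street'},
        'address': {'housenumber': '12', 'postcode': '12345'},
    }))
    # info now contains 'names', 'hnr', 'hnr_tokens', 'postcode', ... as
    # assembled by _TokenInfo.to_dict() above.
    # For debugging, a leading '#' marks a full-word lookup:
    print(analyzer.get_word_token_info(['#Main Street', 'main']))
```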

View File

@@ -0,0 +1,681 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \
cast, Dict, Set, Iterable
from collections import OrderedDict
import logging
from pathlib import Path
import re
import shutil
from textwrap import dedent
from icu import Transliterator
import psycopg2
import psycopg2.extras
from nominatim_core.errors import UsageError
from nominatim_core.db.connection import connect, Connection
from nominatim_core.config import Configuration
from nominatim_core.db import properties
from nominatim_core.db import utils as db_utils
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from .base import AbstractAnalyzer, AbstractTokenizer
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
LOG = logging.getLogger()
def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
""" Create a new instance of the tokenizer provided by this module.
"""
return LegacyTokenizer(dsn, data_dir)
def _install_module(config_module_path: str, src_dir: Path, module_dir: Path) -> str:
""" Copies the PostgreSQL normalisation module into the project
directory if necessary. For historical reasons the module is
saved in the '/module' subdirectory and not with the other tokenizer
data.
The function detects when the installation is run from the
build directory. It doesn't touch the module in that case.
"""
# Custom module locations are simply used as is.
if config_module_path:
LOG.info("Using custom path for database module at '%s'", config_module_path)
return config_module_path
# Compatibility mode for builddir installations.
if module_dir.exists() and src_dir.samefile(module_dir):
LOG.info('Running from build directory. Leaving database module as is.')
return str(module_dir)
# In any other case install the module in the project directory.
if not module_dir.exists():
module_dir.mkdir()
destfile = module_dir / 'nominatim.so'
shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
destfile.chmod(0o755)
LOG.info('Database module installed at %s', str(destfile))
return str(module_dir)
def _check_module(module_dir: str, conn: Connection) -> None:
""" Try to use the PostgreSQL module to confirm that it is correctly
installed and accessible from PostgreSQL.
"""
with conn.cursor() as cur:
try:
cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
RETURNS text AS %s, 'transliteration'
LANGUAGE c IMMUTABLE STRICT;
DROP FUNCTION nominatim_test_import_func(text)
""", (f'{module_dir}/nominatim.so', ))
except psycopg2.DatabaseError as err:
LOG.fatal("Error accessing database module: %s", err)
raise UsageError("Database module cannot be accessed.") from err
class LegacyTokenizer(AbstractTokenizer):
""" The legacy tokenizer uses a special PostgreSQL module to normalize
names and queries. The tokenizer thus implements normalization through
calls to the database.
"""
def __init__(self, dsn: str, data_dir: Path) -> None:
self.dsn = dsn
self.data_dir = data_dir
self.normalization: Optional[str] = None
def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
""" Set up a new tokenizer for the database.
This copies all necessary data into the project directory to make
sure the tokenizer remains stable even over updates.
"""
assert config.project_dir is not None
module_dir = _install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
self.normalization = config.TERM_NORMALIZATION
self._install_php(config, overwrite=True)
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn, config)
conn.commit()
if init_db:
self.update_sql_functions(config)
self._init_db_tables(config)
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from the project directory.
"""
assert config.project_dir is not None
with connect(self.dsn) as conn:
self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
if not (config.project_dir / 'module' / 'nominatim.so').exists():
_install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
self._install_php(config, overwrite=False)
def finalize_import(self, config: Configuration) -> None:
""" Do any required postprocessing to make the tokenizer data ready
for use.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
def update_sql_functions(self, config: Configuration) -> None:
""" Reimport the SQL functions for this tokenizer.
"""
assert config.project_dir is not None
with connect(self.dsn) as conn:
max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
modulepath = config.DATABASE_MODULE_PATH or \
str((config.project_dir / 'module').resolve())
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
max_word_freq=max_word_freq,
modulepath=modulepath)
def check_database(self, _: Configuration) -> Optional[str]:
""" Check that the tokenizer is set up correctly.
"""
hint = """\
The Postgresql extension nominatim.so was not correctly loaded.
Error: {error}
Hints:
* Check the output of the CMake/make installation step
* Does nominatim.so exist?
* Does nominatim.so exist on the database server?
* Can nominatim.so be accessed by the database user?
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
try:
out = cur.scalar("SELECT make_standard_name('a')")
except psycopg2.Error as err:
return hint.format(error=str(err))
if out != 'a':
return hint.format(error='Unexpected result for make_standard_name()')
return None
def migrate_database(self, config: Configuration) -> None:
""" Initialise the project directory of an existing database for
use with this tokenizer.
This is a special migration function for updating existing databases
to new software versions.
"""
assert config.project_dir is not None
self.normalization = config.TERM_NORMALIZATION
module_dir = _install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn, config)
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
""" Recompute the frequency of full words.
"""
with connect(self.dsn) as conn:
if conn.table_exists('search_name'):
with conn.cursor() as cur:
cur.drop_table("word_frequencies")
LOG.info("Computing word frequencies")
cur.execute("""CREATE TEMP TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute("CREATE INDEX ON word_frequencies(id)")
LOG.info("Update word table with recomputed frequencies")
cur.execute("""UPDATE word SET search_name_count = count
FROM word_frequencies
WHERE word_token like ' %' and word_id = id""")
cur.drop_table("word_frequencies")
conn.commit()
def update_word_tokens(self) -> None:
""" No house-keeping implemented for the legacy tokenizer.
"""
LOG.info("No tokenizer clean-up available.")
def name_analyzer(self) -> 'LegacyNameAnalyzer':
""" Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should
be used accordingly:
```
with tokenizer.name_analyzer() as analyzer:
analyzer.tokenize()
```
When used outside a with construct, the caller must make sure to
call the close() function before discarding the analyzer.
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
normalizer = Transliterator.createFromRules("phrase normalizer",
self.normalization)
return LegacyNameAnalyzer(self.dsn, normalizer)
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
""" Return a list of the `num` most frequent full words
in the database.
"""
with conn.cursor() as cur:
cur.execute(""" SELECT word FROM word WHERE word is not null
ORDER BY search_name_count DESC LIMIT %s""", (num,))
return list(s[0] for s in cur)
def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
""" Install the php script for the tokenizer.
"""
if config.lib_dir.php is not None:
php_file = self.data_dir / "tokenizer.php"
if not php_file.exists() or overwrite:
php_file.write_text(dedent(f"""\
<?php
@define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
@define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
"""), encoding='utf-8')
def _init_db_tables(self, config: Configuration) -> None:
""" Set up the word table and fill it with pre-computed word
frequencies.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
conn.commit()
LOG.warning("Precomputing word tokens")
db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
def _save_config(self, conn: Connection, config: Configuration) -> None:
""" Save the configuration that needs to remain stable for the given
database as database properties.
"""
assert self.normalization is not None
properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
class LegacyNameAnalyzer(AbstractAnalyzer):
""" The legacy analyzer uses the special Postgresql module for
splitting names.
Each instance opens a connection to the database to request the
normalization.
"""
def __init__(self, dsn: str, normalizer: Any):
self.conn: Optional[Connection] = connect(dsn).connection
self.conn.autocommit = True
self.normalizer = normalizer
psycopg2.extras.register_hstore(self.conn)
self._cache = _TokenCache(self.conn)
def close(self) -> None:
""" Free all resources used by the analyzer.
"""
if self.conn:
self.conn.close()
self.conn = None
def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
""" Return token information for the given list of words.
If a word starts with '#' it is assumed to be a full name,
otherwise it is taken to be a partial name.
The function returns a list of tuples with
(original word, word token, word id).
The function is used for testing and debugging only
and not necessarily efficient.
"""
assert self.conn is not None
with self.conn.cursor() as cur:
cur.execute("""SELECT t.term, word_token, word_id
FROM word, (SELECT unnest(%s::TEXT[]) as term) t
WHERE word_token = (CASE
WHEN left(t.term, 1) = '#' THEN
' ' || make_standard_name(substring(t.term from 2))
ELSE
make_standard_name(t.term)
END)
and class is null and country_code is null""",
(words, ))
return [(r[0], r[1], r[2]) for r in cur]
def normalize(self, phrase: str) -> str:
""" Normalize the given phrase, i.e. remove all properties that
are irrelevant for search.
"""
return cast(str, self.normalizer.transliterate(phrase))
def normalize_postcode(self, postcode: str) -> str:
""" Convert the postcode to a standardized form.
This function must yield exactly the same result as the SQL function
'token_normalized_postcode()'.
"""
return postcode.strip().upper()
def update_postcodes_from_db(self) -> None:
""" Update postcode tokens in the word table from the location_postcode
table.
"""
assert self.conn is not None
with self.conn.cursor() as cur:
# This query finds the rows in location_postcode and word that are
# missing from the other table.
cur.execute("""SELECT * FROM
(SELECT pc, word FROM
(SELECT distinct(postcode) as pc FROM location_postcode) p
FULL JOIN
(SELECT word FROM word
WHERE class ='place' and type = 'postcode') w
ON pc = word) x
WHERE pc is null or word is null""")
to_delete = []
to_add = []
for postcode, word in cur:
if postcode is None:
to_delete.append(word)
else:
to_add.append(postcode)
if to_delete:
cur.execute("""DELETE FROM WORD
WHERE class ='place' and type = 'postcode'
and word = any(%s)
""", (to_delete, ))
if to_add:
cur.execute("""SELECT count(create_postcode_id(pc))
FROM unnest(%s) as pc
""", (to_add, ))
def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None:
""" Replace the search index for special phrases with the new phrases.
"""
assert self.conn is not None
norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
# Get the old phrases.
existing_phrases = set()
cur.execute("""SELECT word, class, type, operator FROM word
WHERE class != 'place'
OR (type != 'house' AND type != 'postcode')""")
for label, cls, typ, oper in cur:
existing_phrases.add((label, cls, typ, oper or '-'))
to_add = norm_phrases - existing_phrases
to_delete = existing_phrases - norm_phrases
if to_add:
cur.execute_values(
""" INSERT INTO word (word_id, word_token, word, class, type,
search_name_count, operator)
(SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
class, type, 0,
CASE WHEN op in ('in', 'near') THEN op ELSE null END
FROM (VALUES %s) as v(name, class, type, op))""",
to_add)
if to_delete and should_replace:
cur.execute_values(
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
WHERE word = name and class = in_class and type = in_type
and ((op = '-' and operator is null) or op = operator)""",
to_delete)
LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
len(norm_phrases), len(to_add), len(to_delete))
def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
""" Add names for the given country to the search index.
"""
assert self.conn is not None
with self.conn.cursor() as cur:
cur.execute(
"""INSERT INTO word (word_id, word_token, country_code)
(SELECT nextval('seq_word'), lookup_token, %s
FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
FROM unnest(%s) n) y
WHERE NOT EXISTS(SELECT * FROM word
WHERE word_token = lookup_token and country_code = %s))
""", (country_code, list(names.values()), country_code))
def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
""" Determine tokenizer information about the given place.
Returns a JSON-serialisable structure that will be handed into
the database via the token_info field.
"""
assert self.conn is not None
token_info = _TokenInfo(self._cache)
names = place.name
if names:
token_info.add_names(self.conn, names)
if place.is_country():
assert place.country_code is not None
self.add_country_names(place.country_code, names)
address = place.address
if address:
self._process_place_address(token_info, address)
return token_info.data
def _process_place_address(self, token_info: '_TokenInfo', address: Mapping[str, str]) -> None:
assert self.conn is not None
hnrs = []
addr_terms = []
for key, value in address.items():
if key == 'postcode':
# Make sure the normalized postcode is present in the word table.
if re.search(r'[:,;]', value) is None:
norm_pc = self.normalize_postcode(value)
token_info.set_postcode(norm_pc)
self._cache.add_postcode(self.conn, norm_pc)
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(value)
elif key == 'street':
token_info.add_street(self.conn, value)
elif key == 'place':
token_info.add_place(self.conn, value)
elif not key.startswith('_') \
and key not in ('country', 'full', 'inclusion'):
addr_terms.append((key, value))
if hnrs:
token_info.add_housenumbers(self.conn, hnrs)
if addr_terms:
token_info.add_address_terms(self.conn, addr_terms)
class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
def __init__(self, cache: '_TokenCache') -> None:
self.cache = cache
self.data: Dict[str, Any] = {}
def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
""" Add token information for the names of the place.
"""
with conn.cursor() as cur:
# Create the token IDs for all names.
self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
(names, ))
def add_housenumbers(self, conn: Connection, hnrs: Sequence[str]) -> None:
""" Extract housenumber information from the address.
"""
if len(hnrs) == 1:
token = self.cache.get_housenumber(hnrs[0])
if token is not None:
self.data['hnr_tokens'] = token
self.data['hnr'] = hnrs[0]
return
# split numbers if necessary
simple_list: List[str] = []
for hnr in hnrs:
simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
if len(simple_list) > 1:
simple_list = list(set(simple_list))
with conn.cursor() as cur:
cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
result = cur.fetchone()
assert result is not None
self.data['hnr_tokens'], self.data['hnr'] = result
def set_postcode(self, postcode: str) -> None:
""" Set or replace the postcode token with the given value.
"""
self.data['postcode'] = postcode
def add_street(self, conn: Connection, street: str) -> None:
""" Add addr:street match terms.
"""
def _get_street(name: str) -> Optional[str]:
with conn.cursor() as cur:
return cast(Optional[str],
cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )))
tokens = self.cache.streets.get(street, _get_street)
self.data['street'] = tokens or '{}'
def add_place(self, conn: Connection, place: str) -> None:
""" Add addr:place search and match terms.
"""
def _get_place(name: str) -> Tuple[List[int], List[int]]:
with conn.cursor() as cur:
cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
word_ids_from_name(%s)::text""",
(name, name))
return cast(Tuple[List[int], List[int]], cur.fetchone())
self.data['place_search'], self.data['place_match'] = \
self.cache.places.get(place, _get_place)
def add_address_terms(self, conn: Connection, terms: Sequence[Tuple[str, str]]) -> None:
""" Add additional address terms.
"""
def _get_address_term(name: str) -> Tuple[List[int], List[int]]:
with conn.cursor() as cur:
cur.execute("""SELECT addr_ids_from_name(%s)::text,
word_ids_from_name(%s)::text""",
(name, name))
return cast(Tuple[List[int], List[int]], cur.fetchone())
tokens = {}
for key, value in terms:
items = self.cache.address_terms.get(value, _get_address_term)
if items[0] or items[1]:
tokens[key] = items
if tokens:
self.data['addr'] = tokens
class _LRU:
""" Least recently used cache that accepts a generator function to
produce the item when there is a cache miss.
"""
def __init__(self, maxsize: int = 128):
self.data: 'OrderedDict[str, Any]' = OrderedDict()
self.maxsize = maxsize
def get(self, key: str, generator: Callable[[str], Any]) -> Any:
""" Get the item with the given key from the cache. If nothing
is found in the cache, generate the value through the
generator function and store it in the cache.
"""
value = self.data.get(key)
if value is not None:
self.data.move_to_end(key)
else:
value = generator(key)
if len(self.data) >= self.maxsize:
self.data.popitem(last=False)
self.data[key] = value
return value
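# Illustration (not in the original file) of the cache behaviour:
#   lru = _LRU(maxsize=2)
#   lru.get('a', str.upper)   # miss -> 'A'
#   lru.get('b', str.upper)   # miss -> 'B'
#   lru.get('a', str.upper)   # hit  -> 'A', refreshed as most recent
#   lru.get('c', str.upper)   # miss -> 'C', evicts 'b'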
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
This cache is not thread-safe and needs to be instantiated per
analyzer.
"""
def __init__(self, conn: Connection):
# various LRU caches
self.streets = _LRU(maxsize=256)
self.places = _LRU(maxsize=128)
self.address_terms = _LRU(maxsize=1024)
# Look up housenumbers up to 100 and cache them
with conn.cursor() as cur:
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
FROM generate_series(1, 100) as i""")
self._cached_housenumbers: Dict[str, str] = {str(r[0]): r[1] for r in cur}
# For postcodes remember the ones that have already been added
self.postcodes: Set[str] = set()
def get_housenumber(self, number: str) -> Optional[str]:
""" Get a housenumber token from the cache.
"""
return self._cached_housenumbers.get(number)
def add_postcode(self, conn: Connection, postcode: str) -> None:
""" Make sure the given postcode is in the database.
"""
if postcode not in self.postcodes:
with conn.cursor() as cur:
cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
self.postcodes.add(postcode)

View File

@@ -0,0 +1,53 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Handler for cleaning name and address tags in place information before it
is handed to the token analysis.
"""
from typing import Optional, List, Mapping, Sequence, Callable, Any, Tuple
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from .sanitizers.config import SanitizerConfig
from .sanitizers.base import SanitizerHandler, ProcessInfo
from ..data.place_name import PlaceName
from ..data.place_info import PlaceInfo
class PlaceSanitizer:
""" Controller class which applies sanitizer functions on the place
names and address before they are used by the token analysers.
"""
def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]],
config: Configuration) -> None:
self.handlers: List[Callable[[ProcessInfo], None]] = []
if rules:
for func in rules:
if 'step' not in func:
raise UsageError("Sanitizer rule is missing the 'step' attribute.")
if not isinstance(func['step'], str):
raise UsageError("'step' attribute must be a simple string.")
module: SanitizerHandler = \
config.load_plugin_module(func['step'], 'nominatim.tokenizer.sanitizers')
self.handlers.append(module.create(SanitizerConfig(func)))
def process_names(self, place: PlaceInfo) -> Tuple[List[PlaceName], List[PlaceName]]:
""" Extract a sanitized list of names and address parts from the
given place. The function returns a tuple
(list of names, list of address names)
"""
obj = ProcessInfo(place)
for func in self.handlers:
func(obj)
return obj.names, obj.address
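
A configuration sketch (not part of the commit) showing how sanitizer rules
drive PlaceSanitizer; the step names below are assumptions for illustration,
since each must resolve to a module under nominatim.tokenizer.sanitizers:

```
rules = [
    {'step': 'clean-housenumbers', 'delimiters': ';,'},       # assumed name
    {'step': 'clean-postcodes', 'convert-to-address': True},  # assumed name
]
sanitizer = PlaceSanitizer(rules, config)          # config: a Configuration
names, address = sanitizer.process_names(place)    # place: a PlaceInfo
```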

View File

@@ -0,0 +1,64 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Common data types and protocols for sanitizers.
"""
from typing import Optional, List, Mapping, Callable
from nominatim_core.typing import Protocol, Final
from ...data.place_info import PlaceInfo
from ...data.place_name import PlaceName
from .config import SanitizerConfig
class ProcessInfo:
""" Container class for information handed into to handler functions.
The 'names' and 'address' members are mutable. A handler must change
them by either modifying the lists place or replacing the old content
with a new list.
"""
def __init__(self, place: PlaceInfo):
self.place: Final = place
self.names = self._convert_name_dict(place.name)
self.address = self._convert_name_dict(place.address)
@staticmethod
def _convert_name_dict(names: Optional[Mapping[str, str]]) -> List[PlaceName]:
""" Convert a dictionary of names into a list of PlaceNames.
The dictionary key is split into the primary part of the key
and the suffix (the part after an optional colon).
"""
out = []
if names:
for key, value in names.items():
parts = key.split(':', 1)
out.append(PlaceName(value.strip(),
parts[0].strip(),
parts[1].strip() if len(parts) > 1 else None))
return out
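# Illustration (not in the original file) of the key splitting:
#   ProcessInfo(PlaceInfo({'name': {'name': 'Paris', 'name:en': 'Paris'}}))
# produces names == [PlaceName('Paris', 'name', None),
#                    PlaceName('Paris', 'name', 'en')].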
class SanitizerHandler(Protocol):
""" Protocol for sanitizer modules.
"""
def create(self, config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
"""
Create a function for sanitizing a place.
Arguments:
config: A dictionary with the additional configuration options
specified in the tokenizer configuration
Return:
The result must be a callable that takes a place description
and transforms name and address as required.
"""

View File

@@ -0,0 +1,80 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that preprocesses address tags for house numbers. The sanitizer
makes it possible to
* define which tags are to be considered house numbers (see 'filter-kind')
* split house number lists into individual numbers (see 'delimiters')
Arguments:
delimiters: Define the set of characters to be used for
splitting a list of house numbers into parts. (default: ',;')
filter-kind: Define the address tags that are considered to be a
house number. Either takes a single string or a list of strings,
where each string is a regular expression. An address item
is considered a house number if the 'kind' fully matches any
of the given regular expressions. (default: 'housenumber')
convert-to-name: Define house numbers that should be treated as a name
instead of a house number. Either takes a single string
or a list of strings, where each string is a regular
expression that must match the full house number value.
"""
from typing import Callable, Iterator, List
from ...data.place_name import PlaceName
from .base import ProcessInfo
from .config import SanitizerConfig
class _HousenumberSanitizer:
def __init__(self, config: SanitizerConfig) -> None:
self.filter_kind = config.get_filter('filter-kind', ['housenumber'])
self.split_regexp = config.get_delimiter()
self.filter_name = config.get_filter('convert-to-name', 'FAIL_ALL')
def __call__(self, obj: ProcessInfo) -> None:
if not obj.address:
return
new_address: List[PlaceName] = []
for item in obj.address:
if self.filter_kind(item.kind):
if self.filter_name(item.name):
obj.names.append(item.clone(kind='housenumber'))
else:
new_address.extend(item.clone(kind='housenumber', name=n)
for n in self.sanitize(item.name))
else:
# Don't touch other address items.
new_address.append(item)
obj.address = new_address
def sanitize(self, value: str) -> Iterator[str]:
""" Extract housenumbers in a regularized format from an OSM value.
The function works as a generator that yields all valid housenumbers
that can be created from the value.
"""
for hnr in self.split_regexp.split(value):
if hnr:
yield from self._regularize(hnr)
def _regularize(self, hnr: str) -> Iterator[str]:
yield hnr
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a housenumber processing function.
"""
return _HousenumberSanitizer(config)
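The splitting behaviour driven by the 'delimiters' option can be sketched standalone; the regular expression below mirrors what `SanitizerConfig.get_delimiter()` compiles for the default ',;' set (the sample house number value is invented):

```python
import re

delimiters = ',;'   # the default 'delimiters' setting
split_re = re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiters)))

print(split_re.split('45; 46a ,47'))   # ['45', '46a', '47']
```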

View File

@@ -0,0 +1,80 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that filters postcodes by their officially allowed pattern.
Arguments:
convert-to-address: If set to 'yes' (the default), then postcodes that do
not conform with their country-specific pattern are
converted to an address component. That means that
the postcode does not take part when computing the
postcode centroids of a country but is still searchable.
When set to 'no', non-conforming postcodes are not
searchable either.
default-pattern: Pattern to use when none is available for the
country in question. Warning: this pattern is not used for
objects that have no country assigned. These are always
assumed to have no postcode.
"""
from typing import Callable, Optional, Tuple
from ...data.postcode_format import PostcodeFormatter
from .base import ProcessInfo
from .config import SanitizerConfig
class _PostcodeSanitizer:
def __init__(self, config: SanitizerConfig) -> None:
self.convert_to_address = config.get_bool('convert-to-address', True)
self.matcher = PostcodeFormatter()
default_pattern = config.get('default-pattern')
if default_pattern is not None and isinstance(default_pattern, str):
self.matcher.set_default_pattern(default_pattern)
def __call__(self, obj: ProcessInfo) -> None:
if not obj.address:
return
postcodes = ((i, o) for i, o in enumerate(obj.address) if o.kind == 'postcode')
for pos, postcode in postcodes:
formatted = self.scan(postcode.name, obj.place.country_code)
if formatted is None:
if self.convert_to_address:
postcode.kind = 'unofficial_postcode'
else:
obj.address.pop(pos)
else:
postcode.name = formatted[0]
postcode.set_attr('variant', formatted[1])
def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]:
""" Check the postcode for correct formatting and return the
normalized version. Returns None if the postcode does not
correspond to the official format of the given country.
"""
match = self.matcher.match(country, postcode)
if match is None:
return None
assert country is not None
return self.matcher.normalize(country, match),\
' '.join(filter(lambda p: p is not None, match.groups()))
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a function that filters postcodes by their officially allowed pattern.
"""
return _PostcodeSanitizer(config)

View File

@@ -0,0 +1,46 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that preprocesses tags from the TIGER import.
It makes the following changes:
* remove state reference from tiger:county
"""
from typing import Callable
import re
from .base import ProcessInfo
from .config import SanitizerConfig
COUNTY_MATCH = re.compile('(.*), [A-Z][A-Z]')
def _clean_tiger_county(obj: ProcessInfo) -> None:
""" Remove the state reference from tiger:county tags.
This transforms a name like 'Hamilton, AL' into 'Hamilton'.
If no state reference is detected at the end, the name is left as is.
"""
if not obj.address:
return
for item in obj.address:
if item.kind == 'tiger' and item.suffix == 'county':
m = COUNTY_MATCH.fullmatch(item.name)
if m:
item.name = m[1]
# Switch kind and suffix, the split left them reversed.
item.kind = 'county'
item.suffix = 'tiger'
return
def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a function that preprocesses tags from the TIGER import.
"""
return _clean_tiger_county
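A quick check of the matching behaviour (sample names invented): the state suffix is stripped only when the full name matches the pattern.

```python
import re

COUNTY_MATCH = re.compile('(.*), [A-Z][A-Z]')

for name in ('Hamilton, AL', 'Hamilton'):
    m = COUNTY_MATCH.fullmatch(name)
    print(name, '->', m[1] if m else name)
```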

View File

@@ -0,0 +1,151 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Configuration for Sanitizers.
"""
from typing import Sequence, Union, Optional, Pattern, Callable, Any, TYPE_CHECKING
from collections import UserDict
import re
from nominatim_core.errors import UsageError
# working around missing generics in Python < 3.8
# See https://github.com/python/typing/issues/60#issuecomment-869757075
if TYPE_CHECKING:
_BaseUserDict = UserDict[str, Any]
else:
_BaseUserDict = UserDict
class SanitizerConfig(_BaseUserDict):
""" The `SanitizerConfig` class is a read-only dictionary
with configuration options for the sanitizer.
In addition to the usual dictionary functions, the class provides
accessors to standard sanitizer options that are used by many of the
sanitizers.
"""
def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]:
""" Extract a configuration parameter as a string list.
Arguments:
param: Name of the configuration parameter.
default: Takes a tuple or list of strings which will
be returned if the parameter is missing in the
sanitizer configuration.
Note that if this default parameter is not
provided then an empty list is returned.
Returns:
If the parameter value is a simple string, it is returned as a
one-item list. If the parameter value does not exist, the given
default is returned. If the parameter value is a list, it is
checked to contain only strings before being returned.
"""
values = self.data.get(param, None)
if values is None:
return list(default)
if isinstance(values, str):
return [values] if values else []
if not isinstance(values, (list, tuple)):
raise UsageError(f"Parameter '{param}' must be string or list of strings.")
if any(not isinstance(value, str) for value in values):
raise UsageError(f"Parameter '{param}' must be string or list of strings.")
return values
def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
""" Extract a configuration parameter as a boolean.
Arguments:
param: Name of the configuration parameter. The parameter must
contain one of the YAML boolean values or a
UsageError will be raised.
default: Value to return, when the parameter is missing.
When set to `None`, the parameter must be defined.
Returns:
Boolean value of the given parameter.
"""
value = self.data.get(param, default)
if not isinstance(value, bool):
raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no').")
return value
def get_delimiter(self, default: str = ',;') -> Pattern[str]:
""" Return the 'delimiters' parameter in the configuration as a
compiled regular expression that can be used to split strings on
these delimiters.
Arguments:
default: Delimiters to be used when 'delimiters' parameter
is not explicitly configured.
Returns:
A regular expression pattern which can be used to
split a string. The regular expression makes sure that the
resulting names are stripped and that repeated delimiters
are ignored. It may still create empty fields on occasion. The
code needs to filter those.
"""
delimiter_set = set(self.data.get('delimiters', default))
if not delimiter_set:
raise UsageError("Empty 'delimiter' parameter not allowed for sanitizer.")
return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))
def get_filter(self, param: str, default: Union[str, Sequence[str]] = 'PASS_ALL'
) -> Callable[[str], bool]:
""" Returns a filter function for the given parameter of the sanitizer
configuration.
The value provided for the parameter in sanitizer configuration
should be a string or list of strings, where each string is a regular
expression. These regular expressions will later be used by the
filter function to filter strings.
Arguments:
param: The parameter for which the filter function
will be created.
default: Defines the behaviour of the filter function if the
parameter is missing in the sanitizer configuration.
Takes a string ('PASS_ALL' or 'FAIL_ALL') or a list of strings.
Any other string value or an empty list is not allowed
and will raise a ValueError. If the value is 'PASS_ALL', the
filter function lets all strings pass; if it is 'FAIL_ALL',
it lets no strings pass.
If the value is a list of strings, each string is treated
as a regular expression that the filter function matches
against.
By default, the filter function lets all strings pass.
Returns:
A filter function that takes a target string as the argument and
returns True if it fully matches any of the regular expressions,
and False otherwise.
"""
filters = self.get_string_list(param) or default
if filters == 'PASS_ALL':
return lambda _: True
if filters == 'FAIL_ALL':
return lambda _: False
if filters and isinstance(filters, (list, tuple)):
regexes = [re.compile(regex) for regex in filters]
return lambda target: any(regex.fullmatch(target) for regex in regexes)
raise ValueError("Default parameter must be a non-empty list or a string value \
('PASS_ALL' or 'FAIL_ALL').")
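The filter semantics of `get_filter()` reduce to a full-match test against any of the configured regular expressions; a minimal standalone sketch with an assumed filter list:

```python
import re

# assumed configuration: filter-kind: [housenumber, conscriptionnumber]
regexes = [re.compile(r) for r in ('housenumber', 'conscriptionnumber')]

def matches(target: str) -> bool:
    return any(rx.fullmatch(target) for rx in regexes)

print(matches('housenumber'))        # True
print(matches('housenumber:old'))    # False - a full match is required
```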

View File

@@ -0,0 +1,128 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer which prevents certain tags from getting into the search index.
It removes tags that match all of the properties given below.
Arguments:
type: Define which type of tags should be considered for removal.
There are two types of tags: 'name' and 'address'.
Takes the string 'name' or 'address'. (default: 'name')
filter-kind: Define which 'kind' of tags should be removed.
Takes a string or list of strings where each
string is a regular expression. A tag is considered
to be a candidate for removal if its 'kind' property
fully matches any of the given regular expressions.
Note that by default all 'kind' of tags are considered.
suffix: Define the 'suffix' property of the tags which should be
removed. Takes a string or list of strings where each
string is a regular expression. A tag is considered to be a
candidate for removal if its 'suffix' property fully
matches any of the given regular expressions. Note that by
default tags with any suffix value are considered including
those which don't have a suffix at all.
name: Define the 'name' property corresponding to the 'kind' property
of the tag. Takes a string or list of strings where each string
is a regular expression. A tag is considered to be a candidate
for removal if its name fully matches any of the given regular
expressions. Note that by default tags with any 'name' are
considered.
country_code: Define the country code of places whose tags should be
considered for removal. Takes a string or list of strings
where each string is a two-letter lower-case country code.
Note that by default tags of places with any country code
are considered including those which don't have a country
code at all.
rank_address: Define the address rank of places whose tags should be
considered for removal. Takes a string or list of strings
where each string is a number or a range of numbers of the
form <from>-<to>.
Note that default is '0-30', which means that tags of all
places are considered.
See https://nominatim.org/release-docs/latest/customize/Ranking/#address-rank
to learn more about address rank.
"""
from typing import Callable, List, Tuple, Sequence
from ...data.place_name import PlaceName
from .base import ProcessInfo
from .config import SanitizerConfig
class _TagSanitizer:
def __init__(self, config: SanitizerConfig) -> None:
self.type = config.get('type', 'name')
self.filter_kind = config.get_filter('filter-kind')
self.country_codes = config.get_string_list('country_code', [])
self.filter_suffix = config.get_filter('suffix')
self.filter_name = config.get_filter('name')
self.allowed_ranks = self._set_allowed_ranks(
config.get_string_list("rank_address", ["0-30"])
)
self.has_country_code = config.get('country_code', None) is not None
def __call__(self, obj: ProcessInfo) -> None:
tags = obj.names if self.type == 'name' else obj.address
if not tags \
or not self.allowed_ranks[obj.place.rank_address] \
or self.has_country_code \
and obj.place.country_code not in self.country_codes:
return
filtered_tags: List[PlaceName] = []
for tag in tags:
if not self.filter_kind(tag.kind) \
or not self.filter_suffix(tag.suffix or '') \
or not self.filter_name(tag.name):
filtered_tags.append(tag)
if self.type == 'name':
obj.names = filtered_tags
else:
obj.address = filtered_tags
def _set_allowed_ranks(self, ranks: Sequence[str]) -> Tuple[bool, ...]:
""" Returns a tuple of 31 boolean values corresponding to the
address ranks 0-30. Value at index 'i' is True if rank 'i'
is present in the ranks or lies in the range of any of the
ranks provided in the sanitizer configuration, otherwise
the value is False.
"""
allowed_ranks = [False] * 31
for rank in ranks:
intvl = [int(x) for x in rank.split('-')]
start, end = intvl[0], intvl[0] if len(intvl) == 1 else intvl[1]
for i in range(start, end + 1):
allowed_ranks[i] = True
return tuple(allowed_ranks)
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a function to process removal of certain tags.
"""
return _TagSanitizer(config)

View File

@@ -0,0 +1,39 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that splits lists of names into their components.
Arguments:
delimiters: Define the set of characters to be used for
splitting the list. (default: ',;')
"""
from typing import Callable
from .base import ProcessInfo
from .config import SanitizerConfig
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a name processing function that splits name values with
multiple values into their components.
"""
regexp = config.get_delimiter()
def _process(obj: ProcessInfo) -> None:
if not obj.names:
return
new_names = []
for name in obj.names:
split_names = regexp.split(name.name)
if len(split_names) == 1:
new_names.append(name)
else:
new_names.extend(name.clone(name=n) for n in split_names if n)
obj.names = new_names
return _process

View File

@@ -0,0 +1,34 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
This sanitizer creates additional name variants for names that have
addendums in brackets (e.g. "Halle (Saale)"). The additional variant contains
only the main name part with the bracket part removed.
"""
from typing import Callable
from .base import ProcessInfo
from .config import SanitizerConfig
def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a name processing function that creates additional name variants
for bracket addendums.
"""
def _process(obj: ProcessInfo) -> None:
""" Add variants for names that have a bracket extension.
"""
if obj.names:
new_names = []
for name in (n for n in obj.names if '(' in n.name):
new_name = name.name.split('(')[0].strip()
if new_name:
new_names.append(name.clone(name=new_name))
obj.names.extend(new_names)
return _process
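The variant rule in miniature (sample names invented): a name with a bracket addendum gains a variant without it, while a name that is nothing but a bracket remark gains none.

```python
for full in ('Halle (Saale)', '(St. Pancras)'):
    main = full.split('(')[0].strip()
    print(repr(full), '->', repr(main) if main else 'no variant added')
```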

View File

@@ -0,0 +1,99 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
This sanitizer sets the `analyzer` property depending on the
language of the tag. The language is taken from the suffix of the name.
If a name already has an analyzer tagged, then this is kept.
Arguments:
filter-kind: Restrict the sanitizer to names of the given tags.
The parameter expects a list of
regular expressions which are matched against 'kind'.
Note that a match against the full string is expected.
whitelist: Restrict the set of languages that should be tagged.
Expects a list of acceptable suffixes. When unset,
all 2- and 3-letter lower-case codes are accepted.
use-defaults: Configure what happens when the name has no suffix.
When set to 'all', a variant is created for
each of the default languages in the country
the feature is in. When set to 'mono', a variant is
only created, when exactly one language is spoken
in the country. The default is to do nothing with
the default languages of a country.
mode: Define how the variants are created and may be 'replace' or
'append'. When set to 'append' the original name (without
any analyzer tagged) is retained. (default: replace)
"""
from typing import Callable, Dict, Optional, List
from ...data import country_info
from .base import ProcessInfo
from .config import SanitizerConfig
class _AnalyzerByLanguage:
""" Processor for tagging the language of names in a place.
"""
def __init__(self, config: SanitizerConfig) -> None:
self.filter_kind = config.get_filter('filter-kind')
self.replace = config.get('mode', 'replace') != 'append'
self.whitelist = config.get('whitelist')
self._compute_default_languages(config.get('use-defaults', 'no'))
def _compute_default_languages(self, use_defaults: str) -> None:
self.deflangs: Dict[Optional[str], List[str]] = {}
if use_defaults in ('mono', 'all'):
for ccode, clangs in country_info.iterate('languages'):
if len(clangs) == 1 or use_defaults == 'all':
if self.whitelist:
self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
else:
self.deflangs[ccode] = clangs
def _suffix_matches(self, suffix: str) -> bool:
if self.whitelist is None:
return len(suffix) in (2, 3) and suffix.islower()
return suffix in self.whitelist
def __call__(self, obj: ProcessInfo) -> None:
if not obj.names:
return
more_names = []
for name in (n for n in obj.names
if not n.has_attr('analyzer') and self.filter_kind(n.kind)):
if name.suffix:
langs = [name.suffix] if self._suffix_matches(name.suffix) else None
else:
langs = self.deflangs.get(obj.place.country_code)
if langs:
if self.replace:
name.set_attr('analyzer', langs[0])
else:
more_names.append(name.clone(attr={'analyzer': langs[0]}))
more_names.extend(name.clone(attr={'analyzer': l}) for l in langs[1:])
obj.names.extend(more_names)
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a function that sets the analyzer property depending on the
language of the tag.
"""
return _AnalyzerByLanguage(config)

View File

@@ -0,0 +1,117 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
This sanitizer maps OSM data to Japanese block addresses.
It combines block_number and housenumber into a single housenumber,
and quarter and neighbourhood into a single place.
"""
from typing import Callable
from typing import List, Optional
from .base import ProcessInfo
from .config import SanitizerConfig
from ...data.place_name import PlaceName
def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
"""Set up the sanitizer
"""
return tag_japanese
def recombine_housenumber(
new_address: List[PlaceName],
tmp_housenumber: Optional[str],
tmp_blocknumber: Optional[str]
) -> List[PlaceName]:
""" Recombine the tag of housenumber by using housenumber and blocknumber
"""
if tmp_blocknumber and tmp_housenumber:
new_address.append(
PlaceName(
kind='housenumber',
name=f'{tmp_blocknumber}-{tmp_housenumber}',
suffix=''
)
)
elif tmp_blocknumber:
new_address.append(
PlaceName(
kind='housenumber',
name=tmp_blocknumber,
suffix=''
)
)
elif tmp_housenumber:
new_address.append(
PlaceName(
kind='housenumber',
name=tmp_housenumber,
suffix=''
)
)
return new_address
def recombine_place(
new_address: List[PlaceName],
tmp_neighbourhood: Optional[str],
tmp_quarter: Optional[str]
) -> List[PlaceName]:
""" Recombine the tag of place by using neighbourhood and quarter
"""
if tmp_neighbourhood and tmp_quarter:
new_address.append(
PlaceName(
kind='place',
name=f'{tmp_quarter}{tmp_neighbourhood}',
suffix=''
)
)
elif tmp_neighbourhood:
new_address.append(
PlaceName(
kind='place',
name=tmp_neighbourhood,
suffix=''
)
)
elif tmp_quarter:
new_address.append(
PlaceName(
kind='place',
name=tmp_quarter,
suffix=''
)
)
return new_address
def tag_japanese(obj: ProcessInfo) -> None:
"""Recombine kind of address
"""
if obj.place.country_code != 'jp':
return
tmp_housenumber = None
tmp_blocknumber = None
tmp_neighbourhood = None
tmp_quarter = None
new_address = []
for item in obj.address:
if item.kind == 'housenumber':
tmp_housenumber = item.name
elif item.kind == 'block_number':
tmp_blocknumber = item.name
elif item.kind == 'neighbourhood':
tmp_neighbourhood = item.name
elif item.kind == 'quarter':
tmp_quarter = item.name
else:
new_address.append(item)
new_address = recombine_housenumber(new_address, tmp_housenumber, tmp_blocknumber)
new_address = recombine_place(new_address, tmp_neighbourhood, tmp_quarter)
obj.address = [item for item in new_address if item.name is not None]
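The recombination rules in miniature, with hypothetical example values:

```python
# block_number and housenumber are joined with '-', quarter and
# neighbourhood are concatenated (values below are made up).
tmp_blocknumber, tmp_housenumber = '3', '2'
tmp_quarter, tmp_neighbourhood = '神南', '一丁目'

print('housenumber:', f'{tmp_blocknumber}-{tmp_housenumber}')   # '3-2'
print('place:', f'{tmp_quarter}{tmp_neighbourhood}')            # '神南一丁目'
```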

View File

@@ -0,0 +1,96 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Common data types and protocols for analysers.
"""
from typing import Mapping, List, Any
from nominatim_core.typing import Protocol
from ...data.place_name import PlaceName
class Analyzer(Protocol):
""" The `create()` function of an analysis module needs to return an
object that implements the following functions.
"""
def get_canonical_id(self, name: PlaceName) -> str:
""" Return the canonical form of the given name. The canonical ID must
be unique (the same ID must always yield the same variants) and
must be a form from which the variants can be derived.
Arguments:
name: Extended place name description as prepared by
the sanitizers.
Returns:
ID string with a canonical form of the name. The string may
be empty, when the analyzer cannot analyze the name at all,
for example because the character set in use does not match.
"""
def compute_variants(self, canonical_id: str) -> List[str]:
""" Compute the transliterated spelling variants for the given
canonical ID.
Arguments:
canonical_id: ID string previously computed with
`get_canonical_id()`.
Returns:
A list of possible spelling variants. All strings must have
been transformed with the global normalizer and
transliterator ICU rules. Otherwise they cannot be matched
against the input by the query frontend.
The list may be empty, when there are no useful
spelling variants. This may happen when an analyzer normally
only outputs additional variants to the canonical spelling
and no such variants exist for the given name.
"""
class AnalysisModule(Protocol):
""" The setup of the token analysis is split into two parts:
configuration and analyser factory. A token analysis module must
therefore implement the two functions described here.
"""
def configure(self, rules: Mapping[str, Any],
normalizer: Any, transliterator: Any) -> Any:
""" Prepare the configuration of the analysis module.
This function should prepare all data that can be shared
between instances of this analyser.
Arguments:
rules: A dictionary with the additional configuration options
as specified in the tokenizer configuration.
normalizer: an ICU Transliterator with the compiled
global normalization rules.
transliterator: an ICU Transliterator with the compiled
global transliteration rules.
Returns:
A data object with configuration data. This will be handed
as is into the `create()` function and may be
used freely by the analysis module as needed.
"""
def create(self, normalizer: Any, transliterator: Any, config: Any) -> Analyzer:
""" Create a new instance of the analyser.
A separate instance of the analyser is created for each thread
when used in multi-threading context.
Arguments:
normalizer: an ICU Transliterator with the compiled normalization
rules.
transliterator: an ICU Transliterator with the compiled
transliteration rules.
config: The object that was returned by the call to configure().
Returns:
A new analyzer instance. This must be an object that implements
the Analyzer protocol.
"""

View File

@@ -0,0 +1,139 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Parser for configuration for variants.
"""
from typing import Any, Iterator, Tuple, List, Optional, Set, NamedTuple
from collections import defaultdict
import itertools
import re
from nominatim_core.config import flatten_config_list
from nominatim_core.errors import UsageError
class ICUVariant(NamedTuple):
""" A single replacement rule for variant creation.
"""
source: str
replacement: str
def get_variant_config(in_rules: Any,
normalizer: Any) -> Tuple[List[Tuple[str, List[str]]], str]:
""" Convert the variant definition from the configuration into
replacement sets.
Returns a tuple containing the replacement set and the list of characters
used in the replacements.
"""
immediate = defaultdict(list)
chars: Set[str] = set()
if in_rules:
vset: Set[ICUVariant] = set()
rules = flatten_config_list(in_rules, 'variants')
vmaker = _VariantMaker(normalizer)
for section in rules:
for rule in (section.get('words') or []):
vset.update(vmaker.compute(rule))
# Intermediate reorder by source. Also compute required character set.
for variant in vset:
if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
replstr = variant.replacement[:-1]
else:
replstr = variant.replacement
immediate[variant.source].append(replstr)
chars.update(variant.source)
return list(immediate.items()), ''.join(chars)
class _VariantMaker:
""" Generator for all necessary ICUVariants from a single variant rule.
All text in rules is normalized to make sure the variants match later.
"""
def __init__(self, normalizer: Any) -> None:
self.norm = normalizer
def compute(self, rule: Any) -> Iterator[ICUVariant]:
""" Generator for all ICUVariant tuples from a single variant rule.
"""
parts = re.split(r'(\|)?([=-])>', rule)
if len(parts) != 4:
raise UsageError(f"Syntax error in variant rule: {rule}")
decompose = parts[1] is None
src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
repl_terms = (self.norm.transliterate(t).strip() for t in parts[3].split(','))
# If the source should be kept, add a 1:1 replacement
if parts[2] == '-':
for src in src_terms:
if src:
for froms, tos in _create_variants(*src, src[0], decompose):
yield ICUVariant(froms, tos)
for src, repl in itertools.product(src_terms, repl_terms):
if src and repl:
for froms, tos in _create_variants(*src, repl, decompose):
yield ICUVariant(froms, tos)
def _parse_variant_word(self, name: str) -> Optional[Tuple[str, str, str]]:
name = name.strip()
match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
if match is None or (match.group(1) == '~' and match.group(3) == '~'):
raise UsageError(f"Invalid variant word descriptor '{name}'")
norm_name = self.norm.transliterate(match.group(2)).strip()
if not norm_name:
return None
return norm_name, match.group(1), match.group(3)
_FLAG_MATCH = {'^': '^ ',
'$': ' ^',
'': ' '}
def _create_variants(src: str, preflag: str, postflag: str,
repl: str, decompose: bool) -> Iterator[Tuple[str, str]]:
if preflag == '~':
postfix = _FLAG_MATCH[postflag]
# suffix decomposition
src = src + postfix
repl = repl + postfix
yield src, repl
yield ' ' + src, ' ' + repl
if decompose:
yield src, ' ' + repl
yield ' ' + src, repl
elif postflag == '~':
# prefix decomposition
prefix = _FLAG_MATCH[preflag]
src = prefix + src
repl = prefix + repl
yield src, repl
yield src + ' ', repl + ' '
if decompose:
yield src, repl + ' '
yield src + ' ', repl
else:
prefix = _FLAG_MATCH[preflag]
postfix = _FLAG_MATCH[postflag]
yield prefix + src + postfix, prefix + repl + postfix
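How `_VariantMaker.compute()` tokenizes a rule follows directly from the split pattern; the two rules below are illustrative only:

```python
import re

for rule in ('~street => st', 'hauptstrasse |-> hauptstr'):
    parts = re.split(r'(\|)?([=-])>', rule)
    print(rule,
          '| decompose:', parts[1] is None,     # no '|' before the arrow
          '| keep original:', parts[2] == '-')  # '->' keeps a 1:1 variant
```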

View File

@@ -0,0 +1,150 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Generic processor for names that creates abbreviation variants.
"""
from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
import itertools
import datrie
from nominatim_core.errors import UsageError
from ...data.place_name import PlaceName
from .config_variants import get_variant_config
from .generic_mutation import MutationVariantGenerator
### Configuration section
def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, Any]:
""" Extract and preprocess the configuration for this module.
"""
config: Dict[str, Any] = {}
config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
normalizer)
config['variant_only'] = rules.get('mode', '') == 'variant-only'
# parse mutation rules
config['mutations'] = []
for rule in rules.get('mutations', []):
if 'pattern' not in rule:
raise UsageError("Missing field 'pattern' in mutation configuration.")
if not isinstance(rule['pattern'], str):
raise UsageError("Field 'pattern' in mutation configuration "
"must be a simple text field.")
if 'replacements' not in rule:
raise UsageError("Missing field 'replacements' in mutation configuration.")
if not isinstance(rule['replacements'], list):
raise UsageError("Field 'replacements' in mutation configuration "
"must be a list of texts.")
config['mutations'].append((rule['pattern'], rule['replacements']))
return config
### Analysis section
def create(normalizer: Any, transliterator: Any,
config: Mapping[str, Any]) -> 'GenericTokenAnalysis':
""" Create a new token analysis instance for this module.
"""
return GenericTokenAnalysis(normalizer, transliterator, config)
class GenericTokenAnalysis:
""" Collects the different transformation rules for normalisation of names
and provides the functions to apply the transformations.
"""
def __init__(self, norm: Any, to_ascii: Any, config: Mapping[str, Any]) -> None:
self.norm = norm
self.to_ascii = to_ascii
self.variant_only = config['variant_only']
# Set up datrie
if config['replacements']:
self.replacements = datrie.Trie(config['chars'])
for src, repllist in config['replacements']:
self.replacements[src] = repllist
else:
self.replacements = None
# set up mutation rules
self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
def get_canonical_id(self, name: PlaceName) -> str:
""" Return the normalized form of the name. This is the standard form
from which possible variants for the name can be derived.
"""
return cast(str, self.norm.transliterate(name.name)).strip()
def compute_variants(self, norm_name: str) -> List[str]:
""" Compute the spelling variants for the given normalized name
and transliterate the result.
"""
variants = self._generate_word_variants(norm_name)
for mutation in self.mutations:
variants = mutation.generate(variants)
return [name for name in self._transliterate_unique_list(norm_name, variants) if name]
def _transliterate_unique_list(self, norm_name: str,
iterable: Iterable[str]) -> Iterator[Optional[str]]:
seen = set()
if self.variant_only:
seen.add(norm_name)
for variant in map(str.strip, iterable):
if variant not in seen:
seen.add(variant)
yield self.to_ascii.transliterate(variant).strip()
def _generate_word_variants(self, norm_name: str) -> Iterable[str]:
baseform = '^ ' + norm_name + ' ^'
baselen = len(baseform)
partials = ['']
startpos = 0
if self.replacements is not None:
pos = 0
force_space = False
while pos < baselen:
full, repl = self.replacements.longest_prefix_item(baseform[pos:],
(None, None))
if full is not None:
done = baseform[startpos:pos]
partials = [v + done + r
for v, r in itertools.product(partials, repl)
if not force_space or r.startswith(' ')]
if len(partials) > 128:
# If too many variants are produced, they are unlikely
# to be helpful. Only use the original term.
startpos = 0
break
startpos = pos + len(full)
if full[-1] == ' ':
startpos -= 1
force_space = True
pos = startpos
else:
pos += 1
force_space = False
# No variants detected? Fast return.
if startpos == 0:
return (norm_name, )
if startpos < baselen:
return (part[1:] + baseform[startpos:-1] for part in partials)
return (part[1:-1] for part in partials)

View File

@@ -0,0 +1,57 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Creator for mutation variants for the generic token analysis.
"""
from typing import Sequence, Iterable, Iterator, Tuple
import itertools
import logging
import re
from nominatim_core.errors import UsageError
LOG = logging.getLogger()
def _zigzag(outer: Iterable[str], inner: Iterable[str]) -> Iterator[str]:
return itertools.chain.from_iterable(itertools.zip_longest(outer, inner, fillvalue=''))
class MutationVariantGenerator:
""" Generates name variants by applying a regular expression to the name
and replacing it with one or more variants. When the regular expression
matches more than once, each occurrence is replaced with all replacement
patterns.
"""
def __init__(self, pattern: str, replacements: Sequence[str]):
self.pattern = re.compile(pattern)
self.replacements = replacements
if self.pattern.groups > 0:
LOG.fatal("The mutation pattern %s contains a capturing group. "
"This is not allowed.", pattern)
raise UsageError("Bad mutation pattern in configuration.")
def generate(self, names: Iterable[str]) -> Iterator[str]:
""" Generator function for the name variants. 'names' is an iterable
over a set of names for which the variants are to be generated.
"""
for name in names:
parts = self.pattern.split(name)
if len(parts) == 1:
yield name
else:
for seps in self._fillers(len(parts)):
yield ''.join(_zigzag(parts, seps))
def _fillers(self, num_parts: int) -> Iterator[Tuple[str, ...]]:
""" Returns a generator for strings to join the given number of string
parts in all possible combinations.
"""
return itertools.product(self.replacements, repeat=num_parts - 1)
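A self-contained sketch of the expansion performed by `MutationVariantGenerator.generate()`, assuming a hypothetical mutation rule that replaces 'ä' with either itself or 'ae':

```python
import itertools
import re

pattern = re.compile('ä')        # hypothetical mutation pattern
replacements = ('ä', 'ae')

def generate(name):
    parts = pattern.split(name)
    for seps in itertools.product(replacements, repeat=len(parts) - 1):
        # interleave the fixed parts with one replacement per match
        yield ''.join(itertools.chain.from_iterable(
            itertools.zip_longest(parts, seps, fillvalue='')))

print(list(generate('bärgasse')))   # ['bärgasse', 'baergasse']
```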

View File

@@ -0,0 +1,70 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Specialized processor for housenumbers. Analyses common housenumber patterns
and creates variants for them.
"""
from typing import Any, List, cast
import re
from ...data.place_name import PlaceName
from .generic_mutation import MutationVariantGenerator
RE_NON_DIGIT = re.compile('[^0-9]')
RE_DIGIT_ALPHA = re.compile(r'(\d)\s*([^\d\s␣])')
RE_ALPHA_DIGIT = re.compile(r'([^\s\d␣])\s*(\d)')
RE_NAMED_PART = re.compile(r'[a-z]{4}')
### Configuration section
def configure(*_: Any) -> None:
""" All behaviour is currently hard-coded.
"""
return None
### Analysis section
def create(normalizer: Any, transliterator: Any, config: None) -> 'HousenumberTokenAnalysis': # pylint: disable=W0613
""" Create a new token analysis instance for this module.
"""
return HousenumberTokenAnalysis(normalizer, transliterator)
class HousenumberTokenAnalysis:
""" Detects common housenumber patterns and normalizes them.
"""
def __init__(self, norm: Any, trans: Any) -> None:
self.norm = norm
self.trans = trans
self.mutator = MutationVariantGenerator('␣', (' ', ''))
def get_canonical_id(self, name: PlaceName) -> str:
""" Return the normalized form of the housenumber.
"""
# shortcut for number-only numbers, which make up 90% of the data.
if RE_NON_DIGIT.search(name.name) is None:
return name.name
norm = cast(str, self.trans.transliterate(self.norm.transliterate(name.name)))
# If there is a significant non-numeric part, use as is.
if RE_NAMED_PART.search(norm) is None:
# Otherwise add optional spaces between digits and letters.
(norm_opt, cnt1) = RE_DIGIT_ALPHA.subn(r'\1␣\2', norm)
(norm_opt, cnt2) = RE_ALPHA_DIGIT.subn(r'\1␣\2', norm_opt)
# Avoid creating too many variants per number.
if cnt1 + cnt2 <= 4:
return norm_opt
return norm
def compute_variants(self, norm_name: str) -> List[str]:
""" Compute the spelling variants for the given normalized housenumber.
Generates variants for optional spaces (marked with '␣').
"""
return list(self.mutator.generate([norm_name]))
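Putting the pieces together for an assumed input '34 a': the canonical form marks the optional space with '␣', which the mutator later expands both ways.

```python
import re

RE_DIGIT_ALPHA = re.compile(r'(\d)\s*([^\d\s␣])')

canonical, _ = RE_DIGIT_ALPHA.subn(r'\1␣\2', '34 a')
print(canonical)                                        # '34␣a'
print([canonical.replace('␣', s) for s in (' ', '')])   # ['34 a', '34a']
```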

View File

@@ -0,0 +1,65 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Specialized processor for postcodes. Supports a 'lookup' variant of the
token, which produces variants with optional spaces.
"""
from typing import Any, List
from ...data.place_name import PlaceName
from .generic_mutation import MutationVariantGenerator
### Configuration section
def configure(*_: Any) -> None:
""" All behaviour is currently hard-coded.
"""
return None
### Analysis section
def create(normalizer: Any, transliterator: Any, config: None) -> 'PostcodeTokenAnalysis': # pylint: disable=W0613
""" Create a new token analysis instance for this module.
"""
return PostcodeTokenAnalysis(normalizer, transliterator)
class PostcodeTokenAnalysis:
""" Special normalization and variant generation for postcodes.
This analyser must not be used with anything but postcodes as
it follows some special rules: the canonical ID is the form that
is used for the output. `compute_variants` then needs to ensure that
the generated variants once more follow the standard normalization
and transliteration, so that postcodes are correctly recognised by
the search algorithm.
"""
def __init__(self, norm: Any, trans: Any) -> None:
self.norm = norm
self.trans = trans
self.mutator = MutationVariantGenerator(' ', (' ', ''))
def get_canonical_id(self, name: PlaceName) -> str:
""" Return the standard form of the postcode.
"""
return name.name.strip().upper()
def compute_variants(self, norm_name: str) -> List[str]:
""" Compute the spelling variants for the given normalized postcode.
Takes the canonical form of the postcode, normalizes it using the
standard rules and then creates variants of the result where
all spaces are optional.
"""
# Postcodes follow their own transliteration rules.
# Make sure at this point, that the terms are normalized in a way
# that they are searchable with the standard transliteration rules.
return [self.trans.transliterate(term) for term in
self.mutator.generate([self.norm.transliterate(norm_name)]) if term]
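The net effect for an assumed input 'ec1r 3hf', with simple upper-casing standing in for the real ICU normalization and transliteration rules:

```python
canonical = 'ec1r 3hf'.strip().upper()
print(canonical)                                        # 'EC1R 3HF'
# every space in the canonical form becomes optional:
print([canonical.replace(' ', s) for s in (' ', '')])   # ['EC1R 3HF', 'EC1R3HF']
```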

View File

@@ -0,0 +1,10 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module with functions for importing, updating Nominatim databases
as well as general maintenance helpers.
"""

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Function to add additional OSM data from a file or the API into the database.
"""
from typing import Any, MutableMapping
from pathlib import Path
import logging
import urllib.parse
from nominatim_core.db.connection import connect
from nominatim_core.utils.url_utils import get_url
from .exec_utils import run_osm2pgsql
LOG = logging.getLogger()
def _run_osm2pgsql(dsn: str, options: MutableMapping[str, Any]) -> None:
run_osm2pgsql(options)
# Handle deletions
with connect(dsn) as conn:
with conn.cursor() as cur:
cur.execute('SELECT flush_deleted_places()')
conn.commit()
def add_data_from_file(dsn: str, fname: str, options: MutableMapping[str, Any]) -> int:
""" Adds data from a OSM file to the database. The file may be a normal
OSM file or a diff file in all formats supported by libosmium.
"""
options['import_file'] = Path(fname)
options['append'] = True
_run_osm2pgsql(dsn, options)
# No status update. We don't know where the file came from.
return 0
def add_osm_object(dsn: str, osm_type: str, osm_id: int, use_main_api: bool,
options: MutableMapping[str, Any]) -> int:
""" Add or update a single OSM object from the latest version of the
API.
"""
if use_main_api:
base_url = f'https://www.openstreetmap.org/api/0.6/{osm_type}/{osm_id}'
if osm_type in ('way', 'relation'):
base_url += '/full'
else:
# use Overpass API
if osm_type == 'node':
data = f'node({osm_id});out meta;'
elif osm_type == 'way':
data = f'(way({osm_id});>;);out meta;'
else:
data = f'(rel(id:{osm_id});>;);out meta;'
base_url = 'https://overpass-api.de/api/interpreter?' \
+ urllib.parse.urlencode({'data': data})
options['append'] = True
options['import_data'] = get_url(base_url).encode('utf-8')
_run_osm2pgsql(dsn, options)
return 0
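For reference, the Overpass URL built for a way looks like this (the way ID is made up):

```python
import urllib.parse

data = '(way(12345);>;);out meta;'
print('https://overpass-api.de/api/interpreter?'
      + urllib.parse.urlencode({'data': data}))
```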

View File

@@ -0,0 +1,106 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for database analysis and maintenance.
"""
from typing import Optional, Tuple, Any, cast
import logging
from psycopg2.extras import Json, register_hstore
from psycopg2 import DataError
from nominatim_core.typing import DictCursorResult
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect, Cursor
from nominatim_core.errors import UsageError
from ..tokenizer import factory as tokenizer_factory
from ..data.place_info import PlaceInfo
LOG = logging.getLogger()
def _get_place_info(cursor: Cursor, osm_id: Optional[str],
place_id: Optional[int]) -> DictCursorResult:
sql = """SELECT place_id, extra.*
FROM placex, LATERAL placex_indexing_prepare(placex) as extra
"""
values: Tuple[Any, ...]
if osm_id:
osm_type = osm_id[0].upper()
if osm_type not in 'NWR' or not osm_id[1:].isdigit():
LOG.fatal('OSM ID must be of form <N|W|R><id>. Got: %s', osm_id)
raise UsageError("OSM ID parameter badly formatted")
sql += ' WHERE placex.osm_type = %s AND placex.osm_id = %s'
values = (osm_type, int(osm_id[1:]))
elif place_id is not None:
sql += ' WHERE placex.place_id = %s'
values = (place_id, )
else:
LOG.fatal("No OSM object given to index.")
raise UsageError("OSM object not found")
cursor.execute(sql + ' LIMIT 1', values)
if cursor.rowcount < 1:
LOG.fatal("OSM object %s not found in database.", osm_id)
raise UsageError("OSM object not found")
return cast(DictCursorResult, cursor.fetchone())
def analyse_indexing(config: Configuration, osm_id: Optional[str] = None,
place_id: Optional[int] = None) -> None:
""" Analyse indexing of a single Nominatim object.
"""
with connect(config.get_libpq_dsn()) as conn:
register_hstore(conn)
with conn.cursor() as cur:
place = _get_place_info(cur, osm_id, place_id)
cur.execute("update placex set indexed_status = 2 where place_id = %s",
(place['place_id'], ))
cur.execute("""SET auto_explain.log_min_duration = '0';
SET auto_explain.log_analyze = 'true';
SET auto_explain.log_nested_statements = 'true';
LOAD 'auto_explain';
SET client_min_messages = LOG;
SET log_min_messages = FATAL""")
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
with tokenizer.name_analyzer() as analyzer:
cur.execute("""UPDATE placex
SET indexed_status = 0, address = %s, token_info = %s,
name = %s, linked_place_id = %s
WHERE place_id = %s""",
(place['address'],
Json(analyzer.process_place(PlaceInfo(place))),
place['name'], place['linked_place_id'], place['place_id']))
# we do not want to keep the results
conn.rollback()
for msg in conn.notices:
print(msg)
def clean_deleted_relations(config: Configuration, age: str) -> None:
""" Clean deleted relations older than a given age
"""
with connect(config.get_libpq_dsn()) as conn:
with conn.cursor() as cur:
try:
cur.execute("""SELECT place_force_delete(p.place_id)
FROM import_polygon_delete d, placex p
WHERE p.osm_type = d.osm_type AND p.osm_id = d.osm_id
AND age(p.indexed_date) > %s::interval""",
(age, ))
except DataError as exc:
raise UsageError('Invalid PostgreSQL time interval format') from exc
conn.commit()

View File

@@ -0,0 +1,350 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Collection of functions that check if the database is complete and functional.
"""
from typing import Callable, Optional, Any, Union, Tuple, Mapping, List
from enum import Enum
from textwrap import dedent
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect, Connection
from nominatim_core.db import properties
from nominatim_core.errors import UsageError
from ..tokenizer import factory as tokenizer_factory
from . import freeze
from ..version import NOMINATIM_VERSION, parse_version
CHECKLIST = []
class CheckState(Enum):
""" Possible states of a check. FATAL stops check execution entirely.
"""
OK = 0
FAIL = 1
FATAL = 2
NOT_APPLICABLE = 3
WARN = 4
CheckResult = Union[CheckState, Tuple[CheckState, Mapping[str, Any]]]
CheckFunc = Callable[[Connection, Configuration], CheckResult]
def _check(hint: Optional[str] = None) -> Callable[[CheckFunc], CheckFunc]:
""" Decorator for checks. It adds the function to the list of
checks to execute and adds the code for printing progress messages.
"""
def decorator(func: CheckFunc) -> CheckFunc:
title = (func.__doc__ or '').split('\n', 1)[0].strip()
def run_check(conn: Connection, config: Configuration) -> CheckState:
print(title, end=' ... ')
ret = func(conn, config)
if isinstance(ret, tuple):
ret, params = ret
else:
params = {}
if ret == CheckState.OK:
print('\033[92mOK\033[0m')
elif ret == CheckState.WARN:
print('\033[93mWARNING\033[0m')
if hint:
print('')
print(dedent(hint.format(**params)))
elif ret == CheckState.NOT_APPLICABLE:
print('not applicable')
else:
print('\x1B[31mFailed\033[0m')
if hint:
print(dedent(hint.format(**params)))
return ret
CHECKLIST.append(run_check)
return run_check
return decorator
class _BadConnection:
def __init__(self, msg: str) -> None:
self.msg = msg
def close(self) -> None:
""" Dummy function to provide the implementation.
"""
def check_database(config: Configuration) -> int:
""" Run a number of checks on the database and return the status.
"""
try:
conn = connect(config.get_libpq_dsn()).connection
except UsageError as err:
conn = _BadConnection(str(err)) # type: ignore[assignment]
overall_result = 0
for check in CHECKLIST:
ret = check(conn, config)
if ret == CheckState.FATAL:
conn.close()
return 1
if ret in (CheckState.FATAL, CheckState.FAIL):
overall_result = 1
conn.close()
return overall_result
def _get_indexes(conn: Connection) -> List[str]:
indexes = ['idx_place_addressline_address_place_id',
'idx_placex_rank_search',
'idx_placex_rank_address',
'idx_placex_parent_place_id',
'idx_placex_geometry_reverse_lookuppolygon',
'idx_placex_geometry_placenode',
'idx_osmline_parent_place_id',
'idx_osmline_parent_osm_id',
'idx_postcode_id',
'idx_postcode_postcode'
]
if conn.table_exists('search_name'):
indexes.extend(('idx_search_name_nameaddress_vector',
'idx_search_name_name_vector',
'idx_search_name_centroid'))
if conn.server_version_tuple() >= (11, 0, 0):
indexes.extend(('idx_placex_housenumber',
'idx_osmline_parent_osm_id_with_hnr'))
if conn.table_exists('place'):
indexes.extend(('idx_location_area_country_place_id',
'idx_place_osm_unique',
'idx_placex_rank_address_sector',
'idx_placex_rank_boundaries_sector'))
return indexes
# CHECK FUNCTIONS
#
# Functions are executed in the order they appear here.
@_check(hint="""\
{error}
Hints:
* Is the database server started?
* Check the NOMINATIM_DATABASE_DSN variable in your local .env
* Try connecting to the database with the same settings
Project directory: {config.project_dir}
Current setting of NOMINATIM_DATABASE_DSN: {config.DATABASE_DSN}
""")
def check_connection(conn: Any, config: Configuration) -> CheckResult:
""" Checking database connection
"""
if isinstance(conn, _BadConnection):
return CheckState.FATAL, dict(error=conn.msg, config=config)
return CheckState.OK
@_check(hint="""\
Database version ({db_version}) doesn't match Nominatim version ({nom_version})
Hints:
* Are you connecting to the correct database?
{instruction}
Check the Migration chapter of the Administration Guide.
Project directory: {config.project_dir}
Current setting of NOMINATIM_DATABASE_DSN: {config.DATABASE_DSN}
""")
def check_database_version(conn: Connection, config: Configuration) -> CheckResult:
""" Checking database_version matches Nominatim software version
"""
if conn.table_exists('nominatim_properties'):
db_version_str = properties.get_property(conn, 'database_version')
else:
db_version_str = None
if db_version_str is not None:
db_version = parse_version(db_version_str)
if db_version == NOMINATIM_VERSION:
return CheckState.OK
instruction = (
'Run migrations: nominatim admin --migrate'
if db_version < NOMINATIM_VERSION
else 'You need to upgrade the Nominatim software.'
)
else:
instruction = ''
return CheckState.FATAL, dict(db_version=db_version_str,
nom_version=NOMINATIM_VERSION,
instruction=instruction,
config=config)
@_check(hint="""\
placex table not found
Hints:
* Are you connecting to the correct database?
* Did the import process finish without errors?
Project directory: {config.project_dir}
Current setting of NOMINATIM_DATABASE_DSN: {config.DATABASE_DSN}
""")
def check_placex_table(conn: Connection, config: Configuration) -> CheckResult:
""" Checking for placex table
"""
if conn.table_exists('placex'):
return CheckState.OK
return CheckState.FATAL, dict(config=config)
@_check(hint="""placex table has no data. Did the import finish successfully?""")
def check_placex_size(conn: Connection, _: Configuration) -> CheckResult:
""" Checking for placex content
"""
with conn.cursor() as cur:
cnt = cur.scalar('SELECT count(*) FROM (SELECT * FROM placex LIMIT 100) x')
return CheckState.OK if cnt > 0 else CheckState.FATAL
@_check(hint="""{msg}""")
def check_tokenizer(_: Connection, config: Configuration) -> CheckResult:
""" Checking that tokenizer works
"""
try:
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
except UsageError:
return CheckState.FAIL, dict(msg="""\
Cannot load tokenizer. Did the import finish successfully?""")
result = tokenizer.check_database(config)
if result is None:
return CheckState.OK
return CheckState.FAIL, dict(msg=result)
@_check(hint="""\
Wikipedia/Wikidata importance tables missing.
Quality of search results may be degraded. Reverse geocoding is unaffected.
See https://nominatim.org/release-docs/latest/admin/Import/#wikipediawikidata-rankings
""")
def check_existance_wikipedia(conn: Connection, _: Configuration) -> CheckResult:
""" Checking for wikipedia/wikidata data
"""
if not conn.table_exists('search_name') or not conn.table_exists('place'):
return CheckState.NOT_APPLICABLE
with conn.cursor() as cur:
if conn.table_exists('wikimedia_importance'):
cnt = cur.scalar('SELECT count(*) FROM wikimedia_importance')
else:
cnt = cur.scalar('SELECT count(*) FROM wikipedia_article')
return CheckState.WARN if cnt == 0 else CheckState.OK
@_check(hint="""\
The indexing didn't finish. {count} entries are not yet indexed.
To index the remaining entries, run: {index_cmd}
""")
def check_indexing(conn: Connection, _: Configuration) -> CheckResult:
""" Checking indexing status
"""
with conn.cursor() as cur:
cnt = cur.scalar('SELECT count(*) FROM placex WHERE indexed_status > 0')
if cnt == 0:
return CheckState.OK
if freeze.is_frozen(conn):
index_cmd="""\
Database is marked frozen, it cannot be updated.
Low counts of unindexed places are fine."""
return CheckState.WARN, dict(count=cnt, index_cmd=index_cmd)
if conn.index_exists('idx_placex_rank_search'):
# Likely just an interrupted update.
index_cmd = 'nominatim index'
else:
# Looks like the import process got interrupted.
index_cmd = 'nominatim import --continue indexing'
return CheckState.FAIL, dict(count=cnt, index_cmd=index_cmd)
@_check(hint="""\
The following indexes are missing:
{indexes}
Rerun the index creation with: nominatim import --continue db-postprocess
""")
def check_database_indexes(conn: Connection, _: Configuration) -> CheckResult:
""" Checking that database indexes are complete
"""
missing = []
for index in _get_indexes(conn):
if not conn.index_exists(index):
missing.append(index)
if missing:
return CheckState.FAIL, dict(indexes='\n '.join(missing))
return CheckState.OK
@_check(hint="""\
At least one index is invalid. That can happen, e.g. when index creation was
disrupted and later restarted. You should delete the affected indices
and recreate them.
Invalid indexes:
{indexes}
""")
def check_database_index_valid(conn: Connection, _: Configuration) -> CheckResult:
""" Checking that all database indexes are valid
"""
with conn.cursor() as cur:
cur.execute(""" SELECT relname FROM pg_class, pg_index
WHERE pg_index.indisvalid = false
AND pg_index.indexrelid = pg_class.oid""")
broken = [c[0] for c in cur]
if broken:
return CheckState.FAIL, dict(indexes='\n '.join(broken))
return CheckState.OK
@_check(hint="""\
{error}
Run TIGER import again: nominatim add-data --tiger-data <DIR>
""")
def check_tiger_table(conn: Connection, config: Configuration) -> CheckResult:
""" Checking TIGER external data table.
"""
if not config.get_bool('USE_US_TIGER_DATA'):
return CheckState.NOT_APPLICABLE
if not conn.table_exists('location_property_tiger'):
return CheckState.FAIL, dict(error='TIGER data table not found.')
with conn.cursor() as cur:
if cur.scalar('SELECT count(*) FROM location_property_tiger') == 0:
return CheckState.FAIL, dict(error='TIGER data table is empty.')
return CheckState.OK
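The `_check` decorator defined at the top of this file follows a plain registry pattern; a stripped-down, runnable sketch (all names hypothetical):

```python
CHECKS = []

def check(func):
    def run() -> bool:
        print((func.__doc__ or '').strip(), end=' ... ')
        ok = func()
        print('OK' if ok else 'Failed')
        return ok
    CHECKS.append(run)   # registration happens at decoration time
    return run

@check
def demo_check() -> bool:
    """ Checking nothing in particular """
    return True

for c in CHECKS:
    c()
```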

View File

@@ -0,0 +1,166 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Collection of host system information including software versions, memory,
storage, and database configuration.
"""
import os
import subprocess
import sys
from pathlib import Path
from typing import List, Optional, Tuple, Union
import psutil
from psycopg2.extensions import make_dsn, parse_dsn
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect
from ..version import NOMINATIM_VERSION
def convert_version(ver_tup: Tuple[int, int]) -> str:
"""converts tuple version (ver_tup) to a string representation"""
return ".".join(map(str, ver_tup))
def friendly_memory_string(mem: float) -> str:
"""Create a user friendly string for the amount of memory specified as mem"""
mem_magnitude = ("bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
mag = 0
# determine order of magnitude
while mem > 1000:
mem /= 1000
mag += 1
return f"{mem:.1f} {mem_magnitude[mag]}"
def run_command(cmd: Union[str, List[str]]) -> str:
"""Runs a command using the shell and returns the output from stdout"""
try:
if sys.version_info < (3, 7):
cap_out = subprocess.run(cmd, stdout=subprocess.PIPE, check=False)
else:
cap_out = subprocess.run(cmd, capture_output=True, check=False)
return cap_out.stdout.decode("utf-8")
except FileNotFoundError:
# non-Linux system should end up here
return f"Unknown (unable to find the '{cmd}' command)"
def os_name_info() -> str:
"""Obtain Operating System Name (and possibly the version)"""
os_info = None
# man page os-release(5) details meaning of the fields
if Path("/etc/os-release").is_file():
os_info = from_file_find_line_portion(
"/etc/os-release", "PRETTY_NAME", "=")
# alternative location
elif Path("/usr/lib/os-release").is_file():
os_info = from_file_find_line_portion(
"/usr/lib/os-release", "PRETTY_NAME", "="
)
# fallback on Python's os name
if os_info is None or os_info == "":
os_info = os.name
# if the above is insufficient, take a look at neofetch's approach to OS detection
return os_info
# Note: Intended to be used on informational files like /proc
def from_file_find_line_portion(
filename: str, start: str, sep: str, fieldnum: int = 1
) -> Optional[str]:
"""open filename, finds the line starting with the 'start' string.
Splits the line using separator and returns a "fieldnum" from the split."""
with open(filename, encoding='utf8') as file:
result = ""
for line in file:
if line.startswith(start):
result = line.split(sep)[fieldnum].strip()
return result
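# Illustrative example (file contents assumed): given the line
#   PRETTY_NAME="Ubuntu 22.04 LTS"
# in /etc/os-release, from_file_find_line_portion('/etc/os-release',
# 'PRETTY_NAME', '=') returns '"Ubuntu 22.04 LTS"'; surrounding whitespace
# is stripped but the quotes from the file are kept.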
def get_postgresql_config(version: int) -> str:
"""Retrieve postgres configuration file"""
try:
with open(f"/etc/postgresql/{version}/main/postgresql.conf", encoding='utf8') as file:
db_config = file.read()
file.close()
return db_config
except IOError:
return f"**Could not read '/etc/postgresql/{version}/main/postgresql.conf'**"
def report_system_information(config: Configuration) -> None:
"""Generate a report about the host system including software versions, memory,
storage, and database configuration."""
with connect(make_dsn(config.get_libpq_dsn(), dbname='postgres')) as conn:
postgresql_ver: str = convert_version(conn.server_version_tuple())
with conn.cursor() as cur:
num = cur.scalar("SELECT count(*) FROM pg_catalog.pg_database WHERE datname=%s",
(parse_dsn(config.get_libpq_dsn())['dbname'], ))
nominatim_db_exists = num == 1 if isinstance(num, int) else False
if nominatim_db_exists:
with connect(config.get_libpq_dsn()) as conn:
postgis_ver: str = convert_version(conn.postgis_version_tuple())
else:
postgis_ver = "Unable to connect to database"
postgresql_config: str = get_postgresql_config(int(float(postgresql_ver)))
# Note: psutil.disk_partitions() is similar to run_command("lsblk")
# Note: run_command("systemd-detect-virt") only works on Linux, on other OSes
# should give a message: "Unknown (unable to find the 'systemd-detect-virt' command)"
# Generates the Markdown report.
report = f"""
**Instructions**
Use this information in your issue report at https://github.com/osm-search/Nominatim/issues
Redirect the output to a file:
$ ./collect_os_info.py > report.md
**Software Environment:**
- Python version: {sys.version}
- Nominatim version: {NOMINATIM_VERSION!s}
- PostgreSQL version: {postgresql_ver}
- PostGIS version: {postgis_ver}
- OS: {os_name_info()}
**Hardware Configuration:**
- RAM: {friendly_memory_string(psutil.virtual_memory().total)}
- number of CPUs: {psutil.cpu_count(logical=False)}
- bare metal/AWS/other cloud service (per systemd-detect-virt(1)): {run_command("systemd-detect-virt")}
- type and size of disks:
**`df -h` - report file system disk space usage:**
```
{run_command(["df", "-h"])}
```
**lsblk - list block devices:**
```
{run_command("lsblk")}
```
**Postgresql Configuration:**
```
{postgresql_config}
```
**Notes**
Please add a note about anything above that is incorrect.
"""
print(report)

View File

@@ -0,0 +1,265 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Exporting a Nominatim database to SQLite.
"""
from typing import Set, Any
import datetime as dt
import logging
from pathlib import Path
import sqlalchemy as sa
import nominatim_api as napi
from nominatim_api.search.query_analyzer_factory import make_query_analyzer
from nominatim_core.typing import SaSelect, SaRow
from nominatim_core.db.sqlalchemy_types import Geometry, IntArray
LOG = logging.getLogger()
async def convert(project_dir: Path, outfile: Path, options: Set[str]) -> None:
""" Export an existing database to sqlite. The resulting database
will be usable against the Python frontend of Nominatim.
"""
api = napi.NominatimAPIAsync(project_dir)
try:
outapi = napi.NominatimAPIAsync(project_dir,
{'NOMINATIM_DATABASE_DSN': f"sqlite:dbname={outfile}",
'NOMINATIM_DATABASE_RW': '1'})
try:
async with api.begin() as src, outapi.begin() as dest:
writer = SqliteWriter(src, dest, options)
await writer.write()
finally:
await outapi.close()
finally:
await api.close()
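# A minimal usage sketch (paths and option set are assumptions, not part
# of this module):
#
#   import asyncio
#   from pathlib import Path
#
#   asyncio.run(convert(Path('.'), Path('nominatim.sqlite'), {'search'}))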
class SqliteWriter:
""" Worker class which creates a new SQLite database.
"""
def __init__(self, src: napi.SearchConnection,
dest: napi.SearchConnection, options: Set[str]) -> None:
self.src = src
self.dest = dest
self.options = options
async def write(self) -> None:
""" Create the database structure and copy the data from
the source database to the destination.
"""
LOG.warning('Setting up spatialite')
await self.dest.execute(sa.select(sa.func.InitSpatialMetaData(True, 'WGS84')))
await self.create_tables()
await self.copy_data()
if 'search' in self.options:
await self.create_word_table()
await self.create_indexes()
async def create_tables(self) -> None:
""" Set up the database tables.
"""
LOG.warning('Setting up tables')
if 'search' not in self.options:
self.dest.t.meta.remove(self.dest.t.search_name)
else:
await self.create_class_tables()
await self.dest.connection.run_sync(self.dest.t.meta.create_all)
# Convert all Geometry columns to Spatialite geometries
for table in self.dest.t.meta.sorted_tables:
for col in table.c:
if isinstance(col.type, Geometry):
await self.dest.execute(sa.select(
sa.func.RecoverGeometryColumn(table.name, col.name, 4326,
col.type.subtype.upper(), 'XY')))
async def create_class_tables(self) -> None:
""" Set up the table that serve class/type-specific geometries.
"""
sql = sa.text("""SELECT tablename FROM pg_tables
WHERE tablename LIKE 'place_classtype_%'""")
for res in await self.src.execute(sql):
for db in (self.src, self.dest):
sa.Table(res[0], db.t.meta,
sa.Column('place_id', sa.BigInteger),
sa.Column('centroid', Geometry))
async def create_word_table(self) -> None:
""" Create the word table.
This table needs the property information to determine the
            correct format. Therefore it needs to be created after all other
data has been copied.
"""
await make_query_analyzer(self.src)
await make_query_analyzer(self.dest)
src = self.src.t.meta.tables['word']
dest = self.dest.t.meta.tables['word']
await self.dest.connection.run_sync(dest.create)
LOG.warning("Copying word table")
async_result = await self.src.connection.stream(sa.select(src))
async for partition in async_result.partitions(10000):
data = [{k: getattr(r, k) for k in r._fields} for r in partition]
await self.dest.execute(dest.insert(), data)
await self.dest.connection.run_sync(sa.Index('idx_word_woken', dest.c.word_token).create)
async def copy_data(self) -> None:
""" Copy data for all registered tables.
"""
def _getfield(row: SaRow, key: str) -> Any:
value = getattr(row, key)
if isinstance(value, dt.datetime):
if value.tzinfo is not None:
value = value.astimezone(dt.timezone.utc)
return value
for table in self.dest.t.meta.sorted_tables:
LOG.warning("Copying '%s'", table.name)
async_result = await self.src.connection.stream(self.select_from(table.name))
async for partition in async_result.partitions(10000):
data = [{('class_' if k == 'class' else k): _getfield(r, k)
for k in r._fields}
for r in partition]
await self.dest.execute(table.insert(), data)
# Set up a minimal copy of pg_tables used to look up the class tables later.
pg_tables = sa.Table('pg_tables', self.dest.t.meta,
sa.Column('schemaname', sa.Text, default='public'),
sa.Column('tablename', sa.Text))
await self.dest.connection.run_sync(pg_tables.create)
data = [{'tablename': t} for t in self.dest.t.meta.tables]
await self.dest.execute(pg_tables.insert().values(data))
async def create_indexes(self) -> None:
""" Add indexes necessary for the frontend.
"""
# reverse place node lookup needs an extra table to simulate a
# partial index with adaptive buffering.
await self.dest.execute(sa.text(
""" CREATE TABLE placex_place_node_areas AS
SELECT place_id, ST_Expand(geometry,
14.0 * exp(-0.2 * rank_search) - 0.03) as geometry
FROM placex
WHERE rank_address between 5 and 25
and osm_type = 'N'
and linked_place_id is NULL """))
await self.dest.execute(sa.select(
sa.func.RecoverGeometryColumn('placex_place_node_areas', 'geometry',
4326, 'GEOMETRY', 'XY')))
await self.dest.execute(sa.select(sa.func.CreateSpatialIndex(
'placex_place_node_areas', 'geometry')))
# Remaining indexes.
await self.create_spatial_index('country_grid', 'geometry')
await self.create_spatial_index('placex', 'geometry')
await self.create_spatial_index('osmline', 'linegeo')
await self.create_spatial_index('tiger', 'linegeo')
await self.create_index('placex', 'place_id')
await self.create_index('placex', 'parent_place_id')
await self.create_index('placex', 'rank_address')
await self.create_index('addressline', 'place_id')
await self.create_index('postcode', 'place_id')
await self.create_index('osmline', 'place_id')
await self.create_index('tiger', 'place_id')
if 'search' in self.options:
await self.create_spatial_index('postcode', 'geometry')
await self.create_spatial_index('search_name', 'centroid')
await self.create_index('search_name', 'place_id')
await self.create_index('osmline', 'parent_place_id')
await self.create_index('tiger', 'parent_place_id')
await self.create_search_index()
for t in self.dest.t.meta.tables:
if t.startswith('place_classtype_'):
await self.dest.execute(sa.select(
sa.func.CreateSpatialIndex(t, 'centroid')))
async def create_spatial_index(self, table: str, column: str) -> None:
""" Create a spatial index on the given table and column.
"""
await self.dest.execute(sa.select(
sa.func.CreateSpatialIndex(getattr(self.dest.t, table).name, column)))
async def create_index(self, table_name: str, column: str) -> None:
""" Create a simple index on the given table and column.
"""
table = getattr(self.dest.t, table_name)
await self.dest.connection.run_sync(
sa.Index(f"idx_{table}_{column}", getattr(table.c, column)).create)
async def create_search_index(self) -> None:
""" Create the tables and indexes needed for word lookup.
"""
LOG.warning("Creating reverse search table")
rsn = sa.Table('reverse_search_name', self.dest.t.meta,
sa.Column('word', sa.Integer()),
sa.Column('column', sa.Text()),
sa.Column('places', IntArray))
await self.dest.connection.run_sync(rsn.create)
tsrc = self.src.t.search_name
for column in ('name_vector', 'nameaddress_vector'):
sql = sa.select(sa.func.unnest(getattr(tsrc.c, column)).label('word'),
sa.func.ArrayAgg(tsrc.c.place_id).label('places'))\
.group_by('word')
async_result = await self.src.connection.stream(sql)
async for partition in async_result.partitions(100):
data = []
for row in partition:
row.places.sort()
data.append({'word': row.word,
'column': column,
'places': row.places})
await self.dest.execute(rsn.insert(), data)
await self.dest.connection.run_sync(
sa.Index('idx_reverse_search_name_word', rsn.c.word).create)
def select_from(self, table: str) -> SaSelect:
""" Create the SQL statement to select the source columns and rows.
"""
columns = self.src.t.meta.tables[table].c
if table == 'placex':
# SQLite struggles with Geometries that are larger than 5MB,
# so simplify those.
return sa.select(*(c for c in columns if not isinstance(c.type, Geometry)),
sa.func.ST_AsText(columns.centroid).label('centroid'),
sa.func.ST_AsText(
sa.case((sa.func.ST_MemSize(columns.geometry) < 5000000,
columns.geometry),
else_=sa.func.ST_SimplifyPreserveTopology(
columns.geometry, 0.0001)
)).label('geometry'))
sql = sa.select(*(sa.func.ST_AsText(c).label(c.name)
if isinstance(c.type, Geometry) else c for c in columns))
return sql

View File

@@ -0,0 +1,272 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for setting up and importing a new Nominatim database.
"""
from typing import Tuple, Optional, Union, Sequence, MutableMapping, Any
import logging
import os
import selectors
import subprocess
from pathlib import Path
import psutil
from psycopg2 import sql as pysql
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect, get_pg_env, Connection
from nominatim_core.db.async_connection import DBConnection
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from .exec_utils import run_osm2pgsql
from ..version import POSTGRESQL_REQUIRED_VERSION, POSTGIS_REQUIRED_VERSION
LOG = logging.getLogger()
def _require_version(module: str, actual: Tuple[int, int], expected: Tuple[int, int]) -> None:
""" Compares the version for the given module and raises an exception
if the actual version is too old.
"""
if actual < expected:
LOG.fatal('Minimum supported version of %s is %d.%d. '
'Found version %d.%d.',
module, expected[0], expected[1], actual[0], actual[1])
raise UsageError(f'{module} is too old.')
def _require_loaded(extension_name: str, conn: Connection) -> None:
""" Check that the given extension is loaded. """
if not conn.extension_loaded(extension_name):
LOG.fatal('Required module %s is not loaded.', extension_name)
raise UsageError(f'{extension_name} is not loaded.')
def check_existing_database_plugins(dsn: str) -> None:
""" Check that the database has the required plugins installed."""
with connect(dsn) as conn:
_require_version('PostgreSQL server',
conn.server_version_tuple(),
POSTGRESQL_REQUIRED_VERSION)
_require_version('PostGIS',
conn.postgis_version_tuple(),
POSTGIS_REQUIRED_VERSION)
_require_loaded('hstore', conn)
def setup_database_skeleton(dsn: str, rouser: Optional[str] = None) -> None:
""" Create a new database for Nominatim and populate it with the
essential extensions.
        The function fails when the database already exists or the
        PostgreSQL or PostGIS version is too old.
Uses `createdb` to create the database.
If 'rouser' is given, then the function also checks that the user
with that given name exists.
        The caller must have superuser rights.
"""
proc = subprocess.run(['createdb'], env=get_pg_env(dsn), check=False)
if proc.returncode != 0:
raise UsageError('Creating new database failed.')
with connect(dsn) as conn:
_require_version('PostgreSQL server',
conn.server_version_tuple(),
POSTGRESQL_REQUIRED_VERSION)
if rouser is not None:
with conn.cursor() as cur:
cnt = cur.scalar('SELECT count(*) FROM pg_user where usename = %s',
(rouser, ))
if cnt == 0:
LOG.fatal("Web user '%s' does not exist. Create it with:\n"
"\n createuser %s", rouser, rouser)
raise UsageError('Missing read-only user.')
# Create extensions.
with conn.cursor() as cur:
cur.execute('CREATE EXTENSION IF NOT EXISTS hstore')
cur.execute('CREATE EXTENSION IF NOT EXISTS postgis')
postgis_version = conn.postgis_version_tuple()
if postgis_version[0] >= 3:
cur.execute('CREATE EXTENSION IF NOT EXISTS postgis_raster')
conn.commit()
_require_version('PostGIS',
conn.postgis_version_tuple(),
POSTGIS_REQUIRED_VERSION)
def import_osm_data(osm_files: Union[Path, Sequence[Path]],
options: MutableMapping[str, Any],
drop: bool = False, ignore_errors: bool = False) -> None:
""" Import the given OSM files. 'options' contains the list of
default settings for osm2pgsql.
"""
options['import_file'] = osm_files
options['append'] = False
options['threads'] = 1
if not options['flatnode_file'] and options['osm2pgsql_cache'] == 0:
# Make some educated guesses about cache size based on the size
# of the import file and the available memory.
mem = psutil.virtual_memory()
fsize = 0
if isinstance(osm_files, list):
for fname in osm_files:
fsize += os.stat(str(fname)).st_size
else:
fsize = os.stat(str(osm_files)).st_size
options['osm2pgsql_cache'] = int(min((mem.available + mem.cached) * 0.75,
fsize * 2) / 1024 / 1024) + 1
run_osm2pgsql(options)
with connect(options['dsn']) as conn:
if not ignore_errors:
with conn.cursor() as cur:
cur.execute('SELECT * FROM place LIMIT 1')
if cur.rowcount == 0:
raise UsageError('No data imported by osm2pgsql.')
if drop:
conn.drop_table('planet_osm_nodes')
if drop and options['flatnode_file']:
Path(options['flatnode_file']).unlink()
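# Illustrative numbers for the cache heuristic above (machine specs are an
# assumption): importing a 10 GB PBF with 32 GB of free memory gives
# osm2pgsql_cache = min(0.75 * 32 GB, 2 * 10 GB) = 20 GB, which is passed
# to osm2pgsql as a megabyte value.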
def create_tables(conn: Connection, config: Configuration, reverse_only: bool = False) -> None:
""" Create the set of basic tables.
When `reverse_only` is True, then the main table for searching will
be skipped and only reverse search is possible.
"""
sql = SQLPreprocessor(conn, config)
sql.env.globals['db']['reverse_only'] = reverse_only
sql.run_sql_file(conn, 'tables.sql')
def create_table_triggers(conn: Connection, config: Configuration) -> None:
""" Create the triggers for the tables. The trigger functions must already
have been imported with refresh.create_functions().
"""
sql = SQLPreprocessor(conn, config)
sql.run_sql_file(conn, 'table-triggers.sql')
def create_partition_tables(conn: Connection, config: Configuration) -> None:
""" Create tables that have explicit partitioning.
"""
sql = SQLPreprocessor(conn, config)
sql.run_sql_file(conn, 'partition-tables.src.sql')
def truncate_data_tables(conn: Connection) -> None:
""" Truncate all data tables to prepare for a fresh load.
"""
with conn.cursor() as cur:
cur.execute('TRUNCATE placex')
cur.execute('TRUNCATE place_addressline')
cur.execute('TRUNCATE location_area')
cur.execute('TRUNCATE location_area_country')
cur.execute('TRUNCATE location_property_tiger')
cur.execute('TRUNCATE location_property_osmline')
cur.execute('TRUNCATE location_postcode')
if conn.table_exists('search_name'):
cur.execute('TRUNCATE search_name')
cur.execute('DROP SEQUENCE IF EXISTS seq_place')
cur.execute('CREATE SEQUENCE seq_place start 100000')
cur.execute("""SELECT tablename FROM pg_tables
WHERE tablename LIKE 'location_road_%'""")
for table in [r[0] for r in list(cur)]:
cur.execute('TRUNCATE ' + table)
conn.commit()
_COPY_COLUMNS = pysql.SQL(',').join(map(pysql.Identifier,
('osm_type', 'osm_id', 'class', 'type',
'name', 'admin_level', 'address',
'extratags', 'geometry')))
def load_data(dsn: str, threads: int) -> None:
""" Copy data into the word and placex table.
"""
sel = selectors.DefaultSelector()
    # Copy data from place to placex in <threads - 1> chunks.
place_threads = max(1, threads - 1)
for imod in range(place_threads):
conn = DBConnection(dsn)
conn.connect()
conn.perform(
pysql.SQL("""INSERT INTO placex ({columns})
SELECT {columns} FROM place
WHERE osm_id % {total} = {mod}
AND NOT (class='place' and (type='houses' or type='postcode'))
AND ST_IsValid(geometry)
""").format(columns=_COPY_COLUMNS,
total=pysql.Literal(place_threads),
mod=pysql.Literal(imod)))
sel.register(conn, selectors.EVENT_READ, conn)
# Address interpolations go into another table.
conn = DBConnection(dsn)
conn.connect()
conn.perform("""INSERT INTO location_property_osmline (osm_id, address, linegeo)
SELECT osm_id, address, geometry FROM place
WHERE class='place' and type='houses' and osm_type='W'
and ST_GeometryType(geometry) = 'ST_LineString'
""")
sel.register(conn, selectors.EVENT_READ, conn)
# Now wait for all of them to finish.
todo = place_threads + 1
while todo > 0:
for key, _ in sel.select(1):
conn = key.data
sel.unregister(conn)
conn.wait()
conn.close()
todo -= 1
print('.', end='', flush=True)
print('\n')
with connect(dsn) as syn_conn:
with syn_conn.cursor() as cur:
cur.execute('ANALYSE')
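# Illustrative partitioning (threads=4, so place_threads=3): the three
# connections copy the disjoint slices "osm_id % 3 = 0/1/2" of the place
# table concurrently while the selector loop above waits for completion.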
def create_search_indices(conn: Connection, config: Configuration,
drop: bool = False, threads: int = 1) -> None:
""" Create tables that have explicit partitioning.
"""
# If index creation failed and left an index invalid, they need to be
# cleaned out first, so that the script recreates them.
with conn.cursor() as cur:
cur.execute("""SELECT relname FROM pg_class, pg_index
WHERE pg_index.indisvalid = false
AND pg_index.indexrelid = pg_class.oid""")
bad_indices = [row[0] for row in list(cur)]
for idx in bad_indices:
LOG.info("Drop invalid index %s.", idx)
cur.execute(pysql.SQL('DROP INDEX {}').format(pysql.Identifier(idx)))
conn.commit()
sql = SQLPreprocessor(conn, config)
sql.run_parallel_sql_file(config.get_libpq_dsn(),
'indices.sql', min(8, threads), drop=drop)

View File

@@ -0,0 +1,84 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper functions for executing external programs.
"""
from typing import Any, Mapping
import logging
import os
import subprocess
import shutil
from nominatim_core.typing import StrPath
from nominatim_core.db.connection import get_pg_env
LOG = logging.getLogger()
def run_php_server(server_address: str, base_dir: StrPath) -> None:
""" Run the built-in server from the given directory.
"""
subprocess.run(['/usr/bin/env', 'php', '-S', server_address],
cwd=str(base_dir), check=True)
def run_osm2pgsql(options: Mapping[str, Any]) -> None:
""" Run osm2pgsql with the given options.
"""
env = get_pg_env(options['dsn'])
osm2pgsql_cmd = options['osm2pgsql']
if osm2pgsql_cmd is None:
osm2pgsql_cmd = shutil.which('osm2pgsql')
if osm2pgsql_cmd is None:
raise RuntimeError('osm2pgsql executable not found. Please install osm2pgsql first.')
cmd = [str(osm2pgsql_cmd),
'--slim',
'--log-progress', 'true',
'--number-processes', '1' if options['append'] else str(options['threads']),
'--cache', str(options['osm2pgsql_cache']),
'--style', str(options['osm2pgsql_style'])
]
if str(options['osm2pgsql_style']).endswith('.lua'):
env['LUA_PATH'] = ';'.join((str(options['osm2pgsql_style_path'] / '?.lua'),
os.environ.get('LUAPATH', ';')))
cmd.extend(('--output', 'flex'))
else:
cmd.extend(('--output', 'gazetteer', '--hstore', '--latlon'))
cmd.append('--append' if options['append'] else '--create')
if options['flatnode_file']:
cmd.extend(('--flat-nodes', options['flatnode_file']))
for key, param in (('slim_data', '--tablespace-slim-data'),
('slim_index', '--tablespace-slim-index'),
('main_data', '--tablespace-main-data'),
('main_index', '--tablespace-main-index')):
if options['tablespaces'][key]:
cmd.extend((param, options['tablespaces'][key]))
if options['tablespaces']['main_data']:
env['NOMINATIM_TABLESPACE_PLACE_DATA'] = options['tablespaces']['main_data']
if options['tablespaces']['main_index']:
env['NOMINATIM_TABLESPACE_PLACE_INDEX'] = options['tablespaces']['main_index']
if options.get('disable_jit', False):
env['PGOPTIONS'] = '-c jit=off -c max_parallel_workers_per_gather=0'
if 'import_data' in options:
cmd.extend(('-r', 'xml', '-'))
elif isinstance(options['import_file'], list):
for fname in options['import_file']:
cmd.append(str(fname))
else:
cmd.append(str(options['import_file']))
subprocess.run(cmd, cwd=options.get('cwd', '.'),
input=options.get('import_data'),
env=env, check=True)
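# A minimal sketch of the options mapping consumed above (all values are
# illustrative assumptions, not defaults of this module):
#
#   from pathlib import Path
#
#   options = {
#       'dsn': 'dbname=nominatim',
#       'osm2pgsql': None,                  # fall back to shutil.which()
#       'osm2pgsql_cache': 2048,            # in MB
#       'osm2pgsql_style': 'import.lua',    # a .lua style selects the flex output
#       'osm2pgsql_style_path': Path('.'),
#       'threads': 4,
#       'append': False,
#       'flatnode_file': '',
#       'tablespaces': {'slim_data': '', 'slim_index': '',
#                       'main_data': '', 'main_index': ''},
#       'import_file': Path('planet.osm.pbf'),
#   }
#   run_osm2pgsql(options)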

View File

@@ -0,0 +1,58 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for removing unnecessary data from the database.
"""
from typing import Optional
from pathlib import Path
from psycopg2 import sql as pysql
from nominatim_core.db.connection import Connection
UPDATE_TABLES = [
'address_levels',
'gb_postcode',
'import_osmosis_log',
'import_polygon_%',
'location_area%',
'location_road%',
'place',
'planet_osm_%',
'search_name_%',
'us_postcode',
'wikipedia_%'
]
def drop_update_tables(conn: Connection) -> None:
""" Drop all tables only necessary for updating the database from
OSM replication data.
"""
parts = (pysql.SQL("(tablename LIKE {})").format(pysql.Literal(t)) for t in UPDATE_TABLES)
with conn.cursor() as cur:
cur.execute(pysql.SQL("SELECT tablename FROM pg_tables WHERE ")
+ pysql.SQL(' or ').join(parts))
tables = [r[0] for r in cur]
for table in tables:
cur.drop_table(table, cascade=True)
conn.commit()
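# The statement assembled above expands to roughly (illustrative):
#   SELECT tablename FROM pg_tables
#    WHERE (tablename LIKE 'address_levels') or (tablename LIKE 'gb_postcode')
#       or ... or (tablename LIKE 'wikipedia_%')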
def drop_flatnode_file(fpath: Optional[Path]) -> None:
""" Remove the flatnode file if it exists.
"""
if fpath and fpath.exists():
fpath.unlink()
def is_frozen(conn: Connection) -> bool:
""" Returns true if database is in a frozen state
"""
return conn.table_exists('place') is False

View File

@@ -0,0 +1,405 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for database migration to newer software versions.
"""
from typing import List, Tuple, Callable, Any
import logging
from psycopg2 import sql as pysql
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from nominatim_core.db import properties
from nominatim_core.db.connection import connect, Connection
from ..version import NominatimVersion, NOMINATIM_VERSION, parse_version
from ..tokenizer import factory as tokenizer_factory
from . import refresh
LOG = logging.getLogger()
_MIGRATION_FUNCTIONS : List[Tuple[NominatimVersion, Callable[..., None]]] = []
def migrate(config: Configuration, paths: Any) -> int:
""" Check for the current database version and execute migrations,
        if necessary.
"""
with connect(config.get_libpq_dsn()) as conn:
if conn.table_exists('nominatim_properties'):
db_version_str = properties.get_property(conn, 'database_version')
else:
db_version_str = None
if db_version_str is not None:
db_version = parse_version(db_version_str)
if db_version == NOMINATIM_VERSION:
LOG.warning("Database already at latest version (%s)", db_version_str)
return 0
LOG.info("Detected database version: %s", db_version_str)
else:
db_version = _guess_version(conn)
for version, func in _MIGRATION_FUNCTIONS:
if db_version < version or \
(db_version == (3, 5, 0, 99) and version == (3, 5, 0, 99)):
title = func.__doc__ or ''
LOG.warning("Running: %s (%s)", title.split('\n', 1)[0], version)
kwargs = dict(conn=conn, config=config, paths=paths)
func(**kwargs)
conn.commit()
LOG.warning('Updating SQL functions.')
refresh.create_functions(conn, config)
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
tokenizer.update_sql_functions(config)
properties.set_property(conn, 'database_version', str(NOMINATIM_VERSION))
conn.commit()
return 0
def _guess_version(conn: Connection) -> NominatimVersion:
""" Guess a database version when there is no property table yet.
Only migrations for 3.6 and later are supported, so bail out
when the version seems older.
"""
with conn.cursor() as cur:
# In version 3.6, the country_name table was updated. Check for that.
cnt = cur.scalar("""SELECT count(*) FROM
(SELECT svals(name) FROM country_name
WHERE country_code = 'gb')x;
""")
if cnt < 100:
LOG.fatal('It looks like your database was imported with a version '
'prior to 3.6.0. Automatic migration not possible.')
raise UsageError('Migration not possible.')
return NominatimVersion(3, 5, 0, 99)
def _migration(major: int, minor: int, patch: int = 0,
dbpatch: int = 0) -> Callable[[Callable[..., None]], Callable[..., None]]:
""" Decorator for a single migration step. The parameters describe the
        version after which the migration is applicable, i.e. before changing
from the given version to the next, the migration is required.
All migrations are run in the order in which they are defined in this
file. Do not run global SQL scripts for migrations as you cannot be sure
that these scripts do the same in later versions.
Functions will always be reimported in full at the end of the migration
process, so the migration functions may leave a temporary state behind
there.
"""
def decorator(func: Callable[..., None]) -> Callable[..., None]:
version = NominatimVersion(major, minor, patch, dbpatch)
_MIGRATION_FUNCTIONS.append((version, func))
return func
return decorator
@_migration(3, 5, 0, 99)
def import_status_timestamp_change(conn: Connection, **_: Any) -> None:
""" Add timezone to timestamp in status table.
The import_status table has been changed to include timezone information
with the time stamp.
"""
with conn.cursor() as cur:
cur.execute("""ALTER TABLE import_status ALTER COLUMN lastimportdate
TYPE timestamp with time zone;""")
@_migration(3, 5, 0, 99)
def add_nominatim_property_table(conn: Connection, config: Configuration, **_: Any) -> None:
""" Add nominatim_property table.
"""
if not conn.table_exists('nominatim_properties'):
with conn.cursor() as cur:
cur.execute(pysql.SQL("""CREATE TABLE nominatim_properties (
property TEXT,
value TEXT);
GRANT SELECT ON TABLE nominatim_properties TO {};
""").format(pysql.Identifier(config.DATABASE_WEBUSER)))
@_migration(3, 6, 0, 0)
def change_housenumber_transliteration(conn: Connection, **_: Any) -> None:
""" Transliterate housenumbers.
The database schema switched from saving raw housenumbers in
placex.housenumber to saving transliterated ones.
Note: the function create_housenumber_id() has been dropped in later
versions.
"""
with conn.cursor() as cur:
cur.execute("""CREATE OR REPLACE FUNCTION create_housenumber_id(housenumber TEXT)
RETURNS TEXT AS $$
DECLARE
normtext TEXT;
BEGIN
SELECT array_to_string(array_agg(trans), ';')
INTO normtext
FROM (SELECT lookup_word as trans,
getorcreate_housenumber_id(lookup_word)
FROM (SELECT make_standard_name(h) as lookup_word
FROM regexp_split_to_table(housenumber, '[,;]') h) x) y;
return normtext;
END;
$$ LANGUAGE plpgsql STABLE STRICT;""")
cur.execute("DELETE FROM word WHERE class = 'place' and type = 'house'")
cur.execute("""UPDATE placex
SET housenumber = create_housenumber_id(housenumber)
WHERE housenumber is not null""")
@_migration(3, 7, 0, 0)
def switch_placenode_geometry_index(conn: Connection, **_: Any) -> None:
""" Replace idx_placex_geometry_reverse_placeNode index.
Make the index slightly more permissive, so that it can also be used
        when matching up boundaries and place nodes. It makes the
        idx_placex_adminname index unnecessary.
"""
with conn.cursor() as cur:
cur.execute(""" CREATE INDEX IF NOT EXISTS idx_placex_geometry_placenode ON placex
USING GIST (geometry)
WHERE osm_type = 'N' and rank_search < 26
and class = 'place' and type != 'postcode'
and linked_place_id is null""")
cur.execute(""" DROP INDEX IF EXISTS idx_placex_adminname """)
@_migration(3, 7, 0, 1)
def install_legacy_tokenizer(conn: Connection, config: Configuration, **_: Any) -> None:
""" Setup legacy tokenizer.
If no other tokenizer has been configured yet, then create the
configuration for the backwards-compatible legacy tokenizer
"""
if properties.get_property(conn, 'tokenizer') is None:
with conn.cursor() as cur:
for table in ('placex', 'location_property_osmline'):
has_column = cur.scalar("""SELECT count(*) FROM information_schema.columns
WHERE table_name = %s
and column_name = 'token_info'""",
(table, ))
if has_column == 0:
cur.execute(pysql.SQL('ALTER TABLE {} ADD COLUMN token_info JSONB')
.format(pysql.Identifier(table)))
tokenizer = tokenizer_factory.create_tokenizer(config, init_db=False,
module_name='legacy')
tokenizer.migrate_database(config) # type: ignore[attr-defined]
@_migration(4, 0, 99, 0)
def create_tiger_housenumber_index(conn: Connection, **_: Any) -> None:
""" Create idx_location_property_tiger_parent_place_id with included
house number.
The inclusion is needed for efficient lookup of housenumbers in
full address searches.
"""
if conn.server_version_tuple() >= (11, 0, 0):
with conn.cursor() as cur:
cur.execute(""" CREATE INDEX IF NOT EXISTS
idx_location_property_tiger_housenumber_migrated
ON location_property_tiger
USING btree(parent_place_id)
INCLUDE (startnumber, endnumber) """)
@_migration(4, 0, 99, 1)
def create_interpolation_index_on_place(conn: Connection, **_: Any) -> None:
""" Create idx_place_interpolations for lookup of interpolation lines
on updates.
"""
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS idx_place_interpolations
ON place USING gist(geometry)
WHERE osm_type = 'W' and address ? 'interpolation'""")
@_migration(4, 0, 99, 2)
def add_step_column_for_interpolation(conn: Connection, **_: Any) -> None:
""" Add a new column 'step' to the interpolations table.
Also converts the data into the stricter format which requires that
startnumbers comply with the odd/even requirements.
"""
if conn.table_has_column('location_property_osmline', 'step'):
return
with conn.cursor() as cur:
        # Mark as invalid all interpolations with no intermediate numbers.
cur.execute("""UPDATE location_property_osmline SET startnumber = null
WHERE endnumber - startnumber <= 1 """)
# Align the start numbers where odd/even does not match.
cur.execute("""UPDATE location_property_osmline
SET startnumber = startnumber + 1,
linegeo = ST_LineSubString(linegeo,
1.0 / (endnumber - startnumber)::float,
1)
WHERE (interpolationtype = 'odd' and startnumber % 2 = 0)
or (interpolationtype = 'even' and startnumber % 2 = 1)
""")
        # Mark as invalid odd/even interpolations with no intermediate numbers.
cur.execute("""UPDATE location_property_osmline SET startnumber = null
WHERE interpolationtype in ('odd', 'even')
and endnumber - startnumber = 2""")
# Finally add the new column and populate it.
cur.execute("ALTER TABLE location_property_osmline ADD COLUMN step SMALLINT")
cur.execute("""UPDATE location_property_osmline
SET step = CASE WHEN interpolationtype = 'all'
THEN 1 ELSE 2 END
""")
@_migration(4, 0, 99, 3)
def add_step_column_for_tiger(conn: Connection, **_: Any) -> None:
""" Add a new column 'step' to the tiger data table.
"""
if conn.table_has_column('location_property_tiger', 'step'):
return
with conn.cursor() as cur:
cur.execute("ALTER TABLE location_property_tiger ADD COLUMN step SMALLINT")
cur.execute("""UPDATE location_property_tiger
SET step = CASE WHEN interpolationtype = 'all'
THEN 1 ELSE 2 END
""")
@_migration(4, 0, 99, 4)
def add_derived_name_column_for_country_names(conn: Connection, **_: Any) -> None:
""" Add a new column 'derived_name' which in the future takes the
country names as imported from OSM data.
"""
if not conn.table_has_column('country_name', 'derived_name'):
with conn.cursor() as cur:
cur.execute("ALTER TABLE country_name ADD COLUMN derived_name public.HSTORE")
@_migration(4, 0, 99, 5)
def mark_internal_country_names(conn: Connection, config: Configuration, **_: Any) -> None:
""" Names from the country table should be marked as internal to prevent
them from being deleted. Only necessary for ICU tokenizer.
"""
import psycopg2.extras # pylint: disable=import-outside-toplevel
tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
with tokenizer.name_analyzer() as analyzer:
with conn.cursor() as cur:
psycopg2.extras.register_hstore(cur)
cur.execute("SELECT country_code, name FROM country_name")
for country_code, names in cur:
if not names:
names = {}
names['countrycode'] = country_code
analyzer.add_country_names(country_code, names)
@_migration(4, 1, 99, 0)
def add_place_deletion_todo_table(conn: Connection, **_: Any) -> None:
""" Add helper table for deleting data on updates.
The table is only necessary when updates are possible, i.e.
the database is not in freeze mode.
"""
if conn.table_exists('place'):
with conn.cursor() as cur:
cur.execute("""CREATE TABLE IF NOT EXISTS place_to_be_deleted (
osm_type CHAR(1),
osm_id BIGINT,
class TEXT,
type TEXT,
deferred BOOLEAN)""")
@_migration(4, 1, 99, 1)
def split_pending_index(conn: Connection, **_: Any) -> None:
""" Reorganise indexes for pending updates.
"""
if conn.table_exists('place'):
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_rank_address_sector
ON placex USING BTREE (rank_address, geometry_sector)
WHERE indexed_status > 0""")
cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_rank_boundaries_sector
ON placex USING BTREE (rank_search, geometry_sector)
WHERE class = 'boundary' and type = 'administrative'
and indexed_status > 0""")
cur.execute("DROP INDEX IF EXISTS idx_placex_pendingsector")
@_migration(4, 2, 99, 0)
def enable_forward_dependencies(conn: Connection, **_: Any) -> None:
""" Create indexes for updates with forward dependency tracking (long-running).
"""
if conn.table_exists('planet_osm_ways'):
with conn.cursor() as cur:
cur.execute("""SELECT * FROM pg_indexes
WHERE tablename = 'planet_osm_ways'
and indexdef LIKE '%nodes%'""")
if cur.rowcount == 0:
cur.execute("""CREATE OR REPLACE FUNCTION public.planet_osm_index_bucket(bigint[])
RETURNS bigint[]
LANGUAGE sql IMMUTABLE
AS $function$
SELECT ARRAY(SELECT DISTINCT unnest($1) >> 5)
$function$""")
cur.execute("""CREATE INDEX planet_osm_ways_nodes_bucket_idx
ON planet_osm_ways
USING gin (planet_osm_index_bucket(nodes))
WITH (fastupdate=off)""")
cur.execute("""CREATE INDEX planet_osm_rels_parts_idx
ON planet_osm_rels USING gin (parts)
WITH (fastupdate=off)""")
cur.execute("ANALYZE planet_osm_ways")
@_migration(4, 2, 99, 1)
def add_improved_geometry_reverse_placenode_index(conn: Connection, **_: Any) -> None:
""" Create improved index for reverse lookup of place nodes.
"""
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_geometry_reverse_lookupPlaceNode
ON placex
USING gist (ST_Buffer(geometry, reverse_place_diameter(rank_search)))
WHERE rank_address between 4 and 25 AND type != 'postcode'
AND name is not null AND linked_place_id is null AND osm_type = 'N'
""")
@_migration(4, 4, 99, 0)
def create_postcode_area_lookup_index(conn: Connection, **_: Any) -> None:
""" Create index needed for looking up postcode areas from postocde points.
"""
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS idx_placex_postcode_areas
ON placex USING BTREE (country_code, postcode)
WHERE osm_type = 'R' AND class = 'boundary' AND type = 'postal_code'
""")
@_migration(4, 4, 99, 1)
def create_postcode_parent_index(conn: Connection, **_: Any) -> None:
""" Create index needed for updating postcodes when a parent changes.
"""
if conn.table_exists('planet_osm_ways'):
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS
idx_location_postcode_parent_place_id
ON location_postcode USING BTREE (parent_place_id)""")

View File

@@ -0,0 +1,234 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for importing, updating and otherwise maintaining the table
of artificial postcode centroids.
"""
from typing import Optional, Tuple, Dict, List, TextIO
from collections import defaultdict
from pathlib import Path
import csv
import gzip
import logging
from math import isfinite
from psycopg2 import sql as pysql
from nominatim_core.db.connection import connect, Connection
from nominatim_core.utils.centroid import PointsCentroid
from ..data.postcode_format import PostcodeFormatter, CountryPostcodeMatcher
from ..tokenizer.base import AbstractAnalyzer, AbstractTokenizer
LOG = logging.getLogger()
def _to_float(numstr: str, max_value: float) -> float:
""" Convert the number in string into a float. The number is expected
to be in the range of [-max_value, max_value]. Otherwise rises a
ValueError.
"""
num = float(numstr)
if not isfinite(num) or num <= -max_value or num >= max_value:
raise ValueError()
return num
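# Illustrative behaviour (doctest-style, for reading only):
#   _to_float('8.5', 90.0)    -> 8.5
#   _to_float('90.5', 90.0)   -> ValueError (out of range)
#   _to_float('nan', 90.0)    -> ValueError (not finite)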
class _PostcodeCollector:
""" Collector for postcodes of a single country.
"""
def __init__(self, country: str, matcher: Optional[CountryPostcodeMatcher]):
self.country = country
self.matcher = matcher
self.collected: Dict[str, PointsCentroid] = defaultdict(PointsCentroid)
self.normalization_cache: Optional[Tuple[str, Optional[str]]] = None
def add(self, postcode: str, x: float, y: float) -> None:
""" Add the given postcode to the collection cache. If the postcode
already existed, it is overwritten with the new centroid.
"""
if self.matcher is not None:
normalized: Optional[str]
if self.normalization_cache and self.normalization_cache[0] == postcode:
normalized = self.normalization_cache[1]
else:
match = self.matcher.match(postcode)
normalized = self.matcher.normalize(match) if match else None
self.normalization_cache = (postcode, normalized)
if normalized:
self.collected[normalized] += (x, y)
def commit(self, conn: Connection, analyzer: AbstractAnalyzer, project_dir: Path) -> None:
""" Update postcodes for the country from the postcodes selected so far
as well as any externally supplied postcodes.
"""
self._update_from_external(analyzer, project_dir)
to_add, to_delete, to_update = self._compute_changes(conn)
LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
self.country, len(to_add), len(to_delete), len(to_update))
with conn.cursor() as cur:
if to_add:
cur.execute_values(
"""INSERT INTO location_postcode
(place_id, indexed_status, country_code,
postcode, geometry) VALUES %s""",
to_add,
template=pysql.SQL("""(nextval('seq_place'), 1, {},
%s, 'SRID=4326;POINT(%s %s)')
""").format(pysql.Literal(self.country)))
if to_delete:
cur.execute("""DELETE FROM location_postcode
WHERE country_code = %s and postcode = any(%s)
""", (self.country, to_delete))
if to_update:
cur.execute_values(
pysql.SQL("""UPDATE location_postcode
SET indexed_status = 2,
geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
FROM (VALUES %s) AS v (pc, x, y)
WHERE country_code = {} and postcode = pc
""").format(pysql.Literal(self.country)), to_update)
def _compute_changes(self, conn: Connection) \
-> Tuple[List[Tuple[str, float, float]], List[str], List[Tuple[str, float, float]]]:
""" Compute which postcodes from the collected postcodes have to be
added or modified and which from the location_postcode table
have to be deleted.
"""
to_update = []
to_delete = []
with conn.cursor() as cur:
cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
FROM location_postcode
WHERE country_code = %s""",
(self.country, ))
for postcode, x, y in cur:
pcobj = self.collected.pop(postcode, None)
if pcobj:
newx, newy = pcobj.centroid()
                    if abs(x - newx) > 0.0000001 or abs(y - newy) > 0.0000001:
to_update.append((postcode, newx, newy))
else:
to_delete.append(postcode)
to_add = [(k, *v.centroid()) for k, v in self.collected.items()]
self.collected = defaultdict(PointsCentroid)
return to_add, to_delete, to_update
def _update_from_external(self, analyzer: AbstractAnalyzer, project_dir: Path) -> None:
""" Look for an external postcode file for the active country in
the project directory and add missing postcodes when found.
"""
csvfile = self._open_external(project_dir)
if csvfile is None:
return
try:
reader = csv.DictReader(csvfile)
for row in reader:
if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
LOG.warning("Bad format for external postcode file for country '%s'."
" Ignored.", self.country)
return
postcode = analyzer.normalize_postcode(row['postcode'])
if postcode not in self.collected:
try:
                        # Do the float conversion separately, it might throw
centroid = (_to_float(row['lon'], 180),
_to_float(row['lat'], 90))
self.collected[postcode] += centroid
except ValueError:
LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
row['lat'], row['lon'], self.country)
finally:
csvfile.close()
def _open_external(self, project_dir: Path) -> Optional[TextIO]:
fname = project_dir / f'{self.country}_postcodes.csv'
if fname.is_file():
LOG.info("Using external postcode file '%s'.", fname)
return open(fname, 'r', encoding='utf-8')
fname = project_dir / f'{self.country}_postcodes.csv.gz'
if fname.is_file():
LOG.info("Using external postcode file '%s'.", fname)
return gzip.open(fname, 'rt')
return None
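# Expected layout of such an external file (illustrative; only the three
# columns read above are required, additional columns are ignored), e.g.
# for a file named de_postcodes.csv:
#
#   postcode,lat,lon
#   01067,51.0598,13.7232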
def update_postcodes(dsn: str, project_dir: Path, tokenizer: AbstractTokenizer) -> None:
""" Update the table of artificial postcodes.
Computes artificial postcode centroids from the placex table,
potentially enhances it with external data and then updates the
postcodes in the table 'location_postcode'.
"""
matcher = PostcodeFormatter()
with tokenizer.name_analyzer() as analyzer:
with connect(dsn) as conn:
# First get the list of countries that currently have postcodes.
# (Doing this before starting to insert, so it is fast on import.)
with conn.cursor() as cur:
cur.execute("SELECT DISTINCT country_code FROM location_postcode")
todo_countries = set((row[0] for row in cur))
# Recompute the list of valid postcodes from placex.
with conn.cursor(name="placex_postcodes") as cur:
cur.execute("""
SELECT cc, pc, ST_X(centroid), ST_Y(centroid)
FROM (SELECT
COALESCE(plx.country_code,
get_country_code(ST_Centroid(pl.geometry))) as cc,
pl.address->'postcode' as pc,
COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
FROM place AS pl LEFT OUTER JOIN placex AS plx
ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
WHERE pc IS NOT null AND cc IS NOT null
ORDER BY cc, pc""")
collector = None
for country, postcode, x, y in cur:
if collector is None or country != collector.country:
if collector is not None:
collector.commit(conn, analyzer, project_dir)
collector = _PostcodeCollector(country, matcher.get_matcher(country))
todo_countries.discard(country)
collector.add(postcode, x, y)
if collector is not None:
collector.commit(conn, analyzer, project_dir)
# Now handle any countries that are only in the postcode table.
for country in todo_countries:
fmt = matcher.get_matcher(country)
_PostcodeCollector(country, fmt).commit(conn, analyzer, project_dir)
conn.commit()
analyzer.update_postcodes_from_db()
def can_compute(dsn: str) -> bool:
"""
Check that the place table exists so that
postcodes can be computed.
"""
with connect(dsn) as conn:
return conn.table_exists('place')

View File

@@ -0,0 +1,346 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for bringing auxiliary data in the database up-to-date.
"""
from typing import MutableSequence, Tuple, Any, Type, Mapping, Sequence, List, cast
import csv
import gzip
import logging
from textwrap import dedent
from pathlib import Path
from psycopg2 import sql as pysql
from nominatim_core.config import Configuration
from nominatim_core.db.connection import Connection, connect
from nominatim_core.db.utils import execute_file, CopyBuffer
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from ..version import NOMINATIM_VERSION
LOG = logging.getLogger()
OSM_TYPE = {'N': 'node', 'W': 'way', 'R': 'relation'}
def _add_address_level_rows_from_entry(rows: MutableSequence[Tuple[Any, ...]],
entry: Mapping[str, Any]) -> None:
""" Converts a single entry from the JSON format for address rank
descriptions into a flat format suitable for inserting into a
PostgreSQL table and adds these lines to `rows`.
"""
countries = entry.get('countries') or (None, )
for key, values in entry['tags'].items():
for value, ranks in values.items():
if isinstance(ranks, list):
rank_search, rank_address = ranks
else:
rank_search = rank_address = ranks
if not value:
value = None
for country in countries:
rows.append((country, key, value, rank_search, rank_address))
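# Illustrative conversion (the entry is an assumed configuration snippet):
#   {"countries": ["de"], "tags": {"place": {"city": [16, 16], "": 30}}}
# is flattened into the rows
#   ('de', 'place', 'city', 16, 16) and ('de', 'place', None, 30, 30).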
def load_address_levels(conn: Connection, table: str, levels: Sequence[Mapping[str, Any]]) -> None:
""" Replace the `address_levels` table with the contents of `levels'.
A new table is created any previously existing table is dropped.
The table has the following columns:
country, class, type, rank_search, rank_address
"""
rows: List[Tuple[Any, ...]] = []
for entry in levels:
_add_address_level_rows_from_entry(rows, entry)
with conn.cursor() as cur:
cur.drop_table(table)
cur.execute(pysql.SQL("""CREATE TABLE {} (
country_code varchar(2),
class TEXT,
type TEXT,
rank_search SMALLINT,
rank_address SMALLINT)
""").format(pysql.Identifier(table)))
cur.execute_values(pysql.SQL("INSERT INTO {} VALUES %s")
.format(pysql.Identifier(table)), rows)
cur.execute(pysql.SQL('CREATE UNIQUE INDEX ON {} (country_code, class, type)')
.format(pysql.Identifier(table)))
conn.commit()
def load_address_levels_from_config(conn: Connection, config: Configuration) -> None:
""" Replace the `address_levels` table with the content as
defined in the given configuration. Uses the parameter
NOMINATIM_ADDRESS_LEVEL_CONFIG to determine the location of the
configuration file.
"""
cfg = config.load_sub_configuration('', config='ADDRESS_LEVEL_CONFIG')
load_address_levels(conn, 'address_levels', cfg)
def create_functions(conn: Connection, config: Configuration,
enable_diff_updates: bool = True,
enable_debug: bool = False) -> None:
""" (Re)create the PL/pgSQL functions.
"""
sql = SQLPreprocessor(conn, config)
sql.run_sql_file(conn, 'functions.sql',
disable_diff_updates=not enable_diff_updates,
debug=enable_debug)
WEBSITE_SCRIPTS = (
'deletable.php',
'details.php',
'lookup.php',
'polygons.php',
'reverse.php',
'search.php',
'status.php'
)
# constants needed by PHP scripts: PHP name, config name, type
PHP_CONST_DEFS = (
('Database_DSN', 'DATABASE_DSN', str),
('Default_Language', 'DEFAULT_LANGUAGE', str),
('Log_DB', 'LOG_DB', bool),
('Log_File', 'LOG_FILE', Path),
('NoAccessControl', 'CORS_NOACCESSCONTROL', bool),
('Places_Max_ID_count', 'LOOKUP_MAX_COUNT', int),
('PolygonOutput_MaximumTypes', 'POLYGON_OUTPUT_MAX_TYPES', int),
('Search_BatchMode', 'SEARCH_BATCH_MODE', bool),
('Search_NameOnlySearchFrequencyThreshold', 'SEARCH_NAME_ONLY_THRESHOLD', str),
('Use_US_Tiger_Data', 'USE_US_TIGER_DATA', bool),
('MapIcon_URL', 'MAPICON_URL', str),
('Search_WithinCountries', 'SEARCH_WITHIN_COUNTRIES', bool),
)
def import_wikipedia_articles(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
""" Replaces the wikipedia importance tables with new data.
The import is run in a single transaction so that the new data
        is replaced seamlessly.
Returns 0 if all was well and 1 if the importance file could not
be found. Throws an exception if there was an error reading the file.
"""
if import_importance_csv(dsn, data_path / 'wikimedia-importance.csv.gz') == 0 \
or import_importance_sql(dsn, data_path / 'wikimedia-importance.sql.gz',
ignore_errors) == 0:
return 0
return 1
def import_importance_csv(dsn: str, data_file: Path) -> int:
""" Replace wikipedia importance table with data from a
single CSV file.
The file must be a gzipped CSV and have the following columns:
language, title, importance, wikidata_id
Other columns may be present but will be ignored.
"""
if not data_file.exists():
return 1
    # Only import the first occurrence of a wikidata ID.
    # This keeps the indexes and the table small.
wd_done = set()
with connect(dsn) as conn:
with conn.cursor() as cur:
cur.drop_table('wikipedia_article')
cur.drop_table('wikipedia_redirect')
cur.drop_table('wikimedia_importance')
cur.execute("""CREATE TABLE wikimedia_importance (
language TEXT NOT NULL,
title TEXT NOT NULL,
importance double precision NOT NULL,
wikidata TEXT
) """)
with gzip.open(str(data_file), 'rt') as fd, CopyBuffer() as buf:
for row in csv.DictReader(fd, delimiter='\t', quotechar='|'):
wd_id = int(row['wikidata_id'][1:])
buf.add(row['language'], row['title'], row['importance'],
None if wd_id in wd_done else row['wikidata_id'])
wd_done.add(wd_id)
if buf.size() > 10000000:
with conn.cursor() as cur:
buf.copy_out(cur, 'wikimedia_importance',
columns=['language', 'title', 'importance',
'wikidata'])
with conn.cursor() as cur:
buf.copy_out(cur, 'wikimedia_importance',
columns=['language', 'title', 'importance', 'wikidata'])
with conn.cursor() as cur:
cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_title
ON wikimedia_importance (title)""")
cur.execute("""CREATE INDEX IF NOT EXISTS idx_wikimedia_importance_wikidata
ON wikimedia_importance (wikidata)
WHERE wikidata is not null""")
conn.commit()
return 0
def import_importance_sql(dsn: str, data_file: Path, ignore_errors: bool) -> int:
""" Replace wikipedia importance table with data from an SQL file.
"""
if not data_file.exists():
return 1
pre_code = """BEGIN;
DROP TABLE IF EXISTS "wikipedia_article";
DROP TABLE IF EXISTS "wikipedia_redirect";
DROP TABLE IF EXISTS "wikipedia_importance";
"""
post_code = "COMMIT"
execute_file(dsn, data_file, ignore_errors=ignore_errors,
pre_code=pre_code, post_code=post_code)
return 0
def import_secondary_importance(dsn: str, data_path: Path, ignore_errors: bool = False) -> int:
""" Replaces the secondary importance raster data table with new data.
Returns 0 if all was well and 1 if the raster SQL file could not
be found. Throws an exception if there was an error reading the file.
"""
datafile = data_path / 'secondary_importance.sql.gz'
if not datafile.exists():
return 1
with connect(dsn) as conn:
postgis_version = conn.postgis_version_tuple()
if postgis_version[0] < 3:
LOG.error('PostGIS version is too old for using OSM raster data.')
return 2
execute_file(dsn, datafile, ignore_errors=ignore_errors)
return 0
def recompute_importance(conn: Connection) -> None:
""" Recompute wikipedia links and importance for all entries in placex.
        This is a long-running operation that must not be executed in
parallel with updates.
"""
with conn.cursor() as cur:
cur.execute('ALTER TABLE placex DISABLE TRIGGER ALL')
cur.execute("""
UPDATE placex SET (wikipedia, importance) =
(SELECT wikipedia, importance
FROM compute_importance(extratags, country_code, rank_search, centroid))
""")
cur.execute("""
UPDATE placex s SET wikipedia = d.wikipedia, importance = d.importance
FROM placex d
WHERE s.place_id = d.linked_place_id and d.wikipedia is not null
and (s.wikipedia is null or s.importance < d.importance);
""")
cur.execute('ALTER TABLE placex ENABLE TRIGGER ALL')
conn.commit()
def _quote_php_variable(var_type: Type[Any], config: Configuration,
conf_name: str) -> str:
if var_type == bool:
return 'true' if config.get_bool(conf_name) else 'false'
if var_type == int:
return cast(str, getattr(config, conf_name))
if not getattr(config, conf_name):
return 'false'
if var_type == Path:
value = str(config.get_path(conf_name) or '')
else:
value = getattr(config, conf_name)
quoted = value.replace("'", "\\'")
return f"'{quoted}'"
def setup_website(basedir: Path, config: Configuration, conn: Connection) -> None:
""" Create the website script stubs.
"""
if config.lib_dir.php is None:
LOG.info("Python frontend does not require website setup. Skipping.")
return
if not basedir.exists():
LOG.info('Creating website directory.')
basedir.mkdir()
assert config.project_dir is not None
basedata = dedent(f"""\
<?php
@define('CONST_Debug', $_GET['debug'] ?? false);
@define('CONST_LibDir', '{config.lib_dir.php}');
@define('CONST_TokenizerDir', '{config.project_dir / 'tokenizer'}');
@define('CONST_NominatimVersion', '{NOMINATIM_VERSION!s}');
""")
for php_name, conf_name, var_type in PHP_CONST_DEFS:
varout = _quote_php_variable(var_type, config, conf_name)
basedata += f"@define('CONST_{php_name}', {varout});\n"
template = "\nrequire_once(CONST_LibDir.'/website/{}');\n"
search_name_table_exists = bool(conn and conn.table_exists('search_name'))
for script in WEBSITE_SCRIPTS:
if not search_name_table_exists and script == 'search.php':
out = template.format('reverse-only-search.php')
else:
out = template.format(script)
(basedir / script).write_text(basedata + out, 'utf-8')
def invalidate_osm_object(osm_type: str, osm_id: int, conn: Connection,
recursive: bool = True) -> None:
""" Mark the given OSM object for reindexing. When 'recursive' is set
to True (the default), then all dependent objects are marked for
reindexing as well.
        'osm_type' must be one of 'N' (node), 'W' (way) or 'R' (relation).
If the given object does not exist, then nothing happens.
"""
assert osm_type in ('N', 'R', 'W')
LOG.warning("Invalidating OSM %s %s%s.",
OSM_TYPE[osm_type], osm_id,
' and its dependent places' if recursive else '')
with conn.cursor() as cur:
if recursive:
sql = """SELECT place_force_update(place_id)
FROM placex WHERE osm_type = %s and osm_id = %s"""
else:
sql = """UPDATE placex SET indexed_status = 2
WHERE osm_type = %s and osm_id = %s"""
cur.execute(sql, (osm_type, osm_id))
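# A minimal usage sketch (DSN and OSM id are assumptions):
#
#   from nominatim_core.db.connection import connect
#
#   with connect('dbname=nominatim') as conn:
#       invalidate_osm_object('R', 62422, conn)   # relation plus dependants
#       conn.commit()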

View File

@@ -0,0 +1,206 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for updating a database from a replication source.
"""
from typing import ContextManager, MutableMapping, Any, Generator, cast, Iterator
from contextlib import contextmanager
import datetime as dt
from enum import Enum
import logging
import time
import types
import urllib.request as urlrequest
import requests
from nominatim_core.errors import UsageError
from nominatim_core.db import status
from nominatim_core.db.connection import Connection, connect
from .exec_utils import run_osm2pgsql
try:
from osmium.replication.server import ReplicationServer
from osmium import WriteHandler
from osmium import version as pyo_version
except ImportError as exc:
logging.getLogger().critical("pyosmium not installed. Replication functions not available.\n"
"To install pyosmium via pip: pip3 install osmium")
raise UsageError("replication tools not available") from exc
LOG = logging.getLogger()
def init_replication(conn: Connection, base_url: str,
socket_timeout: int = 60) -> None:
""" Set up replication for the server at the given base URL.
"""
LOG.info("Using replication source: %s", base_url)
date = status.compute_database_date(conn)
# margin of error to make sure we get all data
date -= dt.timedelta(hours=3)
with _make_replication_server(base_url, socket_timeout) as repl:
seq = repl.timestamp_to_sequence(date)
if seq is None:
LOG.fatal("Cannot reach the configured replication service '%s'.\n"
"Does the URL point to a directory containing OSM update data?",
base_url)
raise UsageError("Failed to reach replication service")
status.set_status(conn, date=date, seq=seq)
LOG.warning("Updates initialised at sequence %s (%s)", seq, date)
def check_for_updates(conn: Connection, base_url: str,
socket_timeout: int = 60) -> int:
""" Check if new data is available from the replication service at the
given base URL.
"""
_, seq, _ = status.get_status(conn)
if seq is None:
LOG.error("Replication not set up. "
"Please run 'nominatim replication --init' first.")
return 254
with _make_replication_server(base_url, socket_timeout) as repl:
state = repl.get_state_info()
if state is None:
LOG.error("Cannot get state for URL %s.", base_url)
return 253
if state.sequence <= seq:
LOG.warning("Database is up to date.")
return 2
LOG.warning("New data available (%i => %i).", seq, state.sequence)
return 0
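# Sketch (illustrative only; DSN and URL are placeholders): poll a replication
# service and act on the return codes documented above (0 = new data,
# 2 = up to date, 25x = error).
if __name__ == '__main__':
    with connect('dbname=nominatim') as demo_conn:
        ret = check_for_updates(demo_conn,
                                'https://planet.openstreetmap.org/replication/minute')
        print('new data available' if ret == 0 else f'nothing to do (code {ret})')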
class UpdateState(Enum):
""" Possible states after an update has run.
"""
UP_TO_DATE = 0
MORE_PENDING = 2
NO_CHANGES = 3
def update(dsn: str, options: MutableMapping[str, Any],
socket_timeout: int = 60) -> UpdateState:
""" Update database from the next batch of data. Returns the state of
updates according to `UpdateState`.
"""
with connect(dsn) as conn:
startdate, startseq, indexed = status.get_status(conn)
conn.commit()
if startseq is None:
LOG.error("Replication not set up. "
"Please run 'nominatim replication --init' first.")
raise UsageError("Replication not set up.")
assert startdate is not None
if not indexed and options['indexed_only']:
LOG.info("Skipping update. There is data that needs indexing.")
return UpdateState.MORE_PENDING
last_since_update = dt.datetime.now(dt.timezone.utc) - startdate
update_interval = dt.timedelta(seconds=options['update_interval'])
if last_since_update < update_interval:
duration = (update_interval - last_since_update).seconds
LOG.warning("Sleeping for %s sec before next update.", duration)
time.sleep(duration)
if options['import_file'].exists():
options['import_file'].unlink()
# Read updates into file.
with _make_replication_server(options['base_url'], socket_timeout) as repl:
outhandler = WriteHandler(str(options['import_file']))
endseq = repl.apply_diffs(outhandler, startseq + 1,
max_size=options['max_diff_size'] * 1024)
outhandler.close()
if endseq is None:
return UpdateState.NO_CHANGES
with connect(dsn) as conn:
run_osm2pgsql_updates(conn, options)
        # Write the current status to the database.
endstate = repl.get_state_info(endseq)
status.set_status(conn, endstate.timestamp if endstate else None,
seq=endseq, indexed=False)
conn.commit()
return UpdateState.UP_TO_DATE
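# Sketch of a single update run (illustrative; besides the keys read above,
# the options mapping must also carry the osm2pgsql settings consumed by
# run_osm2pgsql(), which are omitted here, and replication must already be
# initialised):
if __name__ == '__main__':
    from pathlib import Path

    demo_options: MutableMapping[str, Any] = {
        'base_url': 'https://planet.openstreetmap.org/replication/minute',
        'update_interval': 75,          # seconds between runs
        'import_file': Path('/tmp/nominatim-update.osc.gz'),
        'max_diff_size': 50,            # MB of diffs per batch
        'indexed_only': False,
    }
    state = update('dbname=nominatim', demo_options)
    print('update finished with state', state.name)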
def run_osm2pgsql_updates(conn: Connection, options: MutableMapping[str, Any]) -> None:
""" Run osm2pgsql in append mode.
"""
# Remove any stale deletion marks.
with conn.cursor() as cur:
cur.execute('TRUNCATE place_to_be_deleted')
conn.commit()
# Consume updates with osm2pgsql.
options['append'] = True
options['disable_jit'] = conn.server_version_tuple() >= (11, 0)
run_osm2pgsql(options)
# Handle deletions
with conn.cursor() as cur:
cur.execute('SELECT flush_deleted_places()')
conn.commit()
def _make_replication_server(url: str, timeout: int) -> ContextManager[ReplicationServer]:
""" Returns a ReplicationServer in form of a context manager.
Creates a light wrapper around older versions of pyosmium that did
not support the context manager interface.
"""
if hasattr(ReplicationServer, '__enter__'):
# Patches the open_url function for pyosmium >= 3.2
# where the socket timeout is no longer respected.
def patched_open_url(self: ReplicationServer, url: urlrequest.Request) -> Any:
""" Download a resource from the given URL and return a byte sequence
of the content.
"""
headers = {"User-Agent" : f"Nominatim (pyosmium/{pyo_version.pyosmium_release})"}
if self.session is not None:
return self.session.get(url.get_full_url(),
headers=headers, timeout=timeout or None,
stream=True)
@contextmanager
def _get_url_with_session() -> Iterator[requests.Response]:
with requests.Session() as session:
request = session.get(url.get_full_url(),
headers=headers, timeout=timeout or None,
stream=True)
yield request
return _get_url_with_session()
repl = ReplicationServer(url)
setattr(repl, 'open_url', types.MethodType(patched_open_url, repl))
return cast(ContextManager[ReplicationServer], repl)
@contextmanager
def get_cm() -> Generator[ReplicationServer, None, None]:
yield ReplicationServer(url)
return get_cm()

View File

@@ -0,0 +1,78 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Contains the class which handles statistics for the
import of special phrases.
"""
import logging
LOG = logging.getLogger()
class SpecialPhrasesImporterStatistics():
"""
Class handling statistics of the import
process of special phrases.
"""
def __init__(self) -> None:
        self._initialize_values()
    def _initialize_values(self) -> None:
"""
Set all counts for the global
import to 0.
"""
self.tables_created = 0
self.tables_deleted = 0
self.tables_ignored = 0
self.invalids = 0
def notify_one_phrase_invalid(self) -> None:
"""
Add +1 to the count of invalid entries
fetched from the wiki.
"""
self.invalids += 1
def notify_one_table_created(self) -> None:
"""
Add +1 to the count of created tables.
"""
self.tables_created += 1
def notify_one_table_deleted(self) -> None:
"""
Add +1 to the count of deleted tables.
"""
self.tables_deleted += 1
def notify_one_table_ignored(self) -> None:
"""
Add +1 to the count of ignored tables.
"""
self.tables_ignored += 1
def notify_import_done(self) -> None:
"""
Print stats for the whole import process
and reset all values.
"""
LOG.info('====================================================================')
LOG.info('Final statistics of the import:')
LOG.info('- %s phrases were invalid.', self.invalids)
if self.invalids > 0:
LOG.info(' Those invalid phrases have been skipped.')
        LOG.info('- %s tables were ignored as they already exist in the database',
self.tables_ignored)
LOG.info('- %s tables were created', self.tables_created)
LOG.info('- %s tables were deleted from the database', self.tables_deleted)
if self.tables_deleted > 0:
LOG.info(' They were deleted as they are not valid anymore.')
if self.invalids > 0:
LOG.warning('%s phrases were invalid and have been skipped during the whole process.',
self.invalids)
        self._initialize_values()
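# Tiny illustration (not part of the original file): the importer calls the
# notify_* hooks while processing and logs the summary at the end.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    stats = SpecialPhrasesImporterStatistics()
    stats.notify_one_table_created()
    stats.notify_one_phrase_invalid()
    stats.notify_import_done()   # logs the summary and resets all counters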

View File

@@ -0,0 +1,46 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the SPCsvLoader class.
The class allows loading phrases from a CSV file.
"""
from typing import Iterable
import csv
import os
from nominatim_core.errors import UsageError
from .special_phrase import SpecialPhrase
class SPCsvLoader:
"""
    Handles loading of special phrases from an external CSV file.
"""
def __init__(self, csv_path: str) -> None:
self.csv_path = csv_path
def generate_phrases(self) -> Iterable[SpecialPhrase]:
""" Open and parse the given csv file.
Create the corresponding SpecialPhrases.
"""
self._check_csv_validity()
with open(self.csv_path, encoding='utf-8') as fd:
reader = csv.DictReader(fd, delimiter=',')
for row in reader:
yield SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
def _check_csv_validity(self) -> None:
"""
Check that the csv file has the right extension.
"""
_, extension = os.path.splitext(self.csv_path)
if extension != '.csv':
raise UsageError(f'The file {self.csv_path} is not a csv file.')
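# Usage sketch (the file name is a placeholder): the CSV is expected to have
# the header columns phrase,class,type,operator, matching the DictReader above.
if __name__ == '__main__':
    loader = SPCsvLoader('special_phrases.csv')
    for sp in loader.generate_phrases():
        print(sp.p_label, sp.p_class, sp.p_type, sp.p_operator)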

View File

@@ -0,0 +1,274 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the class handling the import
of the special phrases.
Phrases are analyzed and imported into the database.
The phrases already present in the database which are no
longer valid are removed.
"""
from typing import Iterable, Tuple, Mapping, Sequence, Optional, Set
import logging
import re
from psycopg2.sql import Identifier, SQL
from nominatim_core.typing import Protocol
from nominatim_core.config import Configuration
from nominatim_core.db.connection import Connection
from .importer_statistics import SpecialPhrasesImporterStatistics
from .special_phrase import SpecialPhrase
from ...tokenizer.base import AbstractTokenizer
LOG = logging.getLogger()
def _classtype_table(phrase_class: str, phrase_type: str) -> str:
""" Return the name of the table for the given class and type.
"""
return f'place_classtype_{phrase_class}_{phrase_type}'
class SpecialPhraseLoader(Protocol):
""" Protocol for classes implementing a loader for special phrases.
"""
def generate_phrases(self) -> Iterable[SpecialPhrase]:
""" Generates all special phrase terms this loader can produce.
"""
class SPImporter():
# pylint: disable-msg=too-many-instance-attributes
"""
    Class handling the import of special phrases into the database.
    Takes an SP loader which loads the phrases from an external source.
"""
def __init__(self, config: Configuration, conn: Connection,
sp_loader: SpecialPhraseLoader) -> None:
self.config = config
self.db_connection = conn
self.sp_loader = sp_loader
self.statistics_handler = SpecialPhrasesImporterStatistics()
self.black_list, self.white_list = self._load_white_and_black_lists()
self.sanity_check_pattern = re.compile(r'^\w+$')
# This set will contain all existing phrases to be added.
# It contains tuples with the following format: (label, class, type, operator)
self.word_phrases: Set[Tuple[str, str, str, str]] = set()
        # This set will contain all existing place_classtype tables which don't match any
        # special phrase class/type on the wiki.
self.table_phrases_to_delete: Set[str] = set()
def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool) -> None:
"""
Iterate through all SpecialPhrases extracted from the
loader and import them into the database.
        If should_replace is set to True, only the loaded phrases
        will be kept in the database. All other phrases already
        in the database will be removed.
"""
LOG.warning('Special phrases importation starting')
self._fetch_existing_place_classtype_tables()
# Store pairs of class/type for further processing
class_type_pairs = set()
for phrase in self.sp_loader.generate_phrases():
result = self._process_phrase(phrase)
if result:
class_type_pairs.add(result)
self._create_classtype_table_and_indexes(class_type_pairs)
if should_replace:
self._remove_non_existent_tables_from_db()
self.db_connection.commit()
with tokenizer.name_analyzer() as analyzer:
analyzer.update_special_phrases(self.word_phrases, should_replace)
LOG.warning('Import done.')
self.statistics_handler.notify_import_done()
def _fetch_existing_place_classtype_tables(self) -> None:
"""
Fetch existing place_classtype tables.
Fill the table_phrases_to_delete set of the class.
"""
query = """
SELECT table_name
FROM information_schema.tables
WHERE table_schema='public'
AND table_name like 'place_classtype_%';
"""
with self.db_connection.cursor() as db_cursor:
db_cursor.execute(SQL(query))
for row in db_cursor:
self.table_phrases_to_delete.add(row[0])
def _load_white_and_black_lists(self) \
-> Tuple[Mapping[str, Sequence[str]], Mapping[str, Sequence[str]]]:
"""
        Load white and black lists from phrase-settings.json.
"""
settings = self.config.load_sub_configuration('phrase-settings.json')
return settings['blackList'], settings['whiteList']
def _check_sanity(self, phrase: SpecialPhrase) -> bool:
"""
        Check sanity of the given inputs in case somebody added garbage to the wiki.
        If a bad class/type is detected, the phrase is reported and skipped.
"""
        class_matches = self.sanity_check_pattern.findall(phrase.p_class)
        type_matches = self.sanity_check_pattern.findall(phrase.p_type)
        if not class_matches or not type_matches:
LOG.warning("Bad class/type: %s=%s. It will not be imported",
phrase.p_class, phrase.p_type)
return False
return True
def _process_phrase(self, phrase: SpecialPhrase) -> Optional[Tuple[str, str]]:
"""
        Process the given phrase by checking it against the black and
        white lists and the sanity pattern.
        Return the class/type pair corresponding to the phrase, or None
        if the phrase was rejected.
"""
# blacklisting: disallow certain class/type combinations
if phrase.p_class in self.black_list.keys() \
and phrase.p_type in self.black_list[phrase.p_class]:
return None
# whitelisting: if class is in whitelist, allow only tags in the list
if phrase.p_class in self.white_list.keys() \
and phrase.p_type not in self.white_list[phrase.p_class]:
return None
# sanity check, in case somebody added garbage in the wiki
if not self._check_sanity(phrase):
self.statistics_handler.notify_one_phrase_invalid()
return None
self.word_phrases.add((phrase.p_label, phrase.p_class,
phrase.p_type, phrase.p_operator))
return (phrase.p_class, phrase.p_type)
def _create_classtype_table_and_indexes(self,
class_type_pairs: Iterable[Tuple[str, str]]) -> None:
"""
Create table place_classtype for each given pair.
Also create indexes on place_id and centroid.
"""
LOG.warning('Create tables and indexes...')
sql_tablespace = self.config.TABLESPACE_AUX_DATA
if sql_tablespace:
sql_tablespace = ' TABLESPACE ' + sql_tablespace
with self.db_connection.cursor() as db_cursor:
db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")
for pair in class_type_pairs:
phrase_class = pair[0]
phrase_type = pair[1]
table_name = _classtype_table(phrase_class, phrase_type)
if table_name in self.table_phrases_to_delete:
self.statistics_handler.notify_one_table_ignored()
                # Remove this table from the ones to delete as it matches a
                # class/type still existing in the special phrases on the wiki,
                # so there is no need to create the table and indexes.
                self.table_phrases_to_delete.remove(table_name)
continue
# Table creation
self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
# Indexes creation
self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)
# Grant access on read to the web user.
self._grant_access_to_webuser(phrase_class, phrase_type)
self.statistics_handler.notify_one_table_created()
with self.db_connection.cursor() as db_cursor:
db_cursor.execute("DROP INDEX idx_placex_classtype")
def _create_place_classtype_table(self, sql_tablespace: str,
phrase_class: str, phrase_type: str) -> None:
"""
        Create the place_classtype table for the given phrase_class/phrase_type
        if it doesn't exist yet.
"""
table_name = _classtype_table(phrase_class, phrase_type)
with self.db_connection.cursor() as cur:
cur.execute(SQL("""CREATE TABLE IF NOT EXISTS {} {} AS
SELECT place_id AS place_id,
st_centroid(geometry) AS centroid
FROM placex
WHERE class = %s AND type = %s
""").format(Identifier(table_name), SQL(sql_tablespace)),
(phrase_class, phrase_type))
def _create_place_classtype_indexes(self, sql_tablespace: str,
phrase_class: str, phrase_type: str) -> None:
"""
Create indexes on centroid and place_id for the place_classtype table.
"""
index_prefix = f'idx_place_classtype_{phrase_class}_{phrase_type}_'
base_table = _classtype_table(phrase_class, phrase_type)
# Index on centroid
if not self.db_connection.index_exists(index_prefix + 'centroid'):
with self.db_connection.cursor() as db_cursor:
db_cursor.execute(SQL("CREATE INDEX {} ON {} USING GIST (centroid) {}")
.format(Identifier(index_prefix + 'centroid'),
Identifier(base_table),
SQL(sql_tablespace)))
# Index on place_id
if not self.db_connection.index_exists(index_prefix + 'place_id'):
with self.db_connection.cursor() as db_cursor:
db_cursor.execute(SQL("CREATE INDEX {} ON {} USING btree(place_id) {}")
.format(Identifier(index_prefix + 'place_id'),
Identifier(base_table),
SQL(sql_tablespace)))
def _grant_access_to_webuser(self, phrase_class: str, phrase_type: str) -> None:
"""
Grant access on read to the table place_classtype for the webuser.
"""
table_name = _classtype_table(phrase_class, phrase_type)
with self.db_connection.cursor() as db_cursor:
db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
.format(Identifier(table_name),
Identifier(self.config.DATABASE_WEBUSER)))
def _remove_non_existent_tables_from_db(self) -> None:
"""
        Remove special phrases which no longer exist on the wiki
        by deleting their place_classtype tables.
"""
LOG.warning('Cleaning database...')
# Delete place_classtype tables corresponding to class/type which
# are not on the wiki anymore.
with self.db_connection.cursor() as db_cursor:
for table in self.table_phrases_to_delete:
self.statistics_handler.notify_one_table_deleted()
db_cursor.drop_table(table)
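# Minimal illustration of the SpecialPhraseLoader protocol (the demo loader
# is hypothetical, not part of Nominatim): any object with a matching
# generate_phrases() method can feed SPImporter.
class _DemoLoader:
    def generate_phrases(self) -> Iterable[SpecialPhrase]:
        yield SpecialPhrase('Bakeries', 'shop', 'bakery', 'near')
if __name__ == '__main__':
    demo_phrase = next(iter(_DemoLoader().generate_phrases()))
    print(_classtype_table(demo_phrase.p_class, demo_phrase.p_type))
    # -> place_classtype_shop_bakery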

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the SPWikiLoader class.
"""
from typing import Iterable
import re
import logging
from nominatim_core.config import Configuration
from nominatim_core.utils.url_utils import get_url
from .special_phrase import SpecialPhrase
LOG = logging.getLogger()
def _get_wiki_content(lang: str) -> str:
"""
Request and return the wiki page's content
corresponding to special phrases for a given lang.
    Example of a requested URL:
https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
"""
url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' \
+ lang.upper()
return get_url(url)
class SPWikiLoader:
"""
Handles loading of special phrases from the wiki.
"""
def __init__(self, config: Configuration) -> None:
self.config = config
        # Compile the regex here to improve performance.
        self.occurrence_pattern = re.compile(
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
)
# Hack around a bug where building=yes was imported with quotes into the wiki
self.type_fix_pattern = re.compile(r'\"|&quot;')
self.languages = self.config.get_str_list('LANGUAGES') or \
['af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi',
'lv', 'tr']
def generate_phrases(self) -> Iterable[SpecialPhrase]:
""" Download the wiki pages for the configured languages
and extract the phrases from the page.
"""
for lang in self.languages:
LOG.warning('Importing phrases for lang: %s...', lang)
loaded_xml = _get_wiki_content(lang)
            # One match is of the format [label, class, type, operator, plural]
            matches = self.occurrence_pattern.findall(loaded_xml)
for match in matches:
yield SpecialPhrase(match[0],
match[1],
self.type_fix_pattern.sub('', match[2]),
match[3])
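# Illustration of the wiki table row matched by the pattern above (the sample
# row is made up but follows the documented column layout):
if __name__ == '__main__':
    demo_pattern = re.compile(
        r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])')
    print(demo_pattern.findall('| Bakery || shop || bakery || near || Y'))
    # -> [('Bakery ', 'shop ', 'bakery ', 'near ', 'Y')]; the trailing blanks
    # are stripped later by SpecialPhrase.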

View File

@@ -0,0 +1,37 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Module containing the class SpecialPhrase.
This class is a model used to transfer a special phrase through
the loading and import process.
"""
from typing import Any
class SpecialPhrase:
"""
Model representing a special phrase.
"""
def __init__(self, p_label: str, p_class: str, p_type: str, p_operator: str) -> None:
self.p_label = p_label.strip()
self.p_class = p_class.strip()
self.p_type = p_type.strip()
        # Needed if some operators on the wiki are not written in English
p_operator = p_operator.strip().lower()
self.p_operator = '-' if p_operator not in ('near', 'in') else p_operator
def __eq__(self, other: Any) -> bool:
if not isinstance(other, SpecialPhrase):
return False
return self.p_label == other.p_label \
and self.p_class == other.p_class \
and self.p_type == other.p_type \
and self.p_operator == other.p_operator
def __hash__(self) -> int:
return hash((self.p_label, self.p_class, self.p_type, self.p_operator))
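# Quick illustration of the normalisation above (the values are made up):
if __name__ == '__main__':
    a = SpecialPhrase(' Bakery ', 'shop', 'bakery', 'NEAR')
    b = SpecialPhrase('Bakery', 'shop', 'bakery', 'near')
    print(a == b, a.p_operator)   # -> True near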

View File

@@ -0,0 +1,149 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for importing Tiger data and handling tarball and directory files.
"""
from typing import Any, TextIO, List, Union, cast
import csv
import io
import logging
import os
import tarfile
from psycopg2.extras import Json
from nominatim_core.config import Configuration
from nominatim_core.db.connection import connect
from nominatim_core.db.async_connection import WorkerPool
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from nominatim_core.errors import UsageError
from ..data.place_info import PlaceInfo
from ..tokenizer.base import AbstractAnalyzer, AbstractTokenizer
from . import freeze
LOG = logging.getLogger()
class TigerInput:
""" Context manager that goes through Tiger input files which may
either be in a directory or gzipped together in a tar file.
"""
def __init__(self, data_dir: str) -> None:
self.tar_handle = None
self.files: List[Union[str, tarfile.TarInfo]] = []
if data_dir.endswith('.tar.gz'):
try:
self.tar_handle = tarfile.open(data_dir) # pylint: disable=consider-using-with
except tarfile.ReadError as err:
LOG.fatal("Cannot open '%s'. Is this a tar file?", data_dir)
raise UsageError("Cannot open Tiger data file.") from err
self.files = [i for i in self.tar_handle.getmembers() if i.name.endswith('.csv')]
LOG.warning("Found %d CSV files in tarfile with path %s", len(self.files), data_dir)
else:
files = os.listdir(data_dir)
self.files = [os.path.join(data_dir, i) for i in files if i.endswith('.csv')]
LOG.warning("Found %d CSV files in path %s", len(self.files), data_dir)
if not self.files:
LOG.warning("Tiger data import selected but no files found at %s", data_dir)
def __enter__(self) -> 'TigerInput':
return self
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
if self.tar_handle:
self.tar_handle.close()
self.tar_handle = None
def next_file(self) -> TextIO:
""" Return a file handle to the next file to be processed.
Raises an IndexError if there is no file left.
"""
fname = self.files.pop(0)
if self.tar_handle is not None:
extracted = self.tar_handle.extractfile(fname)
assert extracted is not None
return io.TextIOWrapper(extracted)
return open(cast(str, fname), encoding='utf-8')
def __len__(self) -> int:
return len(self.files)
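# Usage sketch (the path is a placeholder): print the header line of every
# CSV file found in a Tiger data directory or tarball.
if __name__ == '__main__':
    with TigerInput('/data/tiger') as demo:
        while demo:
            with demo.next_file() as fd:
                print(fd.readline().rstrip())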
def handle_threaded_sql_statements(pool: WorkerPool, fd: TextIO,
analyzer: AbstractAnalyzer) -> None:
""" Handles sql statement with multiplexing
"""
lines = 0
# Using pool of database connections to execute sql statements
sql = "SELECT tiger_line_import(%s, %s, %s, %s, %s, %s)"
for row in csv.DictReader(fd, delimiter=';'):
try:
address = dict(street=row['street'], postcode=row['postcode'])
args = ('SRID=4326;' + row['geometry'],
int(row['from']), int(row['to']), row['interpolation'],
Json(analyzer.process_place(PlaceInfo({'address': address}))),
analyzer.normalize_postcode(row['postcode']))
except ValueError:
continue
pool.next_free_worker().perform(sql, args=args)
lines += 1
if lines == 1000:
print('.', end='', flush=True)
lines = 0
def add_tiger_data(data_dir: str, config: Configuration, threads: int,
tokenizer: AbstractTokenizer) -> int:
""" Import tiger data from directory or tar file `data dir`.
"""
dsn = config.get_libpq_dsn()
with connect(dsn) as conn:
is_frozen = freeze.is_frozen(conn)
conn.close()
if is_frozen:
raise UsageError("Tiger cannot be imported when database frozen (Github issue #3048)")
with TigerInput(data_dir) as tar:
if not tar:
return 1
with connect(dsn) as conn:
sql = SQLPreprocessor(conn, config)
sql.run_sql_file(conn, 'tiger_import_start.sql')
        # Read the files and, for each line, hand the SQL query
        # to one of <threads - 1> worker connections.
place_threads = max(1, threads - 1)
with WorkerPool(dsn, place_threads, ignore_sql_errors=True) as pool:
with tokenizer.name_analyzer() as analyzer:
while tar:
with tar.next_file() as fd:
handle_threaded_sql_statements(pool, fd, analyzer)
print('\n')
LOG.warning("Creating indexes on Tiger data")
with connect(dsn) as conn:
sql = SQLPreprocessor(conn, config)
sql.run_sql_file(conn, 'tiger_import_finish.sql')
return 0

View File

@@ -0,0 +1,62 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Version information for Nominatim.
"""
from typing import Optional, NamedTuple
class NominatimVersion(NamedTuple):
""" Version information for Nominatim. We follow semantic versioning.
Major, minor and patch_level refer to the last released version.
The database patch level tracks important changes between releases
and must always be increased when there is a change to the database or code
that requires a migration.
When adding a migration on the development branch, raise the patch level
to 99 to make sure that the migration is applied when updating from a
patch release to the next minor version. Patch releases usually shouldn't
    have migrations in them. When one is needed, make sure that the
    migration can be reapplied and set the migration version to the appropriate
    patch level when cherry-picking the commit with the migration.
"""
major: int
minor: int
patch_level: int
db_patch_level: int
def __str__(self) -> str:
return f"{self.major}.{self.minor}.{self.patch_level}-{self.db_patch_level}"
def release_version(self) -> str:
""" Return the release version in semantic versioning format.
The release version does not include the database patch version.
"""
return f"{self.major}.{self.minor}.{self.patch_level}"
NOMINATIM_VERSION = NominatimVersion(4, 4, 99, 1)
POSTGRESQL_REQUIRED_VERSION = (9, 6)
POSTGIS_REQUIRED_VERSION = (2, 2)
# CMake sets the variable @GIT_HASH@ by executing 'git log'. It is not run
# on every execution of 'make'.
# cmake/tool-installed.tmpl is used to build the binary 'nominatim'. Inside
# there is a call to set the variable value below.
GIT_COMMIT_HASH : Optional[str] = None
def parse_version(version: str) -> NominatimVersion:
""" Parse a version string into a version consisting of a tuple of
four ints: major, minor, patch level, database patch level
This is the reverse operation of `version_str()`.
"""
parts = version.split('.')
return NominatimVersion(*[int(x) for x in parts[:2] + parts[2].split('-')])
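# Round-trip illustration of the version helpers above:
if __name__ == '__main__':
    v = parse_version('4.4.99-1')
    assert v == NOMINATIM_VERSION
    print(str(v), v.release_version())   # -> 4.4.99-1 4.4.99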