split code into submodules

Sarah Hoffmann
2024-05-16 11:55:17 +02:00
parent 0fb4fe8e4d
commit 6e89310a92
137 changed files with 757 additions and 716 deletions

@@ -0,0 +1,253 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Abstract class definitions for tokenizers. These base classes are here
mainly for documentation purposes.
"""
from abc import ABC, abstractmethod
from typing import List, Tuple, Dict, Any, Optional, Iterable
from pathlib import Path
from nominatim_core.typing import Protocol
from nominatim_core.config import Configuration
from nominatim_core.db.connection import Connection
from ..data.place_info import PlaceInfo
class AbstractAnalyzer(ABC):
""" The analyzer provides the functions for analysing names and building
the token database.
Analyzers are instantiated on a per-thread base. Access to global data
structures must be synchronised accordingly.
"""
def __enter__(self) -> 'AbstractAnalyzer':
return self
def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
self.close()
@abstractmethod
def close(self) -> None:
""" Free all resources used by the analyzer.
"""
@abstractmethod
def get_word_token_info(self, words: List[str]) -> List[Tuple[str, str, int]]:
""" Return token information for the given list of words.
The function is used for testing and debugging only
and does not need to be particularly efficient.
Arguments:
words: A list of words to look up the tokens for.
If a word starts with # it is assumed to be a full name,
otherwise it is a partial term.
Returns:
The function returns the list of all tuples that could be
found for the given words. Each list entry is a tuple of
(original word, word token, word id).
"""
@abstractmethod
def normalize_postcode(self, postcode: str) -> str:
""" Convert the postcode to its standardized form.
This function must yield exactly the same result as the SQL function
`token_normalized_postcode()`.
Arguments:
postcode: The postcode to be normalized.
Returns:
The given postcode after normalization.
"""
@abstractmethod
def update_postcodes_from_db(self) -> None:
""" Update the tokenizer's postcode tokens from the current content
of the `location_postcode` table.
"""
@abstractmethod
def update_special_phrases(self,
phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None:
""" Update the tokenizer's special phrase tokens from the given
list of special phrases.
Arguments:
phrases: The new list of special phrases. Each entry is
a tuple of (phrase, class, type, operator).
should_replace: If true, replace the current list of phrases.
When false, just add the given phrases to the
ones that already exist.
"""
@abstractmethod
def add_country_names(self, country_code: str, names: Dict[str, str]) -> None:
""" Add the given names to the tokenizer's list of country tokens.
Arguments:
country_code: two-letter country code for the country the names
refer to.
names: Dictionary of name type to name.
"""
@abstractmethod
def process_place(self, place: PlaceInfo) -> Any:
""" Extract tokens for the given place and compute the
information to be handed to the PL/pgSQL processor for building
the search index.
Arguments:
place: Place information retrieved from the database.
Returns:
A JSON-serialisable structure that will be handed into
the database via the `token_info` field.
"""
class AbstractTokenizer(ABC):
""" The tokenizer instance is the central instance of the tokenizer in
the system. There will only be a single instance of the tokenizer
active at any time.
"""
@abstractmethod
def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
""" Set up a new tokenizer for the database.
The function should copy all necessary data into the project
directory or save it in the property table to make sure that
the tokenizer remains stable over updates.
Arguments:
config: Read-only object with configuration options.
init_db: When set to False, then initialisation of database
tables should be skipped. This option is only required for
migration purposes and can be safely ignored by custom
tokenizers.
"""
@abstractmethod
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from an existing database setup.
The function should load all previously saved configuration from
the project directory and/or the property table.
Arguments:
config: Read-only object with configuration options.
"""
@abstractmethod
def finalize_import(self, config: Configuration) -> None:
""" This function is called at the very end of an import when all
data has been imported and indexed. The tokenizer may create
at this point any additional indexes and data structures needed
during query time.
Arguments:
config: Read-only object with configuration options.
"""
@abstractmethod
def update_sql_functions(self, config: Configuration) -> None:
""" Update the SQL part of the tokenizer. This function is called
automatically on migrations or may be called explicitly by the
user through the `nominatim refresh --functions` command.
The tokenizer must only update the code of the tokenizer. The
data structures or data itself must not be changed by this function.
Arguments:
config: Read-only object with configuration options.
"""
@abstractmethod
def check_database(self, config: Configuration) -> Optional[str]:
""" Check that the database is set up correctly and ready for being
queried.
Arguments:
config: Read-only object with configuration options.
Returns:
If an issue was found, return an error message with the
description of the issue as well as hints for the user on
how to resolve the issue. If everything is okay, return `None`.
"""
@abstractmethod
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
""" Recompute any tokenizer statistics necessary for efficient lookup.
This function is meant to be called from time to time by the user
to improve performance. However, the tokenizer must not depend on
it to be called in order to work.
"""
@abstractmethod
def update_word_tokens(self) -> None:
""" Do house-keeping on the tokenizers internal data structures.
Remove unused word tokens, resort data etc.
"""
@abstractmethod
def name_analyzer(self) -> AbstractAnalyzer:
""" Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should
be used accordingly:
```
with tokenizer.name_analyzer() as analyzer:
analyzer.tokenize()
```
When used outside the with construct, the caller must ensure that
close() is called before the analyzer is destroyed.
"""
@abstractmethod
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
""" Return a list of the most frequent full words in the database.
Arguments:
conn: Open connection to the database which may be used to
retrieve the words.
num: Maximum number of words to return.
"""
class TokenizerModule(Protocol):
""" Interface that must be exported by modules that implement their
own tokenizer.
"""
def create(self, dsn: str, data_dir: Path) -> AbstractTokenizer:
""" Factory for new tokenizers.
"""


@@ -0,0 +1,102 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for creating a tokenizer or initialising the right one for an
existing database.
A tokenizer is something that is bound to the lifetime of a database. It
can be chosen and configured before the initial import but then needs to
be used consistently when querying and updating the database.
This module provides the functions to create and configure a new tokenizer
as well as instantiating the appropriate tokenizer for updating an existing
database.
A tokenizer usually also includes PHP code for querying. The appropriate PHP
normalizer module is installed when the tokenizer is created.
"""
from typing import Optional
import logging
import importlib
from pathlib import Path
from nominatim_core.errors import UsageError
from nominatim_core.db import properties
from nominatim_core.db.connection import connect
from nominatim_core.config import Configuration
from ..tokenizer.base import AbstractTokenizer, TokenizerModule
LOG = logging.getLogger()
def _import_tokenizer(name: str) -> TokenizerModule:
""" Load the tokenizer.py module from project directory.
"""
src_file = Path(__file__).parent / (name + '_tokenizer.py')
if not src_file.is_file():
LOG.fatal("No tokenizer named '%s' available. "
"Check the setting of NOMINATIM_TOKENIZER.", name)
raise UsageError('Tokenizer not found')
return importlib.import_module('nominatim.tokenizer.' + name + '_tokenizer')
def create_tokenizer(config: Configuration, init_db: bool = True,
module_name: Optional[str] = None) -> AbstractTokenizer:
""" Create a new tokenizer as defined by the given configuration.
The tokenizer data and code are copied into the 'tokenizer' directory
of the project directory and the tokenizer loaded from its new location.
"""
if module_name is None:
module_name = config.TOKENIZER
# Create the directory for the tokenizer data
assert config.project_dir is not None
basedir = config.project_dir / 'tokenizer'
if not basedir.exists():
basedir.mkdir()
elif not basedir.is_dir():
LOG.fatal("Tokenizer directory '%s' cannot be created.", basedir)
raise UsageError("Tokenizer setup failed.")
# Import and initialize the tokenizer.
tokenizer_module = _import_tokenizer(module_name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
tokenizer.init_new_db(config, init_db=init_db)
with connect(config.get_libpq_dsn()) as conn:
properties.set_property(conn, 'tokenizer', module_name)
return tokenizer
def get_tokenizer_for_db(config: Configuration) -> AbstractTokenizer:
""" Instantiate a tokenizer for an existing database.
The function looks up the appropriate tokenizer in the database
and initialises it.
"""
assert config.project_dir is not None
basedir = config.project_dir / 'tokenizer'
if not basedir.is_dir():
# Directory will be repopulated by tokenizer below.
basedir.mkdir()
with connect(config.get_libpq_dsn()) as conn:
name = properties.get_property(conn, 'tokenizer')
if name is None:
LOG.fatal("Tokenizer was not set up properly. Database property missing.")
raise UsageError('Cannot initialize tokenizer.')
tokenizer_module = _import_tokenizer(name)
tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
tokenizer.init_from_project(config)
return tokenizer
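A short sketch of how the two factory functions are meant to be used. Assumptions: the module is importable as `nominatim.tokenizer.factory` (the file name is not visible in this diff) and `config` is a fully populated `Configuration` with `project_dir` set.

```
# Sketch only - module path and call sites are assumptions, not part of the commit.
from nominatim_core.config import Configuration
from nominatim.tokenizer import factory
from nominatim.tokenizer.base import AbstractTokenizer


def import_flow(config: Configuration) -> None:
    # At import time: pick the tokenizer named in NOMINATIM_TOKENIZER, set up
    # its data under <project_dir>/tokenizer and record the choice in the
    # database property table.
    tokenizer = factory.create_tokenizer(config)
    # ... data import and indexing happen here ...
    tokenizer.finalize_import(config)


def update_flow(config: Configuration) -> AbstractTokenizer:
    # On later runs: read the stored 'tokenizer' property and re-instantiate
    # the same implementation from the project directory.
    return factory.get_tokenizer_for_db(config)
```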


@@ -0,0 +1,196 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Helper class to create ICU rules from a configuration file.
"""
from typing import Mapping, Any, Dict, Optional
import io
import json
import logging
from icu import Transliterator
from nominatim_core.config import flatten_config_list, Configuration
from nominatim_core.db.properties import set_property, get_property
from nominatim_core.db.connection import Connection
from nominatim_core.errors import UsageError
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .token_analysis.base import AnalysisModule, Analyzer
from ..data import country_info
LOG = logging.getLogger()
DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
def _get_section(rules: Mapping[str, Any], section: str) -> Any:
""" Get the section named 'section' from the rules. If the section does
not exist, raise a usage error with a meaningful message.
"""
if section not in rules:
LOG.fatal("Section '%s' not found in tokenizer config.", section)
raise UsageError("Syntax error in tokenizer configuration file.")
return rules[section]
class ICURuleLoader:
""" Compiler for ICU rules from a tokenizer configuration file.
"""
def __init__(self, config: Configuration) -> None:
self.config = config
rules = config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG')
# Make sure country information is available to analyzers and sanitizers.
country_info.setup_country_config(config)
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
self.analysis_rules = _get_section(rules, 'token-analysis')
self._setup_analysis()
# Load optional sanitizer rule set.
self.sanitizer_rules = rules.get('sanitizers', [])
def load_config_from_db(self, conn: Connection) -> None:
""" Get previously saved parts of the configuration from the
database.
"""
rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
if rules is not None:
self.normalization_rules = rules
rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
if rules is not None:
self.transliteration_rules = rules
rules = get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)
if rules:
self.analysis_rules = json.loads(rules)
else:
self.analysis_rules = []
self._setup_analysis()
def save_config_to_db(self, conn: Connection) -> None:
""" Save the part of the configuration that cannot be changed into
the database.
"""
set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
def make_sanitizer(self) -> PlaceSanitizer:
""" Create a place sanitizer from the configured rules.
"""
return PlaceSanitizer(self.sanitizer_rules, self.config)
def make_token_analysis(self) -> ICUTokenAnalysis:
""" Create a token analyser from the reviouly loaded rules.
"""
return ICUTokenAnalysis(self.normalization_rules,
self.transliteration_rules, self.analysis)
def get_search_rules(self) -> str:
""" Return the ICU rules to be used during search.
The rules combine normalization and transliteration.
"""
# First apply the normalization rules.
rules = io.StringIO()
rules.write(self.normalization_rules)
# Then add transliteration.
rules.write(self.transliteration_rules)
return rules.getvalue()
def get_normalization_rules(self) -> str:
""" Return rules for normalisation of a term.
"""
return self.normalization_rules
def get_transliteration_rules(self) -> str:
""" Return the rules for converting a string into its asciii representation.
"""
return self.transliteration_rules
def _setup_analysis(self) -> None:
""" Process the rules used for creating the various token analyzers.
"""
self.analysis: Dict[Optional[str], TokenAnalyzerRule] = {}
if not isinstance(self.analysis_rules, list):
raise UsageError("Configuration section 'token-analysis' must be a list.")
norm = Transliterator.createFromRules("rule_loader_normalization",
self.normalization_rules)
trans = Transliterator.createFromRules("rule_loader_transliteration",
self.transliteration_rules)
for section in self.analysis_rules:
name = section.get('id', None)
if name in self.analysis:
if name is None:
LOG.fatal("ICU tokenizer configuration has two default token analyzers.")
else:
LOG.fatal("ICU tokenizer configuration has two token "
"analyzers with id '%s'.", name)
raise UsageError("Syntax error in ICU tokenizer config.")
self.analysis[name] = TokenAnalyzerRule(section, norm, trans,
self.config)
@staticmethod
def _cfg_to_icu_rules(rules: Mapping[str, Any], section: str) -> str:
""" Load an ICU ruleset from the given section. If the section is a
simple string, it is interpreted as a file name and the rules are
loaded verbatim from the given file. The filename is expected to be
relative to the tokenizer rule file. If the section is a list, then
each entry is assumed to be a rule. All rules are concatenated and returned.
"""
content = _get_section(rules, section)
if content is None:
return ''
return ';'.join(flatten_config_list(content, section)) + ';'
class TokenAnalyzerRule:
""" Factory for a single analysis module. The class saves the configuration
and creates a new token analyzer on request.
"""
def __init__(self, rules: Mapping[str, Any],
normalizer: Any, transliterator: Any,
config: Configuration) -> None:
analyzer_name = _get_section(rules, 'analyzer')
if not analyzer_name or not isinstance(analyzer_name, str):
raise UsageError("'analyzer' parameter needs to be simple string")
self._analysis_mod: AnalysisModule = \
config.load_plugin_module(analyzer_name, 'nominatim.tokenizer.token_analysis')
self.config = self._analysis_mod.configure(rules, normalizer,
transliterator)
def create(self, normalizer: Any, transliterator: Any) -> Analyzer:
""" Create a new analyser instance for the given rule.
"""
return self._analysis_mod.create(normalizer, transliterator, self.config)
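A sketch of how the rule loader is typically driven. It assumes `config` is a `Configuration` whose tokenizer configuration file provides the normalization, transliteration, sanitizers and token-analysis sections, and that the module is importable as `nominatim.tokenizer.icu_rule_loader` (an assumption based on the relative imports above).

```
# Sketch only - not part of the commit; the import path is an assumption.
from nominatim_core.config import Configuration
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader


def build_from_config(config: Configuration):
    loader = ICURuleLoader(config)

    # Combined normalization + transliteration rules used at query time.
    search_rules = loader.get_search_rules()

    # Sanitizer pipeline from the optional 'sanitizers' section and the
    # per-id token analyzers from the 'token-analysis' section.
    sanitizer = loader.make_sanitizer()
    analysis = loader.make_token_analysis()

    return search_rules, sanitizer, analysis
```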


@@ -0,0 +1,43 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Container class collecting all components required to transform an OSM name
into a Nominatim token.
"""
from typing import Mapping, Optional, TYPE_CHECKING
from icu import Transliterator
from .token_analysis.base import Analyzer
if TYPE_CHECKING:
from typing import Any
from .icu_rule_loader import TokenAnalyzerRule # pylint: disable=cyclic-import
class ICUTokenAnalysis:
""" Container class collecting the transliterators and token analysis
modules for a single Analyser instance.
"""
def __init__(self, norm_rules: str, trans_rules: str,
analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule']):
self.normalizer = Transliterator.createFromRules("icu_normalization",
norm_rules)
trans_rules += ";[:Space:]+ > ' '"
self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
trans_rules)
self.search = Transliterator.createFromRules("icu_search",
norm_rules + trans_rules)
self.analysis = {name: arules.create(self.normalizer, self.to_ascii)
for name, arules in analysis_rules.items()}
def get_analyzer(self, name: Optional[str]) -> Analyzer:
""" Return the given named analyzer. If no analyzer with that
name exists, return the default analyzer.
"""
return self.analysis.get(name) or self.analysis[None]
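The three transliterator stages set up in the constructor can be exercised directly. A minimal sketch, assuming `ta` is an ICUTokenAnalysis instance created via `ICURuleLoader.make_token_analysis()`:

```
# Sketch only - illustrates the transliterator stages built above.
def show_stages(ta, name: str):
    normalized = ta.normalizer.transliterate(name)       # normalization rules only
    ascii_form = ta.to_ascii.transliterate(normalized)   # transliteration, whitespace runs collapsed
    search_form = ta.search.transliterate(name)          # both rule sets combined
    # Named analyzers fall back to the default one registered under None.
    default_analyzer = ta.get_analyzer(None)
    return normalized, ascii_form, search_form, default_analyzer
```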


@@ -0,0 +1,952 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, cast, \
Dict, Set, Iterable
import itertools
import json
import logging
from pathlib import Path
from textwrap import dedent
from nominatim_core.db.connection import connect, Connection, Cursor
from nominatim_core.config import Configuration
from nominatim_core.db.utils import CopyBuffer
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from ..data.place_name import PlaceName
from .icu_rule_loader import ICURuleLoader
from .place_sanitizer import PlaceSanitizer
from .icu_token_analysis import ICUTokenAnalysis
from .base import AbstractAnalyzer, AbstractTokenizer
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
LOG = logging.getLogger()
WORD_TYPES = (('country_names', 'C'),
('postcodes', 'P'),
('full_word', 'W'),
('housenumbers', 'H'))
def create(dsn: str, data_dir: Path) -> 'ICUTokenizer':
""" Create a new instance of the tokenizer provided by this module.
"""
return ICUTokenizer(dsn, data_dir)
class ICUTokenizer(AbstractTokenizer):
""" This tokenizer uses libICU to convert names and queries to ASCII.
Otherwise it uses the same algorithms and data structures as the
normalization routines in Nominatim 3.
"""
def __init__(self, dsn: str, data_dir: Path) -> None:
self.dsn = dsn
self.data_dir = data_dir
self.loader: Optional[ICURuleLoader] = None
def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
""" Set up a new tokenizer for the database.
This copies all necessary data into the project directory to make
sure the tokenizer remains stable even over updates.
"""
self.loader = ICURuleLoader(config)
self._install_php(config.lib_dir.php, overwrite=True)
self._save_config()
if init_db:
self.update_sql_functions(config)
self._setup_db_tables(config)
self._create_base_indices(config, 'word')
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from the project directory.
"""
self.loader = ICURuleLoader(config)
with connect(self.dsn) as conn:
self.loader.load_config_from_db(conn)
self._install_php(config.lib_dir.php, overwrite=False)
def finalize_import(self, config: Configuration) -> None:
""" Do any required postprocessing to make the tokenizer data ready
for use.
"""
self._create_lookup_indices(config, 'word')
def update_sql_functions(self, config: Configuration) -> None:
""" Reimport the SQL functions for this tokenizer.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
def check_database(self, config: Configuration) -> None:
""" Check that the tokenizer is set up correctly.
"""
# Will throw an error if there is an issue.
self.init_from_project(config)
def update_statistics(self, config: Configuration, threads: int = 2) -> None:
""" Recompute frequencies for all name words.
"""
with connect(self.dsn) as conn:
if not conn.table_exists('search_name'):
return
with conn.cursor() as cur:
cur.execute('ANALYSE search_name')
if threads > 1:
cur.execute('SET max_parallel_workers_per_gather TO %s',
(min(threads, 6),))
if conn.server_version_tuple() < (12, 0):
LOG.info('Computing word frequencies')
cur.drop_table('word_frequencies')
cur.drop_table('addressword_frequencies')
cur.execute("""CREATE TEMP TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute('CREATE INDEX ON word_frequencies(id)')
cur.execute("""CREATE TEMP TABLE addressword_frequencies AS
SELECT unnest(nameaddress_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute('CREATE INDEX ON addressword_frequencies(id)')
cur.execute("""CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER,
INOUT info JSONB)
AS $$
DECLARE rec RECORD;
BEGIN
IF info is null THEN
info = '{}'::jsonb;
END IF;
FOR rec IN SELECT count FROM word_frequencies WHERE id = wid
LOOP
info = info || jsonb_build_object('count', rec.count);
END LOOP;
FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid
LOOP
info = info || jsonb_build_object('addr_count', rec.count);
END LOOP;
IF info = '{}'::jsonb THEN
info = null;
END IF;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
""")
LOG.info('Update word table with recomputed frequencies')
cur.drop_table('tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
word_freq_update(word_id, info) as info
FROM word
""")
cur.drop_table('word_frequencies')
cur.drop_table('addressword_frequencies')
else:
LOG.info('Computing word frequencies')
cur.drop_table('word_frequencies')
cur.execute("""
CREATE TEMP TABLE word_frequencies AS
WITH word_freq AS MATERIALIZED (
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id),
addr_freq AS MATERIALIZED (
SELECT unnest(nameaddress_vector) as id, count(*)
FROM search_name GROUP BY id)
SELECT coalesce(a.id, w.id) as id,
(CASE WHEN w.count is null THEN '{}'::JSONB
ELSE jsonb_build_object('count', w.count) END
||
CASE WHEN a.count is null THEN '{}'::JSONB
ELSE jsonb_build_object('addr_count', a.count) END) as info
FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id;
""")
cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)')
cur.execute('ANALYSE word_frequencies')
LOG.info('Update word table with recomputed frequencies')
cur.drop_table('tmp_word')
cur.execute("""CREATE TABLE tmp_word AS
SELECT word_id, word_token, type, word,
(CASE WHEN wf.info is null THEN word.info
ELSE coalesce(word.info, '{}'::jsonb) || wf.info
END) as info
FROM word LEFT JOIN word_frequencies wf
ON word.word_id = wf.id
""")
cur.drop_table('word_frequencies')
with conn.cursor() as cur:
cur.execute('SET max_parallel_workers_per_gather TO 0')
sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn,
'GRANT SELECT ON tmp_word TO "{{config.DATABASE_WEBUSER}}"')
conn.commit()
self._create_base_indices(config, 'tmp_word')
self._create_lookup_indices(config, 'tmp_word')
self._move_temporary_word_table('tmp_word')
def _cleanup_housenumbers(self) -> None:
""" Remove unused house numbers.
"""
with connect(self.dsn) as conn:
if not conn.table_exists('search_name'):
return
with conn.cursor(name="hnr_counter") as cur:
cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
FROM word
WHERE type = 'H'
AND NOT EXISTS(SELECT * FROM search_name
WHERE ARRAY[word.word_id] && name_vector)
AND (char_length(coalesce(word, word_token)) > 6
OR coalesce(word, word_token) not similar to '\\d+')
""")
candidates = {token: wid for wid, token in cur}
with conn.cursor(name="hnr_counter") as cur:
cur.execute("""SELECT housenumber FROM placex
WHERE housenumber is not null
AND (char_length(housenumber) > 6
OR housenumber not similar to '\\d+')
""")
for row in cur:
for hnr in row[0].split(';'):
candidates.pop(hnr, None)
LOG.info("There are %s outdated housenumbers.", len(candidates))
LOG.debug("Outdated housenumbers: %s", candidates.keys())
if candidates:
with conn.cursor() as cur:
cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
(list(candidates.values()), ))
conn.commit()
def update_word_tokens(self) -> None:
""" Remove unused tokens.
"""
LOG.warning("Cleaning up housenumber tokens.")
self._cleanup_housenumbers()
LOG.warning("Tokenizer house-keeping done.")
def name_analyzer(self) -> 'ICUNameAnalyzer':
""" Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should
be used accordingly:
```
with tokenizer.name_analyzer() as analyzer:
analyzer.tokenize()
```
When used outside the with construct, the caller must ensure that
close() is called before the analyzer is destroyed.
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
assert self.loader is not None
return ICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
self.loader.make_token_analysis())
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
""" Return a list of the `num` most frequent full words
in the database.
"""
with conn.cursor() as cur:
cur.execute("""SELECT word, sum((info->>'count')::int) as count
FROM word WHERE type = 'W'
GROUP BY word
ORDER BY count DESC LIMIT %s""", (num,))
return list(s[0].split('@')[0] for s in cur)
def _install_php(self, phpdir: Optional[Path], overwrite: bool = True) -> None:
""" Install the php script for the tokenizer.
"""
if phpdir is not None:
assert self.loader is not None
php_file = self.data_dir / "tokenizer.php"
if not php_file.exists() or overwrite:
php_file.write_text(dedent(f"""\
<?php
@define('CONST_Max_Word_Frequency', 10000000);
@define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
@define('CONST_Transliteration', "{self.loader.get_search_rules()}");
require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""), encoding='utf-8')
def _save_config(self) -> None:
""" Save the configuration that needs to remain stable for the given
database as database properties.
"""
assert self.loader is not None
with connect(self.dsn) as conn:
self.loader.save_config_to_db(conn)
def _setup_db_tables(self, config: Configuration) -> None:
""" Set up the word table and fill it with pre-computed word
frequencies.
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
cur.drop_table('word')
sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn, """
CREATE TABLE word (
word_id INTEGER,
word_token text NOT NULL,
type text NOT NULL,
word text,
info jsonb
) {{db.tablespace.search_data}};
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
DROP SEQUENCE IF EXISTS seq_word;
CREATE SEQUENCE seq_word start 1;
GRANT SELECT ON seq_word to "{{config.DATABASE_WEBUSER}}";
""")
conn.commit()
def _create_base_indices(self, config: Configuration, table_name: str) -> None:
""" Set up the word table and fill it with pre-computed word
frequencies.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_string(conn,
"""CREATE INDEX idx_{{table_name}}_word_token ON {{table_name}}
USING BTREE (word_token) {{db.tablespace.search_index}}""",
table_name=table_name)
for name, ctype in WORD_TYPES:
sqlp.run_string(conn,
"""CREATE INDEX idx_{{table_name}}_{{idx_name}} ON {{table_name}}
USING BTREE (word) {{db.tablespace.address_index}}
WHERE type = '{{column_type}}'
""",
table_name=table_name, idx_name=name,
column_type=ctype)
conn.commit()
def _create_lookup_indices(self, config: Configuration, table_name: str) -> None:
""" Create additional indexes used when running the API.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
# Index required for details lookup.
sqlp.run_string(conn, """
CREATE INDEX IF NOT EXISTS idx_{{table_name}}_word_id
ON {{table_name}} USING BTREE (word_id) {{db.tablespace.search_index}}
""",
table_name=table_name)
conn.commit()
def _move_temporary_word_table(self, old: str) -> None:
""" Rename all tables and indexes used by the tokenizer.
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
cur.drop_table('word')
cur.execute(f"ALTER TABLE {old} RENAME TO word")
for idx in ('word_token', 'word_id'):
cur.execute(f"""ALTER INDEX idx_{old}_{idx}
RENAME TO idx_word_{idx}""")
for name, _ in WORD_TYPES:
cur.execute(f"""ALTER INDEX idx_{old}_{name}
RENAME TO idx_word_{name}""")
conn.commit()
class ICUNameAnalyzer(AbstractAnalyzer):
""" The ICU analyzer uses the ICU library for splitting names.
Each instance opens a connection to the database to request the
normalization.
"""
def __init__(self, dsn: str, sanitizer: PlaceSanitizer,
token_analysis: ICUTokenAnalysis) -> None:
self.conn: Optional[Connection] = connect(dsn).connection
self.conn.autocommit = True
self.sanitizer = sanitizer
self.token_analysis = token_analysis
self._cache = _TokenCache()
def close(self) -> None:
""" Free all resources used by the analyzer.
"""
if self.conn:
self.conn.close()
self.conn = None
def _search_normalized(self, name: str) -> str:
""" Return the search token transliteration of the given name.
"""
return cast(str, self.token_analysis.search.transliterate(name)).strip()
def _normalized(self, name: str) -> str:
""" Return the normalized version of the given name with all
non-relevant information removed.
"""
return cast(str, self.token_analysis.normalizer.transliterate(name)).strip()
def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
""" Return token information for the given list of words.
If a word starts with # it is assumed to be a full name,
otherwise it is a partial name.
The function returns a list of tuples with
(original word, word token, word id).
The function is used for testing and debugging only
and does not need to be particularly efficient.
"""
assert self.conn is not None
full_tokens = {}
partial_tokens = {}
for word in words:
if word.startswith('#'):
full_tokens[word] = self._search_normalized(word[1:])
else:
partial_tokens[word] = self._search_normalized(word)
with self.conn.cursor() as cur:
cur.execute("""SELECT word_token, word_id
FROM word WHERE word_token = ANY(%s) and type = 'W'
""", (list(full_tokens.values()),))
full_ids = {r[0]: r[1] for r in cur}
cur.execute("""SELECT word_token, word_id
FROM word WHERE word_token = ANY(%s) and type = 'w'""",
(list(partial_tokens.values()),))
part_ids = {r[0]: r[1] for r in cur}
return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
+ [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
def normalize_postcode(self, postcode: str) -> str:
""" Convert the postcode to a standardized form.
This function must yield exactly the same result as the SQL function
'token_normalized_postcode()'.
"""
return postcode.strip().upper()
def update_postcodes_from_db(self) -> None:
""" Update postcode tokens in the word table from the location_postcode
table.
"""
assert self.conn is not None
analyzer = self.token_analysis.analysis.get('@postcode')
with self.conn.cursor() as cur:
# First get all postcode names currently in the word table.
cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
word_entries = set((entry[0] for entry in cur))
# Then compute the required postcode names from the postcode table.
needed_entries = set()
cur.execute("SELECT country_code, postcode FROM location_postcode")
for cc, postcode in cur:
info = PlaceInfo({'country_code': cc,
'class': 'place', 'type': 'postcode',
'address': {'postcode': postcode}})
address = self.sanitizer.process_names(info)[1]
for place in address:
if place.kind == 'postcode':
if analyzer is None:
postcode_name = place.name.strip().upper()
variant_base = None
else:
postcode_name = analyzer.get_canonical_id(place)
variant_base = place.get_attr("variant")
if variant_base:
needed_entries.add(f'{postcode_name}@{variant_base}')
else:
needed_entries.add(postcode_name)
break
# Now update the word table.
self._delete_unused_postcode_words(word_entries - needed_entries)
self._add_missing_postcode_words(needed_entries - word_entries)
def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
assert self.conn is not None
if tokens:
with self.conn.cursor() as cur:
cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
(list(tokens), ))
def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
assert self.conn is not None
if not tokens:
return
analyzer = self.token_analysis.analysis.get('@postcode')
terms = []
for postcode_name in tokens:
if '@' in postcode_name:
term, variant = postcode_name.split('@', 2)
term = self._search_normalized(term)
if analyzer is None:
variants = [term]
else:
variants = analyzer.compute_variants(variant)
if term not in variants:
variants.append(term)
else:
variants = [self._search_normalized(postcode_name)]
terms.append((postcode_name, variants))
if terms:
with self.conn.cursor() as cur:
cur.execute_values("""SELECT create_postcode_word(pc, var)
FROM (VALUES %s) AS v(pc, var)""",
terms)
def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None:
""" Replace the search index for special phrases with the new phrases.
If `should_replace` is True, then the previous set of phrases will be
completely replaced. Otherwise the phrases are added to the
already existing ones.
"""
assert self.conn is not None
norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
# Get the old phrases.
existing_phrases = set()
cur.execute("SELECT word, info FROM word WHERE type = 'S'")
for word, info in cur:
existing_phrases.add((word, info['class'], info['type'],
info.get('op') or '-'))
added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
if should_replace:
deleted = self._remove_special_phrases(cur, norm_phrases,
existing_phrases)
else:
deleted = 0
LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
len(norm_phrases), added, deleted)
def _add_special_phrases(self, cursor: Cursor,
new_phrases: Set[Tuple[str, str, str, str]],
existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
""" Add all phrases to the database that are not yet there.
"""
to_add = new_phrases - existing_phrases
added = 0
with CopyBuffer() as copystr:
for word, cls, typ, oper in to_add:
term = self._search_normalized(word)
if term:
copystr.add(term, 'S', word,
json.dumps({'class': cls, 'type': typ,
'op': oper if oper in ('in', 'near') else None}))
added += 1
copystr.copy_out(cursor, 'word',
columns=['word_token', 'type', 'word', 'info'])
return added
def _remove_special_phrases(self, cursor: Cursor,
new_phrases: Set[Tuple[str, str, str, str]],
existing_phrases: Set[Tuple[str, str, str, str]]) -> int:
""" Remove all phrases from the database that are no longer in the
new phrase list.
"""
to_delete = existing_phrases - new_phrases
if to_delete:
cursor.execute_values(
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
WHERE type = 'S' and word = name
and info->>'class' = in_class and info->>'type' = in_type
and ((op = '-' and info->>'op' is null) or op = info->>'op')
""", to_delete)
return len(to_delete)
def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
""" Add default names for the given country to the search index.
"""
# Make sure any name preprocessing for country names applies.
info = PlaceInfo({'name': names, 'country_code': country_code,
'rank_address': 4, 'class': 'boundary',
'type': 'administrative'})
self._add_country_full_names(country_code,
self.sanitizer.process_names(info)[0],
internal=True)
def _add_country_full_names(self, country_code: str, names: Sequence[PlaceName],
internal: bool = False) -> None:
""" Add names for the given country from an already sanitized
name list.
"""
assert self.conn is not None
word_tokens = set()
for name in names:
norm_name = self._search_normalized(name.name)
if norm_name:
word_tokens.add(norm_name)
with self.conn.cursor() as cur:
# Get existing names
cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
FROM word
WHERE type = 'C' and word = %s""",
(country_code, ))
# internal/external names
existing_tokens: Dict[bool, Set[str]] = {True: set(), False: set()}
for word in cur:
existing_tokens[word[1]].add(word[0])
# Delete names that no longer exist.
gone_tokens = existing_tokens[internal] - word_tokens
if internal:
gone_tokens.update(existing_tokens[False] & word_tokens)
if gone_tokens:
cur.execute("""DELETE FROM word
USING unnest(%s) as token
WHERE type = 'C' and word = %s
and word_token = token""",
(list(gone_tokens), country_code))
# Only add those names that are not yet in the list.
new_tokens = word_tokens - existing_tokens[True]
if not internal:
new_tokens -= existing_tokens[False]
if new_tokens:
if internal:
sql = """INSERT INTO word (word_token, type, word, info)
(SELECT token, 'C', %s, '{"internal": "yes"}'
FROM unnest(%s) as token)
"""
else:
sql = """INSERT INTO word (word_token, type, word)
(SELECT token, 'C', %s
FROM unnest(%s) as token)
"""
cur.execute(sql, (country_code, list(new_tokens)))
def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
""" Determine tokenizer information about the given place.
Returns a JSON-serializable structure that will be handed into
the database via the token_info field.
"""
token_info = _TokenInfo()
names, address = self.sanitizer.process_names(place)
if names:
token_info.set_names(*self._compute_name_tokens(names))
if place.is_country():
assert place.country_code is not None
self._add_country_full_names(place.country_code, names)
if address:
self._process_place_address(token_info, address)
return token_info.to_dict()
def _process_place_address(self, token_info: '_TokenInfo',
address: Sequence[PlaceName]) -> None:
for item in address:
if item.kind == 'postcode':
token_info.set_postcode(self._add_postcode(item))
elif item.kind == 'housenumber':
token_info.add_housenumber(*self._compute_housenumber_token(item))
elif item.kind == 'street':
token_info.add_street(self._retrieve_full_tokens(item.name))
elif item.kind == 'place':
if not item.suffix:
token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
elif not item.kind.startswith('_') and not item.suffix and \
item.kind not in ('country', 'full', 'inclusion'):
token_info.add_address_term(item.kind,
itertools.chain(*self._compute_name_tokens([item])))
def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
""" Normalize the housenumber and return the word token and the
canonical form.
"""
assert self.conn is not None
analyzer = self.token_analysis.analysis.get('@housenumber')
result: Tuple[Optional[int], Optional[str]] = (None, None)
if analyzer is None:
# When no custom analyzer is set, simply normalize and transliterate
norm_name = self._search_normalized(hnr.name)
if norm_name:
result = self._cache.housenumbers.get(norm_name, result)
if result[0] is None:
with self.conn.cursor() as cur:
hid = cur.scalar("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
result = hid, norm_name
self._cache.housenumbers[norm_name] = result
else:
# Otherwise use the analyzer to determine the canonical name.
# Per convention we use the first variant as the 'lookup name', the
# name that gets saved in the housenumber field of the place.
word_id = analyzer.get_canonical_id(hnr)
if word_id:
result = self._cache.housenumbers.get(word_id, result)
if result[0] is None:
variants = analyzer.compute_variants(word_id)
if variants:
with self.conn.cursor() as cur:
hid = cur.scalar("SELECT create_analyzed_hnr_id(%s, %s)",
(word_id, list(variants)))
result = hid, variants[0]
self._cache.housenumbers[word_id] = result
return result
def _retrieve_full_tokens(self, name: str) -> List[int]:
""" Get the full name token for the given name, if it exists.
The name is only retrieved for the standard analyser.
"""
assert self.conn is not None
norm_name = self._search_normalized(name)
# return cached if possible
if norm_name in self._cache.fulls:
return self._cache.fulls[norm_name]
with self.conn.cursor() as cur:
cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
(norm_name, ))
full = [row[0] for row in cur]
self._cache.fulls[norm_name] = full
return full
def _compute_name_tokens(self, names: Sequence[PlaceName]) -> Tuple[Set[int], Set[int]]:
""" Computes the full name and partial name tokens for the given
dictionary of names.
"""
assert self.conn is not None
full_tokens: Set[int] = set()
partial_tokens: Set[int] = set()
for name in names:
analyzer_id = name.get_attr('analyzer')
analyzer = self.token_analysis.get_analyzer(analyzer_id)
word_id = analyzer.get_canonical_id(name)
if analyzer_id is None:
token_id = word_id
else:
token_id = f'{word_id}@{analyzer_id}'
full, part = self._cache.names.get(token_id, (None, None))
if full is None:
variants = analyzer.compute_variants(word_id)
if not variants:
continue
with self.conn.cursor() as cur:
cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
(token_id, variants))
full, part = cast(Tuple[int, List[int]], cur.fetchone())
self._cache.names[token_id] = (full, part)
assert part is not None
full_tokens.add(full)
partial_tokens.update(part)
return full_tokens, partial_tokens
def _add_postcode(self, item: PlaceName) -> Optional[str]:
""" Make sure the normalized postcode is present in the word table.
"""
assert self.conn is not None
analyzer = self.token_analysis.analysis.get('@postcode')
if analyzer is None:
postcode_name = item.name.strip().upper()
variant_base = None
else:
postcode_name = analyzer.get_canonical_id(item)
variant_base = item.get_attr("variant")
if variant_base:
postcode = f'{postcode_name}@{variant_base}'
else:
postcode = postcode_name
if postcode not in self._cache.postcodes:
term = self._search_normalized(postcode_name)
if not term:
return None
variants = {term}
if analyzer is not None and variant_base:
variants.update(analyzer.compute_variants(variant_base))
with self.conn.cursor() as cur:
cur.execute("SELECT create_postcode_word(%s, %s)",
(postcode, list(variants)))
self._cache.postcodes.add(postcode)
return postcode_name
class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
def __init__(self) -> None:
self.names: Optional[str] = None
self.housenumbers: Set[str] = set()
self.housenumber_tokens: Set[int] = set()
self.street_tokens: Optional[Set[int]] = None
self.place_tokens: Set[int] = set()
self.address_tokens: Dict[str, str] = {}
self.postcode: Optional[str] = None
def _mk_array(self, tokens: Iterable[Any]) -> str:
return f"{{{','.join((str(s) for s in tokens))}}}"
def to_dict(self) -> Dict[str, Any]:
""" Return the token information in database importable format.
"""
out: Dict[str, Any] = {}
if self.names:
out['names'] = self.names
if self.housenumbers:
out['hnr'] = ';'.join(self.housenumbers)
out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
if self.street_tokens is not None:
out['street'] = self._mk_array(self.street_tokens)
if self.place_tokens:
out['place'] = self._mk_array(self.place_tokens)
if self.address_tokens:
out['addr'] = self.address_tokens
if self.postcode:
out['postcode'] = self.postcode
return out
def set_names(self, fulls: Iterable[int], partials: Iterable[int]) -> None:
""" Adds token information for the normalised names.
"""
self.names = self._mk_array(itertools.chain(fulls, partials))
def add_housenumber(self, token: Optional[int], hnr: Optional[str]) -> None:
""" Extract housenumber information from a list of normalised
housenumbers.
"""
if token:
assert hnr is not None
self.housenumbers.add(hnr)
self.housenumber_tokens.add(token)
def add_street(self, tokens: Iterable[int]) -> None:
""" Add addr:street match terms.
"""
if self.street_tokens is None:
self.street_tokens = set()
self.street_tokens.update(tokens)
def add_place(self, tokens: Iterable[int]) -> None:
""" Add addr:place search and match terms.
"""
self.place_tokens.update(tokens)
def add_address_term(self, key: str, partials: Iterable[int]) -> None:
""" Add additional address terms.
"""
array = self._mk_array(partials)
if len(array) > 2:
self.address_tokens[key] = array
def set_postcode(self, postcode: Optional[str]) -> None:
""" Set the postcode to the given one.
"""
self.postcode = postcode
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
This cache is not thread-safe and needs to be instantiated per
analyzer.
"""
def __init__(self) -> None:
self.names: Dict[str, Tuple[int, List[int]]] = {}
self.partials: Dict[str, int] = {}
self.fulls: Dict[str, List[int]] = {}
self.postcodes: Set[str] = set()
self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}
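To illustrate what `process_place()` produces, here is a small sketch (not part of the commit). It assumes a database already initialised with `ICUTokenizer.init_new_db()`; the import path for `PlaceInfo` and the concrete attribute values are assumptions based on the relative imports above.

```
# Sketch only - import path and place attributes are assumptions.
from nominatim.data.place_info import PlaceInfo


def tokenize_example(tokenizer) -> dict:
    place = PlaceInfo({'name': {'name': 'Main Street'},
                       'address': {'housenumber': '12', 'postcode': '12345'},
                       'country_code': 'us',
                       'class': 'highway', 'type': 'residential',
                       'rank_address': 26})
    with tokenizer.name_analyzer() as analyzer:
        # Returns a dict with keys such as 'names', 'hnr', 'hnr_tokens',
        # 'street', 'place', 'addr' and 'postcode', mirroring _TokenInfo.to_dict().
        return analyzer.process_place(place)
```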


@@ -0,0 +1,681 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tokenizer implementing normalisation as used before Nominatim 4.
"""
from typing import Optional, Sequence, List, Tuple, Mapping, Any, Callable, \
cast, Dict, Set, Iterable
from collections import OrderedDict
import logging
from pathlib import Path
import re
import shutil
from textwrap import dedent
from icu import Transliterator
import psycopg2
import psycopg2.extras
from nominatim_core.errors import UsageError
from nominatim_core.db.connection import connect, Connection
from nominatim_core.config import Configuration
from nominatim_core.db import properties
from nominatim_core.db import utils as db_utils
from nominatim_core.db.sql_preprocessor import SQLPreprocessor
from ..data.place_info import PlaceInfo
from .base import AbstractAnalyzer, AbstractTokenizer
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
LOG = logging.getLogger()
def create(dsn: str, data_dir: Path) -> 'LegacyTokenizer':
""" Create a new instance of the tokenizer provided by this module.
"""
return LegacyTokenizer(dsn, data_dir)
def _install_module(config_module_path: str, src_dir: Path, module_dir: Path) -> str:
""" Copies the PostgreSQL normalisation module into the project
directory if necessary. For historical reasons the module is
saved in the '/module' subdirectory and not with the other tokenizer
data.
The function detects when the installation is run from the
build directory. It doesn't touch the module in that case.
"""
# Custom module locations are simply used as is.
if config_module_path:
LOG.info("Using custom path for database module at '%s'", config_module_path)
return config_module_path
# Compatibility mode for builddir installations.
if module_dir.exists() and src_dir.samefile(module_dir):
LOG.info('Running from build directory. Leaving database module as is.')
return str(module_dir)
# In any other case install the module in the project directory.
if not module_dir.exists():
module_dir.mkdir()
destfile = module_dir / 'nominatim.so'
shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
destfile.chmod(0o755)
LOG.info('Database module installed at %s', str(destfile))
return str(module_dir)
def _check_module(module_dir: str, conn: Connection) -> None:
""" Try to use the PostgreSQL module to confirm that it is correctly
installed and accessible from PostgreSQL.
"""
with conn.cursor() as cur:
try:
cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
RETURNS text AS %s, 'transliteration'
LANGUAGE c IMMUTABLE STRICT;
DROP FUNCTION nominatim_test_import_func(text)
""", (f'{module_dir}/nominatim.so', ))
except psycopg2.DatabaseError as err:
LOG.fatal("Error accessing database module: %s", err)
raise UsageError("Database module cannot be accessed.") from err
class LegacyTokenizer(AbstractTokenizer):
""" The legacy tokenizer uses a special PostgreSQL module to normalize
names and queries. The tokenizer thus implements normalization through
calls to the database.
"""
def __init__(self, dsn: str, data_dir: Path) -> None:
self.dsn = dsn
self.data_dir = data_dir
self.normalization: Optional[str] = None
def init_new_db(self, config: Configuration, init_db: bool = True) -> None:
""" Set up a new tokenizer for the database.
This copies all necessary data into the project directory to make
sure the tokenizer remains stable even over updates.
"""
assert config.project_dir is not None
module_dir = _install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
self.normalization = config.TERM_NORMALIZATION
self._install_php(config, overwrite=True)
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn, config)
conn.commit()
if init_db:
self.update_sql_functions(config)
self._init_db_tables(config)
def init_from_project(self, config: Configuration) -> None:
""" Initialise the tokenizer from the project directory.
"""
assert config.project_dir is not None
with connect(self.dsn) as conn:
self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
if not (config.project_dir / 'module' / 'nominatim.so').exists():
_install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
self._install_php(config, overwrite=False)
def finalize_import(self, config: Configuration) -> None:
""" Do any required postprocessing to make the tokenizer data ready
for use.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
def update_sql_functions(self, config: Configuration) -> None:
""" Reimport the SQL functions for this tokenizer.
"""
assert config.project_dir is not None
with connect(self.dsn) as conn:
max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
modulepath = config.DATABASE_MODULE_PATH or \
str((config.project_dir / 'module').resolve())
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
max_word_freq=max_word_freq,
modulepath=modulepath)
def check_database(self, _: Configuration) -> Optional[str]:
""" Check that the tokenizer is set up correctly.
"""
hint = """\
The Postgresql extension nominatim.so was not correctly loaded.
Error: {error}
Hints:
* Check the output of the CMake/make installation step
* Does nominatim.so exist?
* Does nominatim.so exist on the database server?
* Can nominatim.so be accessed by the database user?
"""
with connect(self.dsn) as conn:
with conn.cursor() as cur:
try:
out = cur.scalar("SELECT make_standard_name('a')")
except psycopg2.Error as err:
return hint.format(error=str(err))
if out != 'a':
return hint.format(error='Unexpected result for make_standard_name()')
return None
def migrate_database(self, config: Configuration) -> None:
""" Initialise the project directory of an existing database for
use with this tokenizer.
This is a special migration function for updating existing databases
to new software versions.
"""
assert config.project_dir is not None
self.normalization = config.TERM_NORMALIZATION
module_dir = _install_module(config.DATABASE_MODULE_PATH,
config.lib_dir.module,
config.project_dir / 'module')
with connect(self.dsn) as conn:
_check_module(module_dir, conn)
self._save_config(conn, config)
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
""" Recompute the frequency of full words.
"""
with connect(self.dsn) as conn:
if conn.table_exists('search_name'):
with conn.cursor() as cur:
cur.drop_table("word_frequencies")
LOG.info("Computing word frequencies")
cur.execute("""CREATE TEMP TABLE word_frequencies AS
SELECT unnest(name_vector) as id, count(*)
FROM search_name GROUP BY id""")
cur.execute("CREATE INDEX ON word_frequencies(id)")
LOG.info("Update word table with recomputed frequencies")
cur.execute("""UPDATE word SET search_name_count = count
FROM word_frequencies
WHERE word_token like ' %' and word_id = id""")
cur.drop_table("word_frequencies")
conn.commit()
def update_word_tokens(self) -> None:
""" No house-keeping implemented for the legacy tokenizer.
"""
LOG.info("No tokenizer clean-up available.")
def name_analyzer(self) -> 'LegacyNameAnalyzer':
""" Create a new analyzer for tokenizing names and queries
using this tokenizer. Analyzers are context managers and should
be used accordingly:
```
with tokenizer.name_analyzer() as analyzer:
analyzer.tokenize()
```
When used outside the with construct, the caller must ensure that
close() is called before the analyzer is destroyed.
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
normalizer = Transliterator.createFromRules("phrase normalizer",
self.normalization)
return LegacyNameAnalyzer(self.dsn, normalizer)
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
""" Return a list of the `num` most frequent full words
in the database.
"""
with conn.cursor() as cur:
cur.execute(""" SELECT word FROM word WHERE word is not null
ORDER BY search_name_count DESC LIMIT %s""", (num,))
return list(s[0] for s in cur)
def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
""" Install the php script for the tokenizer.
"""
if config.lib_dir.php is not None:
php_file = self.data_dir / "tokenizer.php"
if not php_file.exists() or overwrite:
php_file.write_text(dedent(f"""\
<?php
@define('CONST_Max_Word_Frequency', {config.MAX_WORD_FREQUENCY});
@define('CONST_Term_Normalization_Rules', "{config.TERM_NORMALIZATION}");
require_once('{config.lib_dir.php}/tokenizer/legacy_tokenizer.php');
"""), encoding='utf-8')
def _init_db_tables(self, config: Configuration) -> None:
""" Set up the word table and fill it with pre-computed word
frequencies.
"""
with connect(self.dsn) as conn:
sqlp = SQLPreprocessor(conn, config)
sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
conn.commit()
LOG.warning("Precomputing word tokens")
db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
def _save_config(self, conn: Connection, config: Configuration) -> None:
""" Save the configuration that needs to remain stable for the given
database as database properties.
"""
assert self.normalization is not None
properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
class LegacyNameAnalyzer(AbstractAnalyzer):
""" The legacy analyzer uses the special Postgresql module for
splitting names.
Each instance opens a connection to the database to request the
normalization.
"""
def __init__(self, dsn: str, normalizer: Any):
self.conn: Optional[Connection] = connect(dsn).connection
self.conn.autocommit = True
self.normalizer = normalizer
psycopg2.extras.register_hstore(self.conn)
self._cache = _TokenCache(self.conn)
def close(self) -> None:
""" Free all resources used by the analyzer.
"""
if self.conn:
self.conn.close()
self.conn = None
def get_word_token_info(self, words: Sequence[str]) -> List[Tuple[str, str, int]]:
""" Return token information for the given list of words.
If a word starts with # it is assumed to be a full name,
otherwise it is a partial name.
The function returns a list of tuples with
(original word, word token, word id).
The function is used for testing and debugging only
and does not need to be particularly efficient.
"""
assert self.conn is not None
with self.conn.cursor() as cur:
cur.execute("""SELECT t.term, word_token, word_id
FROM word, (SELECT unnest(%s::TEXT[]) as term) t
WHERE word_token = (CASE
WHEN left(t.term, 1) = '#' THEN
' ' || make_standard_name(substring(t.term from 2))
ELSE
make_standard_name(t.term)
END)
and class is null and country_code is null""",
(words, ))
return [(r[0], r[1], r[2]) for r in cur]
def normalize(self, phrase: str) -> str:
""" Normalize the given phrase, i.e. remove all properties that
are irrelevant for search.
"""
return cast(str, self.normalizer.transliterate(phrase))
def normalize_postcode(self, postcode: str) -> str:
""" Convert the postcode to a standardized form.
This function must yield exactly the same result as the SQL function
'token_normalized_postcode()'.
"""
return postcode.strip().upper()
def update_postcodes_from_db(self) -> None:
""" Update postcode tokens in the word table from the location_postcode
table.
"""
assert self.conn is not None
with self.conn.cursor() as cur:
# This finds us the rows in location_postcode and word that are
# missing in the other table.
cur.execute("""SELECT * FROM
(SELECT pc, word FROM
(SELECT distinct(postcode) as pc FROM location_postcode) p
FULL JOIN
(SELECT word FROM word
WHERE class ='place' and type = 'postcode') w
ON pc = word) x
WHERE pc is null or word is null""")
to_delete = []
to_add = []
for postcode, word in cur:
if postcode is None:
to_delete.append(word)
else:
to_add.append(postcode)
if to_delete:
cur.execute("""DELETE FROM WORD
WHERE class ='place' and type = 'postcode'
and word = any(%s)
""", (to_delete, ))
if to_add:
cur.execute("""SELECT count(create_postcode_id(pc))
FROM unnest(%s) as pc
""", (to_add, ))
def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None:
""" Replace the search index for special phrases with the new phrases.
"""
assert self.conn is not None
norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
# Get the old phrases.
existing_phrases = set()
cur.execute("""SELECT word, class, type, operator FROM word
WHERE class != 'place'
OR (type != 'house' AND type != 'postcode')""")
for label, cls, typ, oper in cur:
existing_phrases.add((label, cls, typ, oper or '-'))
to_add = norm_phrases - existing_phrases
to_delete = existing_phrases - norm_phrases
if to_add:
cur.execute_values(
""" INSERT INTO word (word_id, word_token, word, class, type,
search_name_count, operator)
(SELECT nextval('seq_word'), ' ' || make_standard_name(name), name,
class, type, 0,
CASE WHEN op in ('in', 'near') THEN op ELSE null END
FROM (VALUES %s) as v(name, class, type, op))""",
to_add)
if to_delete and should_replace:
cur.execute_values(
""" DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
WHERE word = name and class = in_class and type = in_type
and ((op = '-' and operator is null) or op = operator)""",
to_delete)
LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
len(norm_phrases), len(to_add), len(to_delete))
def add_country_names(self, country_code: str, names: Mapping[str, str]) -> None:
""" Add names for the given country to the search index.
"""
assert self.conn is not None
with self.conn.cursor() as cur:
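# Add a country token for each standardized name that is not yet
# in the word table for this country.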
cur.execute(
"""INSERT INTO word (word_id, word_token, country_code)
(SELECT nextval('seq_word'), lookup_token, %s
FROM (SELECT DISTINCT ' ' || make_standard_name(n) as lookup_token
FROM unnest(%s)n) y
WHERE NOT EXISTS(SELECT * FROM word
WHERE word_token = lookup_token and country_code = %s))
""", (country_code, list(names.values()), country_code))
def process_place(self, place: PlaceInfo) -> Mapping[str, Any]:
""" Determine tokenizer information about the given place.
Returns a JSON-serialisable structure that will be handed into
the database via the token_info field.
"""
assert self.conn is not None
token_info = _TokenInfo(self._cache)
names = place.name
if names:
token_info.add_names(self.conn, names)
if place.is_country():
assert place.country_code is not None
self.add_country_names(place.country_code, names)
address = place.address
if address:
self._process_place_address(token_info, address)
return token_info.data
def _process_place_address(self, token_info: '_TokenInfo', address: Mapping[str, str]) -> None:
assert self.conn is not None
hnrs = []
addr_terms = []
for key, value in address.items():
if key == 'postcode':
# Make sure the normalized postcode is present in the word table.
if re.search(r'[:,;]', value) is None:
norm_pc = self.normalize_postcode(value)
token_info.set_postcode(norm_pc)
self._cache.add_postcode(self.conn, norm_pc)
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(value)
elif key == 'street':
token_info.add_street(self.conn, value)
elif key == 'place':
token_info.add_place(self.conn, value)
elif not key.startswith('_') \
and key not in ('country', 'full', 'inclusion'):
addr_terms.append((key, value))
if hnrs:
token_info.add_housenumbers(self.conn, hnrs)
if addr_terms:
token_info.add_address_terms(self.conn, addr_terms)
class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
def __init__(self, cache: '_TokenCache') -> None:
self.cache = cache
self.data: Dict[str, Any] = {}
def add_names(self, conn: Connection, names: Mapping[str, str]) -> None:
""" Add token information for the names of the place.
"""
with conn.cursor() as cur:
# Create the token IDs for all names.
self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
(names, ))
def add_housenumbers(self, conn: Connection, hnrs: Sequence[str]) -> None:
""" Extract housenumber information from the address.
"""
if len(hnrs) == 1:
token = self.cache.get_housenumber(hnrs[0])
if token is not None:
self.data['hnr_tokens'] = token
self.data['hnr'] = hnrs[0]
return
# split numbers if necessary
simple_list: List[str] = []
for hnr in hnrs:
simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
if len(simple_list) > 1:
simple_list = list(set(simple_list))
with conn.cursor() as cur:
cur.execute("SELECT * FROM create_housenumbers(%s)", (simple_list, ))
result = cur.fetchone()
assert result is not None
self.data['hnr_tokens'], self.data['hnr'] = result
def set_postcode(self, postcode: str) -> None:
""" Set or replace the postcode token with the given value.
"""
self.data['postcode'] = postcode
def add_street(self, conn: Connection, street: str) -> None:
""" Add addr:street match terms.
"""
def _get_street(name: str) -> Optional[str]:
with conn.cursor() as cur:
return cast(Optional[str],
cur.scalar("SELECT word_ids_from_name(%s)::text", (name, )))
tokens = self.cache.streets.get(street, _get_street)
self.data['street'] = tokens or '{}'
def add_place(self, conn: Connection, place: str) -> None:
""" Add addr:place search and match terms.
"""
def _get_place(name: str) -> Tuple[List[int], List[int]]:
with conn.cursor() as cur:
cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
word_ids_from_name(%s)::text""",
(name, name))
return cast(Tuple[List[int], List[int]], cur.fetchone())
self.data['place_search'], self.data['place_match'] = \
self.cache.places.get(place, _get_place)
def add_address_terms(self, conn: Connection, terms: Sequence[Tuple[str, str]]) -> None:
""" Add additional address terms.
"""
def _get_address_term(name: str) -> Tuple[List[int], List[int]]:
with conn.cursor() as cur:
cur.execute("""SELECT addr_ids_from_name(%s)::text,
word_ids_from_name(%s)::text""",
(name, name))
return cast(Tuple[List[int], List[int]], cur.fetchone())
tokens = {}
for key, value in terms:
items = self.cache.address_terms.get(value, _get_address_term)
if items[0] or items[1]:
tokens[key] = items
if tokens:
self.data['addr'] = tokens
class _LRU:
""" Least recently used cache that accepts a generator function to
produce the item when there is a cache miss.
"""
def __init__(self, maxsize: int = 128):
self.data: 'OrderedDict[str, Any]' = OrderedDict()
self.maxsize = maxsize
def get(self, key: str, generator: Callable[[str], Any]) -> Any:
""" Get the item with the given key from the cache. If nothing
is found in the cache, generate the value through the
generator function and store it in the cache.
"""
value = self.data.get(key)
if value is not None:
self.data.move_to_end(key)
else:
value = generator(key)
if len(self.data) >= self.maxsize:
self.data.popitem(last=False)
self.data[key] = value
return value
class _TokenCache:
""" Cache for token information to avoid repeated database queries.
This cache is not thread-safe and needs to be instantiated per
analyzer.
"""
def __init__(self, conn: Connection):
# various LRU caches
self.streets = _LRU(maxsize=256)
self.places = _LRU(maxsize=128)
self.address_terms = _LRU(maxsize=1024)
# Look up housenumbers up to 100 and cache them
with conn.cursor() as cur:
cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
FROM generate_series(1, 100) as i""")
self._cached_housenumbers: Dict[str, str] = {str(r[0]): r[1] for r in cur}
# For postcodes remember the ones that have already been added
self.postcodes: Set[str] = set()
def get_housenumber(self, number: str) -> Optional[str]:
""" Get a housenumber token from the cache.
"""
return self._cached_housenumbers.get(number)
def add_postcode(self, conn: Connection, postcode: str) -> None:
""" Make sure the given postcode is in the database.
"""
if postcode not in self.postcodes:
with conn.cursor() as cur:
cur.execute('SELECT create_postcode_id(%s)', (postcode, ))
self.postcodes.add(postcode)

View File

@@ -0,0 +1,53 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Handler for cleaning name and address tags in place information before it
is handed to the token analysis.
"""
from typing import Optional, List, Mapping, Sequence, Callable, Any, Tuple
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from .sanitizers.config import SanitizerConfig
from .sanitizers.base import SanitizerHandler, ProcessInfo
from ..data.place_name import PlaceName
from ..data.place_info import PlaceInfo
class PlaceSanitizer:
""" Controller class which applies sanitizer functions on the place
names and address before they are used by the token analysers.
"""
def __init__(self, rules: Optional[Sequence[Mapping[str, Any]]],
config: Configuration) -> None:
self.handlers: List[Callable[[ProcessInfo], None]] = []
if rules:
for func in rules:
if 'step' not in func:
raise UsageError("Sanitizer rule is missing the 'step' attribute.")
if not isinstance(func['step'], str):
raise UsageError("'step' attribute must be a simple string.")
module: SanitizerHandler = \
config.load_plugin_module(func['step'], 'nominatim.tokenizer.sanitizers')
self.handlers.append(module.create(SanitizerConfig(func)))
def process_names(self, place: PlaceInfo) -> Tuple[List[PlaceName], List[PlaceName]]:
""" Extract a sanitized list of names and address parts from the
given place. The function returns a tuple
(list of names, list of address names)
"""
obj = ProcessInfo(place)
for func in self.handlers:
func(obj)
return obj.names, obj.address

View File

@@ -0,0 +1,64 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Common data types and protocols for sanitizers.
"""
from typing import Optional, List, Mapping, Callable
from nominatim_core.typing import Protocol, Final
from ...data.place_info import PlaceInfo
from ...data.place_name import PlaceName
from .config import SanitizerConfig
class ProcessInfo:
""" Container class for information handed into to handler functions.
The 'names' and 'address' members are mutable. A handler must change
them by either modifying the lists place or replacing the old content
with a new list.
"""
def __init__(self, place: PlaceInfo):
self.place: Final = place
self.names = self._convert_name_dict(place.name)
self.address = self._convert_name_dict(place.address)
@staticmethod
def _convert_name_dict(names: Optional[Mapping[str, str]]) -> List[PlaceName]:
""" Convert a dictionary of names into a list of PlaceNames.
The dictionary key is split into the primary part of the key
and the suffix (the part after an optional colon).
"""
out = []
if names:
for key, value in names.items():
parts = key.split(':', 1)
out.append(PlaceName(value.strip(),
parts[0].strip(),
parts[1].strip() if len(parts) > 1 else None))
return out
class SanitizerHandler(Protocol):
""" Protocol for sanitizer modules.
"""
def create(self, config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
"""
Create a function for sanitizing a place.
Arguments:
config: A dictionary with the additional configuration options
specified in the tokenizer configuration
Returns:
The result must be a callable that takes a place description
and transforms name and address as required.
"""

View File

@@ -0,0 +1,80 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that preprocesses address tags for house numbers. The sanitizer
allows one to
* define which tags are to be considered house numbers (see 'filter-kind')
* split house number lists into individual numbers (see 'delimiters')
Arguments:
delimiters: Define the set of characters to be used for
splitting a list of house numbers into parts. (default: ',;')
filter-kind: Define the address tags that are considered to be a
house number. Either takes a single string or a list of strings,
where each string is a regular expression. An address item
is considered a house number if the 'kind' fully matches any
of the given regular expressions. (default: 'housenumber')
convert-to-name: Define house numbers that should be treated as a name
instead of a house number. Either takes a single string
or a list of strings, where each string is a regular
expression that must match the full house number value.
"""
from typing import Callable, Iterator, List
from ...data.place_name import PlaceName
from .base import ProcessInfo
from .config import SanitizerConfig
class _HousenumberSanitizer:
def __init__(self, config: SanitizerConfig) -> None:
self.filter_kind = config.get_filter('filter-kind', ['housenumber'])
self.split_regexp = config.get_delimiter()
self.filter_name = config.get_filter('convert-to-name', 'FAIL_ALL')
def __call__(self, obj: ProcessInfo) -> None:
if not obj.address:
return
new_address: List[PlaceName] = []
for item in obj.address:
if self.filter_kind(item.kind):
if self.filter_name(item.name):
obj.names.append(item.clone(kind='housenumber'))
else:
new_address.extend(item.clone(kind='housenumber', name=n)
for n in self.sanitize(item.name))
else:
# Don't touch other address items.
new_address.append(item)
obj.address = new_address
def sanitize(self, value: str) -> Iterator[str]:
""" Extract housenumbers in a regularized format from an OSM value.
The function works as a generator that yields all valid housenumbers
that can be created from the value.
"""
for hnr in self.split_regexp.split(value):
if hnr:
yield from self._regularize(hnr)
def _regularize(self, hnr: str) -> Iterator[str]:
yield hnr
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a housenumber processing function.
"""
return _HousenumberSanitizer(config)

View File

@@ -0,0 +1,80 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that filters postcodes by their officially allowed pattern.
Arguments:
convert-to-address: If set to 'yes' (the default), then postcodes that do
not conform with their country-specific pattern are
converted to an address component. That means that
the postcode does not take part when computing the
postcode centroids of a country but is still searchable.
When set to 'no', non-conforming postcodes are not
searchable either.
default-pattern: Pattern to use, when there is none available for the
country in question. Warning: will not be used for
objects that have no country assigned. These are always
assumed to have no postcode.
"""
from typing import Callable, Optional, Tuple
from ...data.postcode_format import PostcodeFormatter
from .base import ProcessInfo
from .config import SanitizerConfig
class _PostcodeSanitizer:
def __init__(self, config: SanitizerConfig) -> None:
self.convert_to_address = config.get_bool('convert-to-address', True)
self.matcher = PostcodeFormatter()
default_pattern = config.get('default-pattern')
if default_pattern is not None and isinstance(default_pattern, str):
self.matcher.set_default_pattern(default_pattern)
def __call__(self, obj: ProcessInfo) -> None:
if not obj.address:
return
postcodes = ((i, o) for i, o in enumerate(obj.address) if o.kind == 'postcode')
for pos, postcode in postcodes:
formatted = self.scan(postcode.name, obj.place.country_code)
if formatted is None:
if self.convert_to_address:
postcode.kind = 'unofficial_postcode'
else:
obj.address.pop(pos)
else:
postcode.name = formatted[0]
postcode.set_attr('variant', formatted[1])
def scan(self, postcode: str, country: Optional[str]) -> Optional[Tuple[str, str]]:
""" Check the postcode for correct formatting and return the
normalized version. Returns None if the postcode does not
correspond to the official format of the given country.
"""
match = self.matcher.match(country, postcode)
if match is None:
return None
assert country is not None
return self.matcher.normalize(country, match),\
' '.join(filter(lambda p: p is not None, match.groups()))
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a function that filters postcodes by their officially allowed pattern.
"""
return _PostcodeSanitizer(config)

View File

@@ -0,0 +1,46 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that preprocesses tags from the TIGER import.
It makes the following changes:
* remove state reference from tiger:county
"""
from typing import Callable
import re
from .base import ProcessInfo
from .config import SanitizerConfig
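# Matches a county name followed by a two-letter state code, e.g. 'Hamilton, AL'.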
COUNTY_MATCH = re.compile('(.*), [A-Z][A-Z]')
def _clean_tiger_county(obj: ProcessInfo) -> None:
""" Remove the state reference from tiger:county tags.
This transforms a name like 'Hamilton, AL' into 'Hamilton'.
If no state reference is detected at the end, the name is left as is.
"""
if not obj.address:
return
for item in obj.address:
if item.kind == 'tiger' and item.suffix == 'county':
m = COUNTY_MATCH.fullmatch(item.name)
if m:
item.name = m[1]
# Switch kind and suffix, the split left them reversed.
item.kind = 'county'
item.suffix = 'tiger'
return
def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a function that preprocesses tags from the TIGER import.
"""
return _clean_tiger_county

View File

@@ -0,0 +1,151 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Configuration for Sanitizers.
"""
from typing import Sequence, Union, Optional, Pattern, Callable, Any, TYPE_CHECKING
from collections import UserDict
import re
from nominatim_core.errors import UsageError
# working around missing generics in Python < 3.8
# See https://github.com/python/typing/issues/60#issuecomment-869757075
if TYPE_CHECKING:
_BaseUserDict = UserDict[str, Any]
else:
_BaseUserDict = UserDict
class SanitizerConfig(_BaseUserDict):
""" The `SanitizerConfig` class is a read-only dictionary
with configuration options for the sanitizer.
In addition to the usual dictionary functions, the class provides
accessors to standard sanitizer options that are used by many of the
sanitizers.
"""
def get_string_list(self, param: str, default: Sequence[str] = tuple()) -> Sequence[str]:
""" Extract a configuration parameter as a string list.
Arguments:
param: Name of the configuration parameter.
default: Takes a tuple or list of strings which will
be returned if the parameter is missing in the
sanitizer configuration.
Note that if this default parameter is not
provided then an empty list is returned.
Returns:
If the parameter value is a simple string, it is returned as a
one-item list. If the parameter value does not exist, the given
default is returned. If the parameter value is a list, it is
checked to contain only strings before being returned.
"""
values = self.data.get(param, None)
if values is None:
return list(default)
if isinstance(values, str):
return [values] if values else []
if not isinstance(values, (list, tuple)):
raise UsageError(f"Parameter '{param}' must be string or list of strings.")
if any(not isinstance(value, str) for value in values):
raise UsageError(f"Parameter '{param}' must be string or list of strings.")
return values
def get_bool(self, param: str, default: Optional[bool] = None) -> bool:
""" Extract a configuration parameter as a boolean.
Arguments:
param: Name of the configuration parameter. The parameter must
contain one of the YAML boolean values or a
UsageError will be raised.
default: Value to return, when the parameter is missing.
When set to `None`, the parameter must be defined.
Returns:
Boolean value of the given parameter.
"""
value = self.data.get(param, default)
if not isinstance(value, bool):
raise UsageError(f"Parameter '{param}' must be a boolean value ('yes' or 'no').")
return value
def get_delimiter(self, default: str = ',;') -> Pattern[str]:
""" Return the 'delimiters' parameter in the configuration as a
compiled regular expression that can be used to split strings on
these delimiters.
Arguments:
default: Delimiters to be used when 'delimiters' parameter
is not explicitly configured.
Returns:
A regular expression pattern which can be used to
split a string. The regular expression makes sure that the
resulting names are stripped and that repeated delimiters
are ignored. It may still create empty fields on occasion. The
code needs to filter those.
"""
delimiter_set = set(self.data.get('delimiters', default))
if not delimiter_set:
raise UsageError("Empty 'delimiter' parameter not allowed for sanitizer.")
return re.compile('\\s*[{}]+\\s*'.format(''.join('\\' + d for d in delimiter_set)))
def get_filter(self, param: str, default: Union[str, Sequence[str]] = 'PASS_ALL'
) -> Callable[[str], bool]:
""" Returns a filter function for the given parameter of the sanitizer
configuration.
The value provided for the parameter in sanitizer configuration
should be a string or list of strings, where each string is a regular
expression. These regular expressions will later be used by the
filter function to filter strings.
Arguments:
param: The parameter for which the filter function
will be created.
default: Defines the behaviour of the filter function if the
parameter is missing in the sanitizer configuration.
Takes a string ('PASS_ALL' or 'FAIL_ALL') or a list of strings.
Any other string value or an empty list is not allowed
and raises a ValueError. If the value is 'PASS_ALL', the filter
function lets all strings pass; if the value is 'FAIL_ALL', the
filter function lets no strings pass.
If the value provided is a list of strings, each string
is treated as a regular expression which will be used by the
filter function.
By default the filter function lets all strings pass.
Returns:
A filter function that takes a target string as the argument and
returns True if it fully matches any of the regular expressions
otherwise returns False.
"""
filters = self.get_string_list(param) or default
if filters == 'PASS_ALL':
return lambda _: True
if filters == 'FAIL_ALL':
return lambda _: False
if filters and isinstance(filters, (list, tuple)):
regexes = [re.compile(regex) for regex in filters]
return lambda target: any(regex.fullmatch(target) for regex in regexes)
raise ValueError("Default parameter must be a non-empty list or a string value \
('PASS_ALL' or 'FAIL_ALL').")

View File

@@ -0,0 +1,128 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer which prevents certain tags from getting into the search index.
It removes tags which match all the properties given below.
Arguments:
type: Define which type of tags should be considered for removal.
There are two types of tags: 'name' and 'address' tags.
Takes a string 'name' or 'address'. (default: 'name')
filter-kind: Define which 'kind' of tags should be removed.
Takes a string or list of strings where each
string is a regular expression. A tag is considered
to be a candidate for removal if its 'kind' property
fully matches any of the given regular expressions.
Note that by default all 'kind' of tags are considered.
suffix: Define the 'suffix' property of the tags which should be
removed. Takes a string or list of strings where each
string is a regular expression. A tag is considered to be a
candidate for removal if its 'suffix' property fully
matches any of the given regular expressions. Note that by
default tags with any suffix value are considered including
those which don't have a suffix at all.
name: Define the 'name' property corresponding to the 'kind' property
of the tag. Takes a string or list of strings where each string
is a regular expression. A tag is considered to be a candidate
for removal if its name fully matches any of the given regular
expressions. Note that by default tags with any 'name' are
considered.
country_code: Define the country code of places whose tags should be
considered for removal. Takes a string or list of strings
where each string is a two-letter lower-case country code.
Note that by default tags of places with any country code
are considered including those which don't have a country
code at all.
rank_address: Define the address rank of places whose tags should be
considered for removal. Takes a string or list of strings
where each string is a number or a range of numbers of the
form <from>-<to>.
Note that default is '0-30', which means that tags of all
places are considered.
See https://nominatim.org/release-docs/latest/customize/Ranking/#address-rank
to learn more about address rank.
"""
from typing import Callable, List, Tuple, Sequence
from ...data.place_name import PlaceName
from .base import ProcessInfo
from .config import SanitizerConfig
class _TagSanitizer:
def __init__(self, config: SanitizerConfig) -> None:
self.type = config.get('type', 'name')
self.filter_kind = config.get_filter('filter-kind')
self.country_codes = config.get_string_list('country_code', [])
self.filter_suffix = config.get_filter('suffix')
self.filter_name = config.get_filter('name')
self.allowed_ranks = self._set_allowed_ranks(
config.get_string_list("rank_address", ["0-30"])
)
self.has_country_code = config.get('country_code', None) is not None
def __call__(self, obj: ProcessInfo) -> None:
tags = obj.names if self.type == 'name' else obj.address
if not tags \
or not self.allowed_ranks[obj.place.rank_address] \
or self.has_country_code \
and obj.place.country_code not in self.country_codes:
return
filtered_tags: List[PlaceName] = []
for tag in tags:
if not self.filter_kind(tag.kind) \
or not self.filter_suffix(tag.suffix or '') \
or not self.filter_name(tag.name):
filtered_tags.append(tag)
if self.type == 'name':
obj.names = filtered_tags
else:
obj.address = filtered_tags
def _set_allowed_ranks(self, ranks: Sequence[str]) -> Tuple[bool, ...]:
""" Returns a tuple of 31 boolean values corresponding to the
address ranks 0-30. Value at index 'i' is True if rank 'i'
is present in the ranks or lies in the range of any of the
ranks provided in the sanitizer configuration, otherwise
the value is False.
"""
allowed_ranks = [False] * 31
for rank in ranks:
intvl = [int(x) for x in rank.split('-')]
start, end = intvl[0], intvl[0] if len(intvl) == 1 else intvl[1]
for i in range(start, end + 1):
allowed_ranks[i] = True
return tuple(allowed_ranks)
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a function to process removal of certain tags.
"""
return _TagSanitizer(config)

View File

@@ -0,0 +1,39 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer that splits lists of names into their components.
Arguments:
delimiters: Define the set of characters to be used for
splitting the list. (default: ',;')
"""
from typing import Callable
from .base import ProcessInfo
from .config import SanitizerConfig
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a name processing function that splits name values with
multiple values into their components.
"""
regexp = config.get_delimiter()
def _process(obj: ProcessInfo) -> None:
if not obj.names:
return
new_names = []
for name in obj.names:
split_names = regexp.split(name.name)
if len(split_names) == 1:
new_names.append(name)
else:
new_names.extend(name.clone(name=n) for n in split_names if n)
obj.names = new_names
return _process

View File

@@ -0,0 +1,34 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
This sanitizer creates additional name variants for names that have
addendums in brackets (e.g. "Halle (Saale)"). The additional variant contains
only the main name part with the bracket part removed.
"""
from typing import Callable
from .base import ProcessInfo
from .config import SanitizerConfig
def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a name processing function that creates additional name variants
for bracket addendums.
"""
def _process(obj: ProcessInfo) -> None:
""" Add variants for names that have a bracket extension.
"""
if obj.names:
new_names = []
for name in (n for n in obj.names if '(' in n.name):
new_name = name.name.split('(')[0].strip()
if new_name:
new_names.append(name.clone(name=new_name))
obj.names.extend(new_names)
return _process

View File

@@ -0,0 +1,99 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
This sanitizer sets the `analyzer` property depending on the
language of the tag. The language is taken from the suffix of the name.
If a name already has an analyzer tagged, then this is kept.
Arguments:
filter-kind: Restrict the sanitizer to names of the given kinds.
The parameter expects a list of
regular expressions which are matched against 'kind'.
Note that a match against the full string is expected.
whitelist: Restrict the set of languages that should be tagged.
Expects a list of acceptable suffixes. When unset,
all 2- and 3-letter lower-case codes are accepted.
use-defaults: Configure what happens when the name has no suffix.
When set to 'all', a variant is created for
each of the default languages in the country
the feature is in. When set to 'mono', a variant is
only created, when exactly one language is spoken
in the country. The default is to do nothing with
the default languages of a country.
mode: Define how the variants are created and may be 'replace' or
'append'. When set to 'append' the original name (without
any analyzer tagged) is retained. (default: replace)
"""
from typing import Callable, Dict, Optional, List
from ...data import country_info
from .base import ProcessInfo
from .config import SanitizerConfig
class _AnalyzerByLanguage:
""" Processor for tagging the language of names in a place.
"""
def __init__(self, config: SanitizerConfig) -> None:
self.filter_kind = config.get_filter('filter-kind')
self.replace = config.get('mode', 'replace') != 'append'
self.whitelist = config.get('whitelist')
self._compute_default_languages(config.get('use-defaults', 'no'))
def _compute_default_languages(self, use_defaults: str) -> None:
self.deflangs: Dict[Optional[str], List[str]] = {}
if use_defaults in ('mono', 'all'):
for ccode, clangs in country_info.iterate('languages'):
if len(clangs) == 1 or use_defaults == 'all':
if self.whitelist:
self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
else:
self.deflangs[ccode] = clangs
def _suffix_matches(self, suffix: str) -> bool:
if self.whitelist is None:
return len(suffix) in (2, 3) and suffix.islower()
return suffix in self.whitelist
def __call__(self, obj: ProcessInfo) -> None:
if not obj.names:
return
more_names = []
for name in (n for n in obj.names
if not n.has_attr('analyzer') and self.filter_kind(n.kind)):
if name.suffix:
langs = [name.suffix] if self._suffix_matches(name.suffix) else None
else:
langs = self.deflangs.get(obj.place.country_code)
if langs:
if self.replace:
name.set_attr('analyzer', langs[0])
else:
more_names.append(name.clone(attr={'analyzer': langs[0]}))
more_names.extend(name.clone(attr={'analyzer': l}) for l in langs[1:])
obj.names.extend(more_names)
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
""" Create a function that sets the analyzer property depending on the
language of the tag.
"""
return _AnalyzerByLanguage(config)

View File

@@ -0,0 +1,117 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
This sanitizer maps OSM data to Japanese block addresses.
It combines block_number and housenumber into a single housenumber
and quarter and neighbourhood into place.
"""
from typing import Callable
from typing import List, Optional
from .base import ProcessInfo
from .config import SanitizerConfig
from ...data.place_name import PlaceName
def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
"""Set up the sanitizer
"""
return tag_japanese
def recombine_housenumber(
new_address: List[PlaceName],
tmp_housenumber: Optional[str],
tmp_blocknumber: Optional[str]
) -> List[PlaceName]:
""" Recombine the tag of housenumber by using housenumber and blocknumber
"""
if tmp_blocknumber and tmp_housenumber:
new_address.append(
PlaceName(
kind='housenumber',
name=f'{tmp_blocknumber}-{tmp_housenumber}',
suffix=''
)
)
elif tmp_blocknumber:
new_address.append(
PlaceName(
kind='housenumber',
name=tmp_blocknumber,
suffix=''
)
)
elif tmp_housenumber:
new_address.append(
PlaceName(
kind='housenumber',
name=tmp_housenumber,
suffix=''
)
)
return new_address
def recombine_place(
new_address: List[PlaceName],
tmp_neighbourhood: Optional[str],
tmp_quarter: Optional[str]
) -> List[PlaceName]:
""" Recombine the tag of place by using neighbourhood and quarter
"""
if tmp_neighbourhood and tmp_quarter:
new_address.append(
PlaceName(
kind='place',
name=f'{tmp_quarter}{tmp_neighbourhood}',
suffix=''
)
)
elif tmp_neighbourhood:
new_address.append(
PlaceName(
kind='place',
name=tmp_neighbourhood,
suffix=''
)
)
elif tmp_quarter:
new_address.append(
PlaceName(
kind='place',
name=tmp_quarter,
suffix=''
)
)
return new_address
def tag_japanese(obj: ProcessInfo) -> None:
"""Recombine kind of address
"""
if obj.place.country_code != 'jp':
return
tmp_housenumber = None
tmp_blocknumber = None
tmp_neighbourhood = None
tmp_quarter = None
new_address = []
for item in obj.address:
if item.kind == 'housenumber':
tmp_housenumber = item.name
elif item.kind == 'block_number':
tmp_blocknumber = item.name
elif item.kind == 'neighbourhood':
tmp_neighbourhood = item.name
elif item.kind == 'quarter':
tmp_quarter = item.name
else:
new_address.append(item)
new_address = recombine_housenumber(new_address, tmp_housenumber, tmp_blocknumber)
new_address = recombine_place(new_address, tmp_neighbourhood, tmp_quarter)
obj.address = [item for item in new_address if item.name is not None]

View File

@@ -0,0 +1,96 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Common data types and protocols for analysers.
"""
from typing import Mapping, List, Any
from nominatim_core.typing import Protocol
from ...data.place_name import PlaceName
class Analyzer(Protocol):
""" The `create()` function of an analysis module needs to return an
object that implements the following functions.
"""
def get_canonical_id(self, name: PlaceName) -> str:
""" Return the canonical form of the given name. The canonical ID must
be unique (the same ID must always yield the same variants) and
must be a form from which the variants can be derived.
Arguments:
name: Extended place name description as prepared by
the sanitizers.
Returns:
ID string with a canonical form of the name. The string may
be empty, when the analyzer cannot analyze the name at all,
for example because the character set in use does not match.
"""
def compute_variants(self, canonical_id: str) -> List[str]:
""" Compute the transliterated spelling variants for the given
canonical ID.
Arguments:
canonical_id: ID string previously computed with
`get_canonical_id()`.
Returns:
A list of possible spelling variants. All strings must have
been transformed with the global normalizer and
transliterator ICU rules. Otherwise they cannot be matched
against the input by the query frontend.
The list may be empty, when there are no useful
spelling variants. This may happen when an analyzer usually
only outputs additional variants to the canonical spelling
and there are no such variants for the given name.
"""
class AnalysisModule(Protocol):
""" The setup of the token analysis is split into two parts:
configuration and analyser factory. A token analysis module must
therefore implement the two functions here described.
"""
def configure(self, rules: Mapping[str, Any],
normalizer: Any, transliterator: Any) -> Any:
""" Prepare the configuration of the analysis module.
This function should prepare all data that can be shared
between instances of this analyser.
Arguments:
rules: A dictionary with the additional configuration options
as specified in the tokenizer configuration.
normalizer: an ICU Transliterator with the compiled
global normalization rules.
transliterator: an ICU Transliterator with the compiled
global transliteration rules.
Returns:
A data object with configuration data. This will be handed
as is into the `create()` function and may be
used freely by the analysis module as needed.
"""
def create(self, normalizer: Any, transliterator: Any, config: Any) -> Analyzer:
""" Create a new instance of the analyser.
A separate instance of the analyser is created for each thread
when used in multi-threading context.
Arguments:
normalizer: an ICU Transliterator with the compiled normalization
rules.
transliterator: an ICU Transliterator with the compiled
transliteration rules.
config: The object that was returned by the call to configure().
Returns:
A new analyzer instance. This must be an object that implements
the Analyzer protocol.
"""

View File

@@ -0,0 +1,139 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Parser for configuration for variants.
"""
from typing import Any, Iterator, Tuple, List, Optional, Set, NamedTuple
from collections import defaultdict
import itertools
import re
from nominatim_core.config import flatten_config_list
from nominatim_core.errors import UsageError
class ICUVariant(NamedTuple):
""" A single replacement rule for variant creation.
"""
source: str
replacement: str
def get_variant_config(in_rules: Any,
normalizer: Any) -> Tuple[List[Tuple[str, List[str]]], str]:
""" Convert the variant definition from the configuration into
replacement sets.
Returns a tuple containing the replacement set and the list of characters
used in the replacements.
"""
immediate = defaultdict(list)
chars: Set[str] = set()
if in_rules:
vset: Set[ICUVariant] = set()
rules = flatten_config_list(in_rules, 'variants')
vmaker = _VariantMaker(normalizer)
for section in rules:
for rule in (section.get('words') or []):
vset.update(vmaker.compute(rule))
# Intermediate reorder by source. Also compute required character set.
for variant in vset:
if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
replstr = variant.replacement[:-1]
else:
replstr = variant.replacement
immediate[variant.source].append(replstr)
chars.update(variant.source)
return list(immediate.items()), ''.join(chars)
class _VariantMaker:
""" Generator for all necessary ICUVariants from a single variant rule.
All text in rules is normalized to make sure the variants match later.
"""
def __init__(self, normalizer: Any) -> None:
self.norm = normalizer
def compute(self, rule: Any) -> Iterator[ICUVariant]:
""" Generator for all ICUVariant tuples from a single variant rule.
"""
parts = re.split(r'(\|)?([=-])>', rule)
if len(parts) != 4:
raise UsageError(f"Syntax error in variant rule: {rule}")
decompose = parts[1] is None
src_terms = [self._parse_variant_word(t) for t in parts[0].split(',')]
repl_terms = (self.norm.transliterate(t).strip() for t in parts[3].split(','))
# If the source should be kept, add a 1:1 replacement
if parts[2] == '-':
for src in src_terms:
if src:
for froms, tos in _create_variants(*src, src[0], decompose):
yield ICUVariant(froms, tos)
for src, repl in itertools.product(src_terms, repl_terms):
if src and repl:
for froms, tos in _create_variants(*src, repl, decompose):
yield ICUVariant(froms, tos)
def _parse_variant_word(self, name: str) -> Optional[Tuple[str, str, str]]:
name = name.strip()
match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
if match is None or (match.group(1) == '~' and match.group(3) == '~'):
raise UsageError(f"Invalid variant word descriptor '{name}'")
norm_name = self.norm.transliterate(match.group(2)).strip()
if not norm_name:
return None
return norm_name, match.group(1), match.group(3)
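# Map the anchoring flags of a variant word to the markers used in the trie
# lookup: '^' anchors the term at the start of the full name, '$' at the end,
# and no flag means an ordinary word boundary.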
_FLAG_MATCH = {'^': '^ ',
'$': ' ^',
'': ' '}
def _create_variants(src: str, preflag: str, postflag: str,
repl: str, decompose: bool) -> Iterator[Tuple[str, str]]:
if preflag == '~':
postfix = _FLAG_MATCH[postflag]
# suffix decomposition
src = src + postfix
repl = repl + postfix
yield src, repl
yield ' ' + src, ' ' + repl
if decompose:
yield src, ' ' + repl
yield ' ' + src, repl
elif postflag == '~':
# prefix decomposition
prefix = _FLAG_MATCH[preflag]
src = prefix + src
repl = prefix + repl
yield src, repl
yield src + ' ', repl + ' '
if decompose:
yield src, repl + ' '
yield src + ' ', repl
else:
prefix = _FLAG_MATCH[preflag]
postfix = _FLAG_MATCH[postflag]
yield prefix + src + postfix, prefix + repl + postfix

View File

@@ -0,0 +1,150 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Generic processor for names that creates abbreviation variants.
"""
from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
import itertools
import datrie
from nominatim_core.errors import UsageError
from ...data.place_name import PlaceName
from .config_variants import get_variant_config
from .generic_mutation import MutationVariantGenerator
### Configuration section
def configure(rules: Mapping[str, Any], normalizer: Any, _: Any) -> Dict[str, Any]:
""" Extract and preprocess the configuration for this module.
"""
config: Dict[str, Any] = {}
config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
normalizer)
config['variant_only'] = rules.get('mode', '') == 'variant-only'
# parse mutation rules
config['mutations'] = []
for rule in rules.get('mutations', []):
if 'pattern' not in rule:
raise UsageError("Missing field 'pattern' in mutation configuration.")
if not isinstance(rule['pattern'], str):
raise UsageError("Field 'pattern' in mutation configuration "
"must be a simple text field.")
if 'replacements' not in rule:
raise UsageError("Missing field 'replacements' in mutation configuration.")
if not isinstance(rule['replacements'], list):
raise UsageError("Field 'replacements' in mutation configuration "
"must be a list of texts.")
config['mutations'].append((rule['pattern'], rule['replacements']))
return config
### Analysis section
def create(normalizer: Any, transliterator: Any,
config: Mapping[str, Any]) -> 'GenericTokenAnalysis':
""" Create a new token analysis instance for this module.
"""
return GenericTokenAnalysis(normalizer, transliterator, config)
class GenericTokenAnalysis:
""" Collects the different transformation rules for normalisation of names
and provides the functions to apply the transformations.
"""
def __init__(self, norm: Any, to_ascii: Any, config: Mapping[str, Any]) -> None:
self.norm = norm
self.to_ascii = to_ascii
self.variant_only = config['variant_only']
# Set up datrie
if config['replacements']:
self.replacements = datrie.Trie(config['chars'])
for src, repllist in config['replacements']:
self.replacements[src] = repllist
else:
self.replacements = None
# set up mutation rules
self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
def get_canonical_id(self, name: PlaceName) -> str:
""" Return the normalized form of the name. This is the standard form
from which possible variants for the name can be derived.
"""
return cast(str, self.norm.transliterate(name.name)).strip()
def compute_variants(self, norm_name: str) -> List[str]:
""" Compute the spelling variants for the given normalized name
and transliterate the result.
"""
variants = self._generate_word_variants(norm_name)
for mutation in self.mutations:
variants = mutation.generate(variants)
return [name for name in self._transliterate_unique_list(norm_name, variants) if name]
def _transliterate_unique_list(self, norm_name: str,
iterable: Iterable[str]) -> Iterator[Optional[str]]:
seen = set()
if self.variant_only:
seen.add(norm_name)
for variant in map(str.strip, iterable):
if variant not in seen:
seen.add(variant)
yield self.to_ascii.transliterate(variant).strip()
def _generate_word_variants(self, norm_name: str) -> Iterable[str]:
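# Scan the name from left to right. Whenever a replacement term from the
# trie matches, fork the list of partial variants with all possible
# replacements; text between matches is copied over unchanged.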
baseform = '^ ' + norm_name + ' ^'
baselen = len(baseform)
partials = ['']
startpos = 0
if self.replacements is not None:
pos = 0
force_space = False
while pos < baselen:
full, repl = self.replacements.longest_prefix_item(baseform[pos:],
(None, None))
if full is not None:
done = baseform[startpos:pos]
partials = [v + done + r
for v, r in itertools.product(partials, repl)
if not force_space or r.startswith(' ')]
if len(partials) > 128:
# If too many variants are produced, they are unlikely
# to be helpful. Only use the original term.
startpos = 0
break
startpos = pos + len(full)
if full[-1] == ' ':
startpos -= 1
force_space = True
pos = startpos
else:
pos += 1
force_space = False
# No variants detected? Fast return.
if startpos == 0:
return (norm_name, )
if startpos < baselen:
return (part[1:] + baseform[startpos:-1] for part in partials)
return (part[1:-1] for part in partials)

View File

@@ -0,0 +1,57 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Creator for mutation variants for the generic token analysis.
"""
from typing import Sequence, Iterable, Iterator, Tuple
import itertools
import logging
import re
from nominatim_core.errors import UsageError
LOG = logging.getLogger()
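# Interleave the outer and inner sequences (outer[0], inner[0], outer[1],
# inner[1], ...), padding the shorter one with empty strings.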
def _zigzag(outer: Iterable[str], inner: Iterable[str]) -> Iterator[str]:
return itertools.chain.from_iterable(itertools.zip_longest(outer, inner, fillvalue=''))
class MutationVariantGenerator:
""" Generates name variants by applying a regular expression to the name
and replacing it with one or more variants. When the regular expression
matches more than once, each occurrence is replaced with all replacement
patterns.
"""
def __init__(self, pattern: str, replacements: Sequence[str]):
self.pattern = re.compile(pattern)
self.replacements = replacements
if self.pattern.groups > 0:
LOG.fatal("The mutation pattern %s contains a capturing group. "
"This is not allowed.", pattern)
raise UsageError("Bad mutation pattern in configuration.")
def generate(self, names: Iterable[str]) -> Iterator[str]:
""" Generator function for the name variants. 'names' is an iterable
over a set of names for which the variants are to be generated.
"""
for name in names:
parts = self.pattern.split(name)
if len(parts) == 1:
yield name
else:
for seps in self._fillers(len(parts)):
yield ''.join(_zigzag(parts, seps))
def _fillers(self, num_parts: int) -> Iterator[Tuple[str, ...]]:
""" Returns a generator for strings to join the given number of string
parts in all possible combinations.
"""
return itertools.product(self.replacements, repeat=num_parts - 1)

View File

@@ -0,0 +1,70 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Specialized processor for housenumbers. Analyses common housenumber patterns
and creates variants for them.
"""
from typing import Any, List, cast
import re
from ...data.place_name import PlaceName
from .generic_mutation import MutationVariantGenerator
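# RE_NON_DIGIT matches any non-digit character, RE_DIGIT_ALPHA and
# RE_ALPHA_DIGIT match transitions between digits and letters (the '␣'
# marker for optional spaces is excluded), RE_NAMED_PART detects longer
# alphabetic runs that indicate a named part of the housenumber.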
RE_NON_DIGIT = re.compile('[^0-9]')
RE_DIGIT_ALPHA = re.compile(r'(\d)\s*([^\d\s␣])')
RE_ALPHA_DIGIT = re.compile(r'([^\s\d␣])\s*(\d)')
RE_NAMED_PART = re.compile(r'[a-z]{4}')
### Configuration section
def configure(*_: Any) -> None:
""" All behaviour is currently hard-coded.
"""
return None
### Analysis section
def create(normalizer: Any, transliterator: Any, config: None) -> 'HousenumberTokenAnalysis': # pylint: disable=W0613
""" Create a new token analysis instance for this module.
"""
return HousenumberTokenAnalysis(normalizer, transliterator)
class HousenumberTokenAnalysis:
""" Detects common housenumber patterns and normalizes them.
"""
def __init__(self, norm: Any, trans: Any) -> None:
self.norm = norm
self.trans = trans
self.mutator = MutationVariantGenerator('␣', (' ', ''))
def get_canonical_id(self, name: PlaceName) -> str:
""" Return the normalized form of the housenumber.
"""
# shortcut for number-only numbers, which make up 90% of the data.
if RE_NON_DIGIT.search(name.name) is None:
return name.name
norm = cast(str, self.trans.transliterate(self.norm.transliterate(name.name)))
# If there is a significant non-numeric part, use as is.
if RE_NAMED_PART.search(norm) is None:
# Otherwise add optional spaces between digits and letters.
(norm_opt, cnt1) = RE_DIGIT_ALPHA.subn(r'\1␣\2', norm)
(norm_opt, cnt2) = RE_ALPHA_DIGIT.subn(r'\1␣\2', norm_opt)
# Avoid creating too many variants per number.
if cnt1 + cnt2 <= 4:
return norm_opt
return norm
def compute_variants(self, norm_name: str) -> List[str]:
""" Compute the spelling variants for the given normalized housenumber.
Generates variants for optional spaces (marked with '␣').
"""
return list(self.mutator.generate([norm_name]))

View File

@@ -0,0 +1,65 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Specialized processor for postcodes. Supports a 'lookup' variant of the
token, which produces variants with optional spaces.
"""
from typing import Any, List
from ...data.place_name import PlaceName
from .generic_mutation import MutationVariantGenerator
### Configuration section
def configure(*_: Any) -> None:
""" All behaviour is currently hard-coded.
"""
return None
### Analysis section
def create(normalizer: Any, transliterator: Any, config: None) -> 'PostcodeTokenAnalysis': # pylint: disable=W0613
""" Create a new token analysis instance for this module.
"""
return PostcodeTokenAnalysis(normalizer, transliterator)
class PostcodeTokenAnalysis:
""" Special normalization and variant generation for postcodes.
This analyser must not be used with anything but postcodes as
it follows some special rules: the canonical ID is the form that
is used for the output. `compute_variants` then needs to ensure that
the generated variants once more follow the standard normalization
and transliteration, so that postcodes are correctly recognised by
the search algorithm.
"""
def __init__(self, norm: Any, trans: Any) -> None:
self.norm = norm
self.trans = trans
self.mutator = MutationVariantGenerator(' ', (' ', ''))
def get_canonical_id(self, name: PlaceName) -> str:
""" Return the standard form of the postcode.
"""
return name.name.strip().upper()
def compute_variants(self, norm_name: str) -> List[str]:
""" Compute the spelling variants for the given normalized postcode.
Takes the canonical form of the postcode, normalizes it using the
standard rules and then creates variants of the result where
all spaces are optional.
"""
# Postcodes follow their own transliteration rules.
# Make sure at this point, that the terms are normalized in a way
# that they are searchable with the standard transliteration rules.
return [self.trans.transliterate(term) for term in
self.mutator.generate([self.norm.transliterate(norm_name)]) if term]