From 231250f2eb272b77d54e4b4b18bd85a80413ac34 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 29 Sep 2021 10:37:54 +0200 Subject: [PATCH 1/7] add wrapper class for place data passed to tokenizer This is mostly for convenience and documentation purposes. --- nominatim/indexer/place_info.py | 44 +++++++++++++++++++++++++ nominatim/indexer/runners.py | 8 +++-- nominatim/tokenizer/base.py | 12 ++----- nominatim/tokenizer/icu_tokenizer.py | 6 ++-- nominatim/tokenizer/legacy_tokenizer.py | 6 ++-- nominatim/tools/tiger_data.py | 5 ++- test/python/dummy_tokenizer.py | 2 ++ test/python/test_tokenizer_icu.py | 20 +++++++---- test/python/test_tokenizer_legacy.py | 15 +++++---- 9 files changed, 84 insertions(+), 34 deletions(-) create mode 100644 nominatim/indexer/place_info.py diff --git a/nominatim/indexer/place_info.py b/nominatim/indexer/place_info.py new file mode 100644 index 00000000..fd179fef --- /dev/null +++ b/nominatim/indexer/place_info.py @@ -0,0 +1,44 @@ +""" +Wrapper around place information the indexer gets from the database and hands to +the tokenizer. +""" + +import psycopg2.extras + +class PlaceInfo: + """ Data class containing all information the tokenizer gets about a + place it should process the names for. + """ + + def __init__(self, info): + self._info = info + + + def analyze(self, analyzer): + """ Process this place with the given tokenizer and return the + result in psycopg2-compatible Json. + """ + return psycopg2.extras.Json(analyzer.process_place(self)) + + + @property + def name(self): + """ A dictionary with the names of the place or None if the place + has no names. + """ + return self._info.get('name') + + + @property + def address(self): + """ A dictionary with the address elements of the place + or None if no address information is available. + """ + return self._info.get('address') + + + @property + def country_feature(self): + """ Return the country code if the place is a valid country boundary. + """ + return self._info.get('country_feature') diff --git a/nominatim/indexer/runners.py b/nominatim/indexer/runners.py index 29261ee5..43966419 100644 --- a/nominatim/indexer/runners.py +++ b/nominatim/indexer/runners.py @@ -4,14 +4,16 @@ tasks. """ import functools -import psycopg2.extras from psycopg2 import sql as pysql +from nominatim.indexer.place_info import PlaceInfo + # pylint: disable=C0111 def _mk_valuelist(template, num): return pysql.SQL(',').join([pysql.SQL(template)] * num) + class AbstractPlacexRunner: """ Returns SQL commands for indexing of the placex table. 
""" @@ -47,7 +49,7 @@ class AbstractPlacexRunner: for place in places: for field in ('place_id', 'name', 'address', 'linked_place_id'): values.append(place[field]) - values.append(psycopg2.extras.Json(self.analyzer.process_place(place))) + values.append(PlaceInfo(place).analyze(self.analyzer)) worker.perform(self._index_sql(len(places)), values) @@ -141,7 +143,7 @@ class InterpolationRunner: values = [] for place in places: values.extend((place[x] for x in ('place_id', 'address'))) - values.append(psycopg2.extras.Json(self.analyzer.process_place(place))) + values.append(PlaceInfo(place).analyze(self.analyzer)) worker.perform(self._index_sql(len(places)), values) diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py index 00ecae44..d827f813 100644 --- a/nominatim/tokenizer/base.py +++ b/nominatim/tokenizer/base.py @@ -6,6 +6,7 @@ from abc import ABC, abstractmethod from typing import List, Tuple, Dict, Any from nominatim.config import Configuration +from nominatim.indexer.place_info import PlaceInfo # pylint: disable=unnecessary-pass @@ -105,20 +106,13 @@ class AbstractAnalyzer(ABC): @abstractmethod - def process_place(self, place: Dict) -> Any: + def process_place(self, place: PlaceInfo) -> Any: """ Extract tokens for the given place and compute the information to be handed to the PL/pgSQL processor for building the search index. Arguments: - place: Dictionary with the information about the place. Currently - the following fields may be present: - - - *name* is a dictionary of names for the place together - with the designation of the name. - - *address* is a dictionary of address terms. - - *country_feature* is set to a country code when the - place describes a country. + place: Place information retrived from the database. Returns: A JSON-serialisable structure that will be handed into diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 5768fd35..81b07568 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -390,18 +390,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): """ token_info = _TokenInfo(self._cache) - names = place.get('name') + names = place.name if names: fulls, partials = self._compute_name_tokens(names) token_info.add_names(fulls, partials) - country_feature = place.get('country_feature') + country_feature = place.country_feature if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): self.add_country_names(country_feature.lower(), names) - address = place.get('address') + address = place.address if address: self._process_place_address(token_info, address) diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index 8957426b..8bfb309d 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -405,16 +405,16 @@ class LegacyNameAnalyzer(AbstractAnalyzer): """ token_info = _TokenInfo(self._cache) - names = place.get('name') + names = place.name if names: token_info.add_names(self.conn, names) - country_feature = place.get('country_feature') + country_feature = place.country_feature if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): self.add_country_names(country_feature.lower(), names) - address = place.get('address') + address = place.address if address: self._process_place_address(token_info, address) diff --git a/nominatim/tools/tiger_data.py b/nominatim/tools/tiger_data.py index ff498f77..19a12682 100644 --- a/nominatim/tools/tiger_data.py +++ 
b/nominatim/tools/tiger_data.py @@ -7,12 +7,11 @@ import logging import os import tarfile -import psycopg2.extras - from nominatim.db.connection import connect from nominatim.db.async_connection import WorkerPool from nominatim.db.sql_preprocessor import SQLPreprocessor from nominatim.errors import UsageError +from nominatim.indexer.place_info import PlaceInfo LOG = logging.getLogger() @@ -58,7 +57,7 @@ def handle_threaded_sql_statements(pool, fd, analyzer): address = dict(street=row['street'], postcode=row['postcode']) args = ('SRID=4326;' + row['geometry'], int(row['from']), int(row['to']), row['interpolation'], - psycopg2.extras.Json(analyzer.process_place(dict(address=address))), + PlaceInfo({'address': address}).analyze(analyzer), analyzer.normalize_postcode(row['postcode'])) except ValueError: continue diff --git a/test/python/dummy_tokenizer.py b/test/python/dummy_tokenizer.py index 69202bc3..db0f32cd 100644 --- a/test/python/dummy_tokenizer.py +++ b/test/python/dummy_tokenizer.py @@ -1,6 +1,7 @@ """ Tokenizer for testing. """ +from nominatim.indexer.place_info import PlaceInfo def create(dsn, data_dir): """ Create a new instance of the tokenizer provided by this module. @@ -68,4 +69,5 @@ class DummyNameAnalyzer: @staticmethod def process_place(place): + assert isinstance(place, PlaceInfo) return {} diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py index ed079269..28c6ef7a 100644 --- a/test/python/test_tokenizer_icu.py +++ b/test/python/test_tokenizer_icu.py @@ -11,6 +11,7 @@ from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules from nominatim.tokenizer.icu_rule_loader import ICURuleLoader from nominatim.db import properties from nominatim.db.sql_preprocessor import SQLPreprocessor +from nominatim.indexer.place_info import PlaceInfo from mock_icu_word_table import MockIcuWordTable @@ -322,30 +323,37 @@ class TestPlaceNames: assert eval(info['names']) == set((t[2] for t in tokens)) + def process_named_place(self, names, country_feature=None): + place = {'name': names} + if country_feature: + place['country_feature'] = country_feature + + return self.analyzer.process_place(PlaceInfo(place)) + + def test_simple_names(self): - info = self.analyzer.process_place({'name': {'name': 'Soft bAr', 'ref': '34'}}) + info = self.process_named_place({'name': 'Soft bAr', 'ref': '34'}) self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34') @pytest.mark.parametrize('sep', [',' , ';']) def test_names_with_separator(self, sep): - info = self.analyzer.process_place({'name': {'name': sep.join(('New York', 'Big Apple'))}}) + info = self.process_named_place({'name': sep.join(('New York', 'Big Apple'))}) self.expect_name_terms(info, '#New York', '#Big Apple', 'new', 'york', 'big', 'apple') def test_full_names_with_bracket(self): - info = self.analyzer.process_place({'name': {'name': 'Houseboat (left)'}}) + info = self.process_named_place({'name': 'Houseboat (left)'}) self.expect_name_terms(info, '#Houseboat (left)', '#Houseboat', 'houseboat', 'left') def test_country_name(self, word_table): - info = self.analyzer.process_place({'name': {'name': 'Norge'}, - 'country_feature': 'no'}) + info = self.process_named_place({'name': 'Norge'}, country_feature='no') self.expect_name_terms(info, '#norge', 'norge') assert word_table.get_country() == {('no', 'NORGE')} @@ -361,7 +369,7 @@ class TestPlaceAddress: def process_address(self, **kwargs): - return self.analyzer.process_place({'address': kwargs}) + return 
self.analyzer.process_place(PlaceInfo({'address': kwargs})) def name_token_set(self, *expected_terms): diff --git a/test/python/test_tokenizer_legacy.py b/test/python/test_tokenizer_legacy.py index 4dd3a141..2545c2db 100644 --- a/test/python/test_tokenizer_legacy.py +++ b/test/python/test_tokenizer_legacy.py @@ -5,6 +5,7 @@ import shutil import pytest +from nominatim.indexer.place_info import PlaceInfo from nominatim.tokenizer import legacy_tokenizer from nominatim.db import properties from nominatim.errors import UsageError @@ -284,21 +285,21 @@ def test_add_more_country_names(analyzer, word_table, make_standard_name): def test_process_place_names(analyzer, make_keywords): - info = analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}}) + info = analyzer.process_place(PlaceInfo({'name' : {'name' : 'Soft bAr', 'ref': '34'}})) assert info['names'] == '{1,2,3}' @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345']) def test_process_place_postcode(analyzer, create_postcode_id, word_table, pcode): - analyzer.process_place({'address': {'postcode' : pcode}}) + analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}})) assert word_table.get_postcodes() == {pcode, } @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836']) def test_process_place_bad_postcode(analyzer, create_postcode_id, word_table, pcode): - analyzer.process_place({'address': {'postcode' : pcode}}) + analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}})) assert not word_table.get_postcodes() @@ -319,7 +320,7 @@ class TestHousenumberName: @staticmethod @pytest.mark.parametrize('hnr', ['123a', '1', '101']) def test_process_place_housenumbers_simple(analyzer, hnr): - info = analyzer.process_place({'address': {'housenumber' : hnr}}) + info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : hnr}})) assert info['hnr'] == hnr assert info['hnr_tokens'].startswith("{") @@ -327,15 +328,15 @@ class TestHousenumberName: @staticmethod def test_process_place_housenumbers_lists(analyzer): - info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}}) + info = analyzer.process_place(PlaceInfo({'address': {'conscriptionnumber' : '1; 2;3'}})) assert set(info['hnr'].split(';')) == set(('1', '2', '3')) @staticmethod def test_process_place_housenumbers_duplicates(analyzer): - info = analyzer.process_place({'address': {'housenumber' : '134', + info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : '134', 'conscriptionnumber' : '134', - 'streetnumber' : '99a'}}) + 'streetnumber' : '99a'}})) assert set(info['hnr'].split(';')) == set(('134', '99a')) From be65c8303f18d0f92bbf5bc9558f8789d33f21d9 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 29 Sep 2021 11:54:14 +0200 Subject: [PATCH 2/7] export more data for the tokenizer name preparation Adds class, type, country and rank to the exported information and removes the rather odd hack for countries. Whether a place represents a country boundary can now be computed by the tokenizer. 
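As an illustration (not part of this patch itself), a tokenizer can now derive
the country check purely from the exported fields; the values below mirror the
updated test case:

    from nominatim.indexer.place_info import PlaceInfo

    place = PlaceInfo({'name': {'name': 'Norge'},
                       'country_code': 'no',
                       'rank_address': 4,
                       'class': 'boundary',
                       'type': 'administrative'})

    # is_country() checks the address rank, the boundary/administrative tagging
    # and the presence of a country code - no country_feature hack required.
    assert place.is_country() and place.country_code == 'no'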
--- lib-sql/functions/placex_triggers.sql | 50 +++++++++++++------------ nominatim/indexer/place_info.py | 30 +++++++++++++-- nominatim/indexer/runners.py | 2 +- nominatim/tokenizer/icu_tokenizer.py | 5 +-- nominatim/tokenizer/legacy_tokenizer.py | 5 +-- test/python/test_indexing.py | 26 +++++++++---- test/python/test_tokenizer_icu.py | 12 ++++-- 7 files changed, 85 insertions(+), 45 deletions(-) diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql index 9c2a67a1..8ae8cf39 100644 --- a/lib-sql/functions/placex_triggers.sql +++ b/lib-sql/functions/placex_triggers.sql @@ -1,30 +1,33 @@ -- Trigger functions for the placex table. +-- Information returned by update preparation. +DROP TYPE IF EXISTS prepare_update_info CASCADE; +CREATE TYPE prepare_update_info AS ( + name HSTORE, + address HSTORE, + rank_address SMALLINT, + country_code TEXT, + class TEXT, + type TEXT, + linked_place_id BIGINT +); + -- Retrieve the data needed by the indexer for updating the place. --- --- Return parameters: --- name list of names --- address list of address tags, either from the object or a surrounding --- building --- country_feature If the place is a country feature, this contains the --- country code, otherwise it is null. -CREATE OR REPLACE FUNCTION placex_prepare_update(p placex, - OUT name HSTORE, - OUT address HSTORE, - OUT country_feature VARCHAR, - OUT linked_place_id BIGINT) +CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex) + RETURNS prepare_update_info AS $$ DECLARE location RECORD; + result prepare_update_info; BEGIN -- For POI nodes, check if the address should be derived from a surrounding -- building. IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN - address := p.address; + result.address := p.address; ELSE -- The additional && condition works around the misguided query -- planner of postgis 3.0. - SELECT placex.address || hstore('_inherited', '') INTO address + SELECT placex.address || hstore('_inherited', '') INTO result.address FROM placex WHERE ST_Covers(geometry, p.centroid) and geometry && p.centroid @@ -34,27 +37,26 @@ BEGIN LIMIT 1; END IF; - address := address - '_unlisted_place'::TEXT; - name := p.name; + result.address := result.address - '_unlisted_place'::TEXT; + result.name := p.name; + result.class := p.class; + result.type := p.type; + result.country_code := p.country_code; + result.rank_address := p.rank_address; -- Names of linked places need to be merged in, so search for a linkable -- place already here. SELECT * INTO location FROM find_linked_place(p); IF location.place_id is not NULL THEN - linked_place_id := location.place_id; + result.linked_place_id := location.place_id; IF NOT location.name IS NULL THEN - name := location.name || name; + result.name := location.name || result.name; END IF; END IF; - country_feature := CASE WHEN p.admin_level = 2 - and p.class = 'boundary' and p.type = 'administrative' - and p.osm_type = 'R' - THEN p.country_code - ELSE null - END; + RETURN result; END; $$ LANGUAGE plpgsql STABLE; diff --git a/nominatim/indexer/place_info.py b/nominatim/indexer/place_info.py index fd179fef..06d730e0 100644 --- a/nominatim/indexer/place_info.py +++ b/nominatim/indexer/place_info.py @@ -38,7 +38,31 @@ class PlaceInfo: @property - def country_feature(self): - """ Return the country code if the place is a valid country boundary. + def country_code(self): + """ The country code of the country the place is in. 
Guaranteed + to be a two-letter lower-case string or None, if no country + could be found. """ - return self._info.get('country_feature') + return self._info.get('country_code') + + + @property + def rank_address(self): + """ The computed rank address before rank correction. + """ + return self._info.get('rank_address') + + + def is_a(self, key, value): + """ Check if the place's primary tag corresponds to the given + key and value. + """ + return self._info.get('class') == key and self._info.get('type') == value + + + def is_country(self): + """ Check if the place is a valid country boundary. + """ + return self.rank_address == 4 \ + and self.is_a('boundary', 'administrative') \ + and self.country_code is not None diff --git a/nominatim/indexer/runners.py b/nominatim/indexer/runners.py index 43966419..70536a71 100644 --- a/nominatim/indexer/runners.py +++ b/nominatim/indexer/runners.py @@ -39,7 +39,7 @@ class AbstractPlacexRunner: @staticmethod def get_place_details(worker, ids): - worker.perform("""SELECT place_id, (placex_prepare_update(placex)).* + worker.perform("""SELECT place_id, (placex_indexing_prepare(placex)).* FROM placex WHERE place_id IN %s""", (tuple((p[0] for p in ids)), )) diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 81b07568..fbaa2596 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -397,9 +397,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): token_info.add_names(fulls, partials) - country_feature = place.country_feature - if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): - self.add_country_names(country_feature.lower(), names) + if place.is_country(): + self.add_country_names(place.country_code, names) address = place.address if address: diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index 8bfb309d..dc6972dc 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -410,9 +410,8 @@ class LegacyNameAnalyzer(AbstractAnalyzer): if names: token_info.add_names(self.conn, names) - country_feature = place.country_feature - if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): - self.add_country_names(country_feature.lower(), names) + if place.is_country(): + self.add_country_names(place.country_code, names) address = place.address if address: diff --git a/test/python/test_indexing.py b/test/python/test_indexing.py index 60ad0bc4..4c9d940d 100644 --- a/test/python/test_indexing.py +++ b/test/python/test_indexing.py @@ -29,6 +29,7 @@ class IndexerTestDB: indexed_date TIMESTAMP, partition SMALLINT, admin_level SMALLINT, + country_code TEXT, address HSTORE, token_info JSONB, geometry_sector INTEGER)""") @@ -54,15 +55,26 @@ class IndexerTestDB: END IF; RETURN NEW; END; $$ LANGUAGE plpgsql;""") - cur.execute("""CREATE OR REPLACE FUNCTION placex_prepare_update(p placex, - OUT name HSTORE, - OUT address HSTORE, - OUT country_feature VARCHAR, - OUT linked_place_id BIGINT) + cur.execute("DROP TYPE IF EXISTS prepare_update_info CASCADE") + cur.execute("""CREATE TYPE prepare_update_info AS ( + name HSTORE, + address HSTORE, + rank_address SMALLINT, + country_code TEXT, + class TEXT, + type TEXT, + linked_place_id BIGINT + )""") + cur.execute("""CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex, + OUT result prepare_update_info) AS $$ BEGIN - address := p.address; - name := p.name; + result.address := p.address; + result.name := p.name; + 
result.class := p.class; + result.type := p.type; + result.country_code := p.country_code; + result.rank_address := p.rank_address; END; $$ LANGUAGE plpgsql STABLE; """) diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py index 28c6ef7a..bbfc0b12 100644 --- a/test/python/test_tokenizer_icu.py +++ b/test/python/test_tokenizer_icu.py @@ -323,10 +323,8 @@ class TestPlaceNames: assert eval(info['names']) == set((t[2] for t in tokens)) - def process_named_place(self, names, country_feature=None): + def process_named_place(self, names): place = {'name': names} - if country_feature: - place['country_feature'] = country_feature return self.analyzer.process_place(PlaceInfo(place)) @@ -353,7 +351,13 @@ class TestPlaceNames: def test_country_name(self, word_table): - info = self.process_named_place({'name': 'Norge'}, country_feature='no') + place = PlaceInfo({'name' : {'name': 'Norge'}, + 'country_code': 'no', + 'rank_address': 4, + 'class': 'boundary', + 'type': 'administrative'}) + + info = self.analyzer.process_place(place) self.expect_name_terms(info, '#norge', 'norge') assert word_table.get_country() == {('no', 'NORGE')} From 5e5addcdbf20022b3fd76dc8c2275a1eecf12a3c Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 29 Sep 2021 14:16:09 +0200 Subject: [PATCH 3/7] fix typo --- nominatim/tokenizer/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py index d827f813..e126507b 100644 --- a/nominatim/tokenizer/base.py +++ b/nominatim/tokenizer/base.py @@ -136,7 +136,7 @@ class AbstractTokenizer(ABC): the tokenizer remains stable over updates. Arguments: - config: Read-only object with configuration obtions. + config: Read-only object with configuration options. init_db: When set to False, then initialisation of database tables should be skipped. This option is only required for @@ -166,7 +166,7 @@ class AbstractTokenizer(ABC): during query time. Arguments: - config: Read-only object with configuration obtions. + config: Read-only object with configuration options. """ pass @@ -181,7 +181,7 @@ class AbstractTokenizer(ABC): data structures or data itself must not be changed by this function. Arguments: - config: Read-only object with configuration obtions. + config: Read-only object with configuration options. """ pass From 16daa57e4757e4daeffec1e61630f989727dc563 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 29 Sep 2021 17:37:04 +0200 Subject: [PATCH 4/7] unify ICUNameProcessorRules and ICURuleLoader There is no need for the additional layer of indirection that the ICUNameProcessorRules class adds. The ICURuleLoader can fill the database properties directly. 
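Sketch of the intended flow after this change (illustration only; `config` and
`conn` stand for a Configuration object and an open database connection):

    from nominatim.tokenizer.icu_rule_loader import ICURuleLoader

    def setup_rules(config, conn):
        # At import time: read the rules from icu_tokenizer.yaml and persist
        # the relevant parts directly in the database property table.
        loader = ICURuleLoader(config)
        loader.save_config_to_db(conn)
        return loader.make_token_analysis()

    def restore_rules(config, conn):
        # On later runs: restore the saved rules and rebuild the name
        # processor without going through ICUNameProcessorRules.
        loader = ICURuleLoader(config)
        loader.load_config_from_db(conn)
        return loader.make_token_analysis()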
--- nominatim/tokenizer/base.py | 10 +++- nominatim/tokenizer/factory.py | 2 +- nominatim/tokenizer/icu_name_processor.py | 54 +++---------------- nominatim/tokenizer/icu_rule_loader.py | 47 ++++++++++++++-- nominatim/tokenizer/icu_tokenizer.py | 29 +++++----- nominatim/tokenizer/icu_variants.py | 32 ----------- nominatim/tokenizer/legacy_tokenizer.py | 4 +- nominatim/tools/check_database.py | 2 +- test/python/dummy_tokenizer.py | 4 +- test/python/test_tokenizer_icu.py | 8 +-- .../test_tokenizer_icu_name_processor.py | 27 +++++----- test/python/test_tokenizer_icu_rule_loader.py | 35 +++++++----- test/python/test_tokenizer_legacy.py | 4 +- test/python/test_tools_check_database.py | 2 +- 14 files changed, 123 insertions(+), 137 deletions(-) diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py index e126507b..53289c78 100644 --- a/nominatim/tokenizer/base.py +++ b/nominatim/tokenizer/base.py @@ -149,11 +149,14 @@ class AbstractTokenizer(ABC): @abstractmethod - def init_from_project(self) -> None: + def init_from_project(self, config: Configuration) -> None: """ Initialise the tokenizer from an existing database setup. The function should load all previously saved configuration from the project directory and/or the property table. + + Arguments: + config: Read-only object with configuration options. """ pass @@ -187,7 +190,7 @@ class AbstractTokenizer(ABC): @abstractmethod - def check_database(self) -> str: + def check_database(self, config: Configuration) -> str: """ Check that the database is set up correctly and ready for being queried. @@ -196,6 +199,9 @@ class AbstractTokenizer(ABC): description of the issue as well as hints for the user on how to resolve the issue. + Arguments: + config: Read-only object with configuration options. + Return `None`, if no issue was found. """ pass diff --git a/nominatim/tokenizer/factory.py b/nominatim/tokenizer/factory.py index 069672d4..dc3e7411 100644 --- a/nominatim/tokenizer/factory.py +++ b/nominatim/tokenizer/factory.py @@ -85,6 +85,6 @@ def get_tokenizer_for_db(config): tokenizer_module = _import_tokenizer(name) tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir) - tokenizer.init_from_project() + tokenizer.init_from_project(config) return tokenizer diff --git a/nominatim/tokenizer/icu_name_processor.py b/nominatim/tokenizer/icu_name_processor.py index 93d2b0ff..544f5ebc 100644 --- a/nominatim/tokenizer/icu_name_processor.py +++ b/nominatim/tokenizer/icu_name_processor.py @@ -8,67 +8,25 @@ import itertools from icu import Transliterator import datrie -from nominatim.db.properties import set_property, get_property -from nominatim.tokenizer import icu_variants as variants - -DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation" -DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration" -DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements" -DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization" - - -class ICUNameProcessorRules: - """ Data object that saves the rules needed for the name processor. - - The rules can either be initialised through an ICURuleLoader or - be loaded from a database when a connection is given. 
- """ - def __init__(self, loader=None, conn=None): - if loader is not None: - self.norm_rules = loader.get_normalization_rules() - self.trans_rules = loader.get_transliteration_rules() - self.replacements = loader.get_replacement_pairs() - self.search_rules = loader.get_search_rules() - elif conn is not None: - self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES) - self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES) - self.replacements = \ - variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS)) - self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES) - else: - assert False, "Parameter loader or conn required." - - - def save_rules(self, conn): - """ Save the rules in the property table of the given database. - the rules can be loaded again by handing in a connection into - the constructor of the class. - """ - set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules) - set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules) - set_property(conn, DBCFG_IMPORT_REPLACEMENTS, - variants.pickle_variant_set(self.replacements)) - set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules) - class ICUNameProcessor: """ Collects the different transformation rules for normalisation of names - and provides the functions to aply the transformations. + and provides the functions to apply the transformations. """ - def __init__(self, rules): + def __init__(self, norm_rules, trans_rules, replacements): self.normalizer = Transliterator.createFromRules("icu_normalization", - rules.norm_rules) + norm_rules) self.to_ascii = Transliterator.createFromRules("icu_to_ascii", - rules.trans_rules + + trans_rules + ";[:Space:]+ > ' '") self.search = Transliterator.createFromRules("icu_search", - rules.search_rules) + norm_rules + trans_rules) # Intermediate reorder by source. Also compute required character set. immediate = defaultdict(list) chars = set() - for variant in rules.replacements: + for variant in replacements: if variant.source[-1] == ' ' and variant.replacement[-1] == ' ': replstr = variant.replacement[:-1] else: diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py index 0e6e40b4..bd0739f2 100644 --- a/nominatim/tokenizer/icu_rule_loader.py +++ b/nominatim/tokenizer/icu_rule_loader.py @@ -2,17 +2,25 @@ Helper class to create ICU rules from a configuration file. """ import io +import json import logging import itertools import re from icu import Transliterator +from nominatim.db.properties import set_property, get_property from nominatim.errors import UsageError +from nominatim.tokenizer.icu_name_processor import ICUNameProcessor import nominatim.tokenizer.icu_variants as variants LOG = logging.getLogger() +DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation" +DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration" +DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules" + + def _flatten_config_list(content): if not content: return [] @@ -46,12 +54,43 @@ class ICURuleLoader: """ Compiler for ICU rules from a tokenizer configuration file. 
""" - def __init__(self, rules): + def __init__(self, config): + rules = config.load_sub_configuration('icu_tokenizer.yaml', + config='TOKENIZER_CONFIG') + self.variants = set() self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization') self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration') - self._parse_variant_list(self._get_section(rules, 'variants')) + self.analysis_rules = self._get_section(rules, 'variants') + self._parse_variant_list() + + + def load_config_from_db(self, conn): + """ Get previously saved parts of the configuration from the + database. + """ + self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES) + self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES) + self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)) + self._parse_variant_list() + + + def save_config_to_db(self, conn): + """ Save the part of the configuration that cannot be changed into + the database. + """ + set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules) + set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules) + set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules)) + + + def make_token_analysis(self): + """ Create a token analyser from the reviouly loaded rules. + """ + return ICUNameProcessor(self.normalization_rules, + self.transliteration_rules, + self.variants) def get_search_rules(self): @@ -112,7 +151,9 @@ class ICURuleLoader: return ';'.join(_flatten_config_list(content)) + ';' - def _parse_variant_list(self, rules): + def _parse_variant_list(self): + rules = self.analysis_rules + self.variants.clear() if not rules: diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index fbaa2596..87906d71 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -14,7 +14,6 @@ from nominatim.db.properties import set_property, get_property from nominatim.db.utils import CopyBuffer from nominatim.db.sql_preprocessor import SQLPreprocessor from nominatim.tokenizer.icu_rule_loader import ICURuleLoader -from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization" @@ -36,7 +35,7 @@ class LegacyICUTokenizer(AbstractTokenizer): def __init__(self, dsn, data_dir): self.dsn = dsn self.data_dir = data_dir - self.naming_rules = None + self.loader = None self.term_normalization = None @@ -46,9 +45,8 @@ class LegacyICUTokenizer(AbstractTokenizer): This copies all necessary data in the project directory to make sure the tokenizer remains stable even over updates. """ - loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml', - config='TOKENIZER_CONFIG')) - self.naming_rules = ICUNameProcessorRules(loader=loader) + self.loader = ICURuleLoader(config) + self.term_normalization = config.TERM_NORMALIZATION self._install_php(config.lib_dir.php) @@ -59,11 +57,13 @@ class LegacyICUTokenizer(AbstractTokenizer): self._init_db_tables(config) - def init_from_project(self): + def init_from_project(self, config): """ Initialise the tokenizer from the project directory. 
""" + self.loader = ICURuleLoader(config) + with connect(self.dsn) as conn: - self.naming_rules = ICUNameProcessorRules(conn=conn) + self.loader.load_config_from_db(conn) self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION) @@ -81,12 +81,12 @@ class LegacyICUTokenizer(AbstractTokenizer): sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql') - def check_database(self): + def check_database(self, config): """ Check that the tokenizer is set up correctly. """ - self.init_from_project() + self.init_from_project(config) - if self.naming_rules is None: + if self.term_normalization is None: return "Configuration for tokenizer 'icu' are missing." return None @@ -107,7 +107,7 @@ class LegacyICUTokenizer(AbstractTokenizer): Analyzers are not thread-safe. You need to instantiate one per thread. """ - return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules)) + return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis()) def _install_php(self, phpdir): @@ -118,7 +118,7 @@ class LegacyICUTokenizer(AbstractTokenizer): 🜵', 'street -> st') + config = cfgfile('saint -> 🜵', 'street -> st') - rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath)) - proc = ICUNameProcessor(rules) + proc = ICURuleLoader(config).make_token_analysis() assert get_normalized_variants(proc, '🜵') == [] assert get_normalized_variants(proc, '🜳') == [] @@ -83,8 +86,8 @@ VARIANT_TESTS = [ @pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS) def test_variants(cfgfile, rules, name, variants): - fpath = cfgfile(*rules) - proc = ICUNameProcessor(ICUNameProcessorRules(loader=ICURuleLoader(fpath))) + config = cfgfile(*rules) + proc = ICURuleLoader(config).make_token_analysis() result = get_normalized_variants(proc, name) @@ -93,10 +96,8 @@ def test_variants(cfgfile, rules, name, variants): def test_search_normalized(cfgfile): - fpath = cfgfile('~street => s,st', 'master => mstr') - - rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath)) - proc = ICUNameProcessor(rules) + config = cfgfile('~street => s,st', 'master => mstr') + proc = ICURuleLoader(config).make_token_analysis() assert proc.get_search_normalized('Master Street') == 'master street' assert proc.get_search_normalized('Earnes St') == 'earnes st' diff --git a/test/python/test_tokenizer_icu_rule_loader.py b/test/python/test_tokenizer_icu_rule_loader.py index c3480de8..6ec53edc 100644 --- a/test/python/test_tokenizer_icu_rule_loader.py +++ b/test/python/test_tokenizer_icu_rule_loader.py @@ -12,7 +12,16 @@ from nominatim.errors import UsageError from icu import Transliterator @pytest.fixture -def cfgrules(): +def test_config(def_config, tmp_path): + project_dir = tmp_path / 'project_dir' + project_dir.mkdir() + def_config.project_dir = project_dir + + return def_config + + +@pytest.fixture +def cfgrules(test_config): def _create_config(*variants, **kwargs): content = dedent("""\ normalization: @@ -29,19 +38,21 @@ def cfgrules(): content += '\n'.join((" - " + s for s in variants)) + '\n' for k, v in kwargs: content += " {}: {}\n".format(k, v) - return yaml.safe_load(content) + (test_config.project_dir / 'icu_tokenizer.yaml').write_text(content) + + return test_config return _create_config -def test_empty_rule_set(): - rule_cfg = yaml.safe_load(dedent("""\ +def test_empty_rule_set(test_config): + (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\ normalization: transliteration: variants: """)) - rules = ICURuleLoader(rule_cfg) + rules = ICURuleLoader(test_config) assert rules.get_search_rules() == '' 
assert rules.get_normalization_rules() == '' assert rules.get_transliteration_rules() == '' @@ -50,11 +61,12 @@ def test_empty_rule_set(): CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants') @pytest.mark.parametrize("section", CONFIG_SECTIONS) -def test_missing_section(section): +def test_missing_section(section, test_config): rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section} + (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg)) with pytest.raises(UsageError): - ICURuleLoader(rule_cfg) + ICURuleLoader(test_config) def test_get_search_rules(cfgrules): @@ -88,9 +100,8 @@ def test_get_transliteration_rules(cfgrules): assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt " -def test_transliteration_rules_from_file(def_config, tmp_path): - def_config.project_dir = tmp_path - cfgpath = tmp_path / ('test_config.yaml') +def test_transliteration_rules_from_file(test_config): + cfgpath = test_config.project_dir / ('icu_tokenizer.yaml') cfgpath.write_text(dedent("""\ normalization: transliteration: @@ -98,10 +109,10 @@ def test_transliteration_rules_from_file(def_config, tmp_path): - !include transliteration.yaml variants: """)) - transpath = tmp_path / ('transliteration.yaml') + transpath = test_config.project_dir / ('transliteration.yaml') transpath.write_text('- "x > y"') - loader = ICURuleLoader(def_config.load_sub_configuration('test_config.yaml')) + loader = ICURuleLoader(test_config) rules = loader.get_transliteration_rules() trans = Transliterator.createFromRules("test", rules) diff --git a/test/python/test_tokenizer_legacy.py b/test/python/test_tokenizer_legacy.py index 2545c2db..53d45c1c 100644 --- a/test/python/test_tokenizer_legacy.py +++ b/test/python/test_tokenizer_legacy.py @@ -132,10 +132,10 @@ def test_init_module_custom(tokenizer_factory, test_config, assert not (test_config.project_dir / 'module').exists() -def test_init_from_project(tokenizer_setup, tokenizer_factory): +def test_init_from_project(tokenizer_setup, tokenizer_factory, test_config): tok = tokenizer_factory() - tok.init_from_project() + tok.init_from_project(test_config) assert tok.normalization is not None diff --git a/test/python/test_tools_check_database.py b/test/python/test_tools_check_database.py index aed5cb7e..edba3236 100644 --- a/test/python/test_tools_check_database.py +++ b/test/python/test_tools_check_database.py @@ -53,7 +53,7 @@ def test_check_tokenizer(temp_db_conn, def_config, monkeypatch, check_result, state): class _TestTokenizer: @staticmethod - def check_database(): + def check_database(_): return check_result monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db', From 8171fe4571a57bf8e5b2a8f676989e973897e2e7 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 30 Sep 2021 21:30:13 +0200 Subject: [PATCH 5/7] introduce sanitizer step before token analysis Sanatizer functions allow to transform name and address tags before they are handed to the tokenizer. Theses transformations are visible only for the tokenizer and thus only have an influence on the search terms and address match terms for a place. Currently two sanitizers are implemented which are responsible for splitting names with multiple values and removing bracket additions. Both was previously hard-coded in the tokenizer. 
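A custom sanitizer follows the same minimal convention as the two built-in
ones (hypothetical example, not included in this patch): a module under
nominatim/tokenizer/sanitizers/ exposes a create() function that returns a
callable which may modify the name and address lists of a place.

    # nominatim/tokenizer/sanitizers/uppercase_names.py (hypothetical)
    def create(_):
        """ Return a handler that upper-cases all name values
            (for illustration only).
        """
        def _process(obj):
            # obj.names and obj.address are lists of PlaceName objects;
            # handlers may modify the lists or replace them entirely.
            obj.names = [name.clone(name=name.name.upper()) for name in obj.names]

        return _process

Such a module would be enabled with 'step: uppercase-names' in the sanitizers
section of icu_tokenizer.yaml, analogous to the two steps added below.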
--- nominatim/tokenizer/icu_rule_loader.py | 10 ++ nominatim/tokenizer/icu_tokenizer.py | 110 ++++++++------- nominatim/tokenizer/place_sanitizer.py | 127 ++++++++++++++++++ nominatim/tokenizer/sanitizers/__init__.py | 0 .../tokenizer/sanitizers/split_name_list.py | 28 ++++ .../tokenizer/sanitizers/strip_brace_terms.py | 22 +++ settings/icu_tokenizer.yaml | 3 + test/python/test_tokenizer_icu.py | 17 +-- 8 files changed, 259 insertions(+), 58 deletions(-) create mode 100644 nominatim/tokenizer/place_sanitizer.py create mode 100644 nominatim/tokenizer/sanitizers/__init__.py create mode 100644 nominatim/tokenizer/sanitizers/split_name_list.py create mode 100644 nominatim/tokenizer/sanitizers/strip_brace_terms.py diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py index bd0739f2..330179bb 100644 --- a/nominatim/tokenizer/icu_rule_loader.py +++ b/nominatim/tokenizer/icu_rule_loader.py @@ -12,6 +12,7 @@ from icu import Transliterator from nominatim.db.properties import set_property, get_property from nominatim.errors import UsageError from nominatim.tokenizer.icu_name_processor import ICUNameProcessor +from nominatim.tokenizer.place_sanitizer import PlaceSanitizer import nominatim.tokenizer.icu_variants as variants LOG = logging.getLogger() @@ -65,6 +66,9 @@ class ICURuleLoader: self.analysis_rules = self._get_section(rules, 'variants') self._parse_variant_list() + # Load optional sanitizer rule set. + self.sanitizer_rules = rules.get('sanitizers', []) + def load_config_from_db(self, conn): """ Get previously saved parts of the configuration from the @@ -85,6 +89,12 @@ class ICURuleLoader: set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules)) + def make_sanitizer(self): + """ Create a place sanitizer from the configured rules. + """ + return PlaceSanitizer(self.sanitizer_rules) + + def make_token_analysis(self): """ Create a token analyser from the reviouly loaded rules. """ diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 87906d71..2ece10f2 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -13,6 +13,7 @@ from nominatim.db.connection import connect from nominatim.db.properties import set_property, get_property from nominatim.db.utils import CopyBuffer from nominatim.db.sql_preprocessor import SQLPreprocessor +from nominatim.indexer.place_info import PlaceInfo from nominatim.tokenizer.icu_rule_loader import ICURuleLoader from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer @@ -107,7 +108,8 @@ class LegacyICUTokenizer(AbstractTokenizer): Analyzers are not thread-safe. You need to instantiate one per thread. """ - return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis()) + return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(), + self.loader.make_token_analysis()) def _install_php(self, phpdir): @@ -187,10 +189,11 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): normalization. """ - def __init__(self, dsn, name_proc): + def __init__(self, dsn, sanitizer, token_analysis): self.conn = connect(dsn).connection self.conn.autocommit = True - self.name_processor = name_proc + self.sanitizer = sanitizer + self.token_analysis = token_analysis self._cache = _TokenCache() @@ -203,6 +206,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): self.conn = None + def _search_normalized(self, name): + """ Return the search token transliteration of the given name. 
+ """ + return self.token_analysis.get_search_normalized(name) + + + def _normalized(self, name): + """ Return the normalized version of the given name with all + non-relevant information removed. + """ + return self.token_analysis.get_normalized(name) + + def get_word_token_info(self, words): """ Return token information for the given list of words. If a word starts with # it is assumed to be a full name @@ -218,9 +234,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): partial_tokens = {} for word in words: if word.startswith('#'): - full_tokens[word] = self.name_processor.get_search_normalized(word[1:]) + full_tokens[word] = self._search_normalized(word[1:]) else: - partial_tokens[word] = self.name_processor.get_search_normalized(word) + partial_tokens[word] = self._search_normalized(word) with self.conn.cursor() as cur: cur.execute("""SELECT word_token, word_id @@ -251,7 +267,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): This function takes minor shortcuts on transliteration. """ - return self.name_processor.get_search_normalized(hnr) + return self._search_normalized(hnr) def update_postcodes_from_db(self): """ Update postcode tokens in the word table from the location_postcode @@ -274,7 +290,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): if postcode is None: to_delete.append(word) else: - copystr.add(self.name_processor.get_search_normalized(postcode), + copystr.add(self._search_normalized(postcode), 'P', postcode) if to_delete: @@ -292,7 +308,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): completely replaced. Otherwise the phrases are added to the already existing ones. """ - norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3]) + norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3]) for p in phrases)) with self.conn.cursor() as cur: @@ -322,7 +338,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): added = 0 with CopyBuffer() as copystr: for word, cls, typ, oper in to_add: - term = self.name_processor.get_search_normalized(word) + term = self._search_normalized(word) if term: copystr.add(term, 'S', word, json.dumps({'class': cls, 'type': typ, @@ -356,9 +372,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): def add_country_names(self, country_code, names): """ Add names for the given country to the search index. """ + # Make sure any name preprocessing for country names applies. + info = PlaceInfo({'name': names, 'country_code': country_code, + 'rank_address': 4, 'class': 'boundary', + 'type': 'administrative'}) + self._add_country_full_names(country_code, + self.sanitizer.process_names(info)[0]) + + + def _add_country_full_names(self, country_code, names): + """ Add names for the given country from an already sanitized + name list. + """ word_tokens = set() - for name in self._compute_full_names(names): - norm_name = self.name_processor.get_search_normalized(name) + for name in names: + norm_name = self._search_normalized(name.name) if norm_name: word_tokens.add(norm_name) @@ -384,12 +412,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): def process_place(self, place): """ Determine tokenizer information about the given place. - Returns a JSON-serialisable structure that will be handed into + Returns a JSON-serializable structure that will be handed into the database via the token_info field. 
""" token_info = _TokenInfo(self._cache) - names = place.name + names, address = self.sanitizer.process_names(place) if names: fulls, partials = self._compute_name_tokens(names) @@ -397,9 +425,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): token_info.add_names(fulls, partials) if place.is_country(): - self.add_country_names(place.country_code, names) + self._add_country_full_names(place.country_code, names) - address = place.address if address: self._process_place_address(token_info, address) @@ -409,18 +436,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): def _process_place_address(self, token_info, address): hnrs = [] addr_terms = [] - for key, value in address.items(): - if key == 'postcode': - self._add_postcode(value) - elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'): - hnrs.append(value) - elif key == 'street': - token_info.add_street(self._compute_partial_tokens(value)) - elif key == 'place': - token_info.add_place(self._compute_partial_tokens(value)) - elif not key.startswith('_') and \ - key not in ('country', 'full'): - addr_terms.append((key, self._compute_partial_tokens(value))) + for item in address: + if item.kind == 'postcode': + self._add_postcode(item.name) + elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'): + hnrs.append(item.name) + elif item.kind == 'street': + token_info.add_street(self._compute_partial_tokens(item.name)) + elif item.kind == 'place': + token_info.add_place(self._compute_partial_tokens(item.name)) + elif not item.kind.startswith('_') and \ + item.kind not in ('country', 'full'): + addr_terms.append((item.kind, self._compute_partial_tokens(item.name))) if hnrs: hnrs = self._split_housenumbers(hnrs) @@ -433,7 +460,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): """ Normalize the given term, split it into partial words and return then token list for them. """ - norm_name = self.name_processor.get_search_normalized(name) + norm_name = self._search_normalized(name) tokens = [] need_lookup = [] @@ -456,19 +483,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): return tokens + def _compute_name_tokens(self, names): """ Computes the full name and partial name tokens for the given dictionary of names. """ - full_names = self._compute_full_names(names) full_tokens = set() partial_tokens = set() - for name in full_names: - norm_name = self.name_processor.get_normalized(name) + for name in names: + norm_name = self._normalized(name.name) full, part = self._cache.names.get(norm_name, (None, None)) if full is None: - variants = self.name_processor.get_variants_ascii(norm_name) + variants = self.token_analysis.get_variants_ascii(norm_name) if not variants: continue @@ -485,23 +512,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): return full_tokens, partial_tokens - @staticmethod - def _compute_full_names(names): - """ Return the set of all full name word ids to be used with the - given dictionary of names. - """ - full_names = set() - for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)): - if name: - full_names.add(name) - - brace_idx = name.find('(') - if brace_idx >= 0: - full_names.add(name[:brace_idx].strip()) - - return full_names - - def _add_postcode(self, postcode): """ Make sure the normalized postcode is present in the word table. 
""" @@ -509,7 +519,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): postcode = self.normalize_postcode(postcode) if postcode not in self._cache.postcodes: - term = self.name_processor.get_search_normalized(postcode) + term = self._search_normalized(postcode) if not term: return diff --git a/nominatim/tokenizer/place_sanitizer.py b/nominatim/tokenizer/place_sanitizer.py new file mode 100644 index 00000000..5961dcf0 --- /dev/null +++ b/nominatim/tokenizer/place_sanitizer.py @@ -0,0 +1,127 @@ +""" +Handler for cleaning name and address tags in place information before it +is handed to the token analysis. +""" +import importlib + +from nominatim.errors import UsageError + +class PlaceName: + """ A searchable name for a place together with properties. + Every name object saves the name proper and two basic properties: + * 'kind' describes the name of the OSM key used without any suffixes + (i.e. the part after the colon removed) + * 'suffix' contains the suffix of the OSM tag, if any. The suffix + is the part of the key after the first colon. + In addition to that, the name may have arbitrary additional attributes. + Which attributes are used, depends on the token analyser. + """ + + def __init__(self, name, kind, suffix): + self.name = name + self.kind = kind + self.suffix = suffix + self.attr = {} + + + def __repr__(self): + return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')" + + + def clone(self, name=None, kind=None, suffix=None, attr=None): + """ Create a deep copy of the place name, optionally with the + given parameters replaced. In the attribute list only the given + keys are updated. The list is not replaced completely. + In particular, the function cannot to be used to remove an + attribute from a place name. + """ + newobj = PlaceName(name or self.name, + kind or self.kind, + suffix or self.suffix) + + newobj.attr.update(self.attr) + if attr: + newobj.attr.update(attr) + + return newobj + + + def set_attr(self, key, value): + """ Add the given property to the name. If the property was already + set, then the value is overwritten. + """ + self.attr[key] = value + + + def get_attr(self, key, default=None): + """ Return the given property or the value of 'default' if it + is not set. + """ + return self.attr.get(key, default) + + + def has_attr(self, key): + """ Check if the given attribute is set. + """ + return key in self.attr + + +class _ProcessInfo: + """ Container class for information handed into to handler functions. + The 'names' and 'address' members are mutable. A handler must change + them by either modifying the lists place or replacing the old content + with a new list. + """ + + def __init__(self, place): + self.place = place + self.names = self._convert_name_dict(place.name) + self.address = self._convert_name_dict(place.address) + + + @staticmethod + def _convert_name_dict(names): + """ Convert a dictionary of names into a list of PlaceNames. + The dictionary key is split into the primary part of the key + and the suffix (the part after an optional colon). + """ + out = [] + + if names: + for key, value in names.items(): + parts = key.split(':', 1) + out.append(PlaceName(value.strip(), + parts[0].strip(), + parts[1].strip() if len(parts) > 1 else None)) + + return out + + +class PlaceSanitizer: + """ Controller class which applies sanitizer functions on the place + names and address before they are used by the token analysers. 
+ """ + + def __init__(self, rules): + self.handlers = [] + + if rules: + for func in rules: + if 'step' not in func: + raise UsageError("Sanitizer rule is missing the 'step' attribute.") + module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_') + handler_module = importlib.import_module(module_name) + self.handlers.append(handler_module.create(func)) + + + def process_names(self, place): + """ Extract a sanitized list of names and address parts from the + given place. The function returns a tuple + (list of names, list of address names) + """ + obj = _ProcessInfo(place) + + for func in self.handlers: + func(obj) + + return obj.names, obj.address diff --git a/nominatim/tokenizer/sanitizers/__init__.py b/nominatim/tokenizer/sanitizers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nominatim/tokenizer/sanitizers/split_name_list.py b/nominatim/tokenizer/sanitizers/split_name_list.py new file mode 100644 index 00000000..93651f3e --- /dev/null +++ b/nominatim/tokenizer/sanitizers/split_name_list.py @@ -0,0 +1,28 @@ +""" +Name processor that splits name values with multiple values into their components. +""" +import re + +def create(func): + """ Create a name processing function that splits name values with + multiple values into their components. The optional parameter + 'delimiters' can be used to define the characters that should be used + for splitting. The default is ',;'. + """ + regexp = re.compile('[{}]'.format(func.get('delimiters', ',;'))) + + def _process(obj): + if not obj.names: + return + + new_names = [] + for name in obj.names: + split_names = regexp.split(name.name) + if len(split_names) == 1: + new_names.append(name) + else: + new_names.extend(name.clone(name=n) for n in split_names) + + obj.names = new_names + + return _process diff --git a/nominatim/tokenizer/sanitizers/strip_brace_terms.py b/nominatim/tokenizer/sanitizers/strip_brace_terms.py new file mode 100644 index 00000000..4423d305 --- /dev/null +++ b/nominatim/tokenizer/sanitizers/strip_brace_terms.py @@ -0,0 +1,22 @@ +""" +Sanitizer handling names with addendums in braces. +""" + +def create(_): + """ Create a name processing function that creates additional name variants + when a name has an addendum in brackets (e.g. "Halle (Saale)"). The + additional variant only contains the main name without the bracket part. + """ + def _process(obj): + """ Add variants for names that have a bracket extension. 
+        """
+        new_names = []
+        if obj.names:
+            for name in (n for n in obj.names if '(' in n.name):
+                new_name = name.name.split('(')[0].strip()
+                if new_name:
+                    new_names.append(name.clone(name=new_name))
+
+        obj.names.extend(new_names)
+
+    return _process
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index c0c8c043..08b7a7ff 100644
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -24,6 +24,9 @@ transliteration:
     - "[^[:Ascii:]] >"
     - ":: lower ()"
     - ":: NFC ()"
+sanitizers:
+    - step: split-name-list
+    - step: strip-brace-terms
 variants:
     - !include icu-rules/variants-bg.yaml
     - !include icu-rules/variants-ca.yaml
diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py
index 4b7c56d5..9a6f5a94 100644
--- a/test/python/test_tokenizer_icu.py
+++ b/test/python/test_tokenizer_icu.py
@@ -67,10 +67,12 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
     monkeypatch.undo()
 
     def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
-                     variants=('~gasse -> gasse', 'street => st', )):
+                     variants=('~gasse -> gasse', 'street => st', ),
+                     sanitizers=[]):
         cfgstr = {'normalization' : list(norm),
-                   'transliteration' : list(trans),
-                   'variants' : [ {'words': list(variants)}]}
+                  'sanitizers' : sanitizers,
+                  'transliteration' : list(trans),
+                  'variants' : [ {'words': list(variants)}]}
         (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
         tok.loader = ICURuleLoader(test_config)
 
@@ -309,14 +311,15 @@ class TestPlaceNames:
 
     @pytest.fixture(autouse=True)
     def setup(self, analyzer, sql_functions):
-        with analyzer() as anl:
+        sanitizers = [{'step': 'split-name-list'},
+                      {'step': 'strip-brace-terms'}]
+        with analyzer(sanitizers=sanitizers) as anl:
             self.analyzer = anl
             yield anl
 
 
     def expect_name_terms(self, info, *expected_terms):
         tokens = self.analyzer.get_word_token_info(expected_terms)
-        print (tokens)
         for token in tokens:
             assert token[2] is not None, "No token for {0}".format(token)
 
@@ -324,9 +327,7 @@ class TestPlaceNames:
 
 
     def process_named_place(self, names):
-        place = {'name': names}
-
-        return self.analyzer.process_place(PlaceInfo(place))
+        return self.analyzer.process_place(PlaceInfo({'name': names}))
 
 
     def test_simple_names(self):

From 732cd27d2e3033c11eca5b51cc2e6204c5737def Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Fri, 1 Oct 2021 09:50:17 +0200
Subject: [PATCH 6/7] add unit tests for new sanitizer functions

---
 .../tokenizer/sanitizers/split_name_list.py   | 11 ++-
 .../tokenizer/sanitizers/strip_brace_terms.py |  4 +-
 .../sanitizers/test_split_name_list.py        | 65 +++++++++++++++++
 .../sanitizers/test_strip_brace_terms.py      | 44 ++++++++++++
 test/python/tokenizer/test_place_sanitizer.py | 71 +++++++++++++++++++
 5 files changed, 191 insertions(+), 4 deletions(-)
 create mode 100644 test/python/tokenizer/sanitizers/test_split_name_list.py
 create mode 100644 test/python/tokenizer/sanitizers/test_strip_brace_terms.py
 create mode 100644 test/python/tokenizer/test_place_sanitizer.py

diff --git a/nominatim/tokenizer/sanitizers/split_name_list.py b/nominatim/tokenizer/sanitizers/split_name_list.py
index 93651f3e..f1514203 100644
--- a/nominatim/tokenizer/sanitizers/split_name_list.py
+++ b/nominatim/tokenizer/sanitizers/split_name_list.py
@@ -3,13 +3,19 @@ Name processor that splits name values with multiple values into their componen
 """
 import re
 
+from nominatim.errors import UsageError
+
 def create(func):
     """ Create a name processing function that splits name values with
         multiple values into their components. The optional parameter
         'delimiters' can be used to define the characters that should be used
         for splitting. The default is ',;'.
     """
-    regexp = re.compile('[{}]'.format(func.get('delimiters', ',;')))
+    delimiter_set = set(func.get('delimiters', ',;'))
+    if not delimiter_set:
+        raise UsageError("Set of delimiters in split-name-list sanitizer is empty.")
+
+    regexp = re.compile('\\s*[{}]\\s*'.format(''.join('\\' + d for d in delimiter_set)))
 
     def _process(obj):
         if not obj.names:
@@ -18,10 +24,10 @@ def create(func):
         new_names = []
         for name in obj.names:
             split_names = regexp.split(name.name)
             if len(split_names) == 1:
                 new_names.append(name)
             else:
-                new_names.extend(name.clone(name=n) for n in split_names)
+                new_names.extend(name.clone(name=n) for n in split_names if n)
 
         obj.names = new_names
 
diff --git a/nominatim/tokenizer/sanitizers/strip_brace_terms.py b/nominatim/tokenizer/sanitizers/strip_brace_terms.py
index 4423d305..ec91bac9 100644
--- a/nominatim/tokenizer/sanitizers/strip_brace_terms.py
+++ b/nominatim/tokenizer/sanitizers/strip_brace_terms.py
@@ -10,13 +10,13 @@ def create(_):
     def _process(obj):
         """ Add variants for names that have a bracket extension.
         """
-        new_names = []
         if obj.names:
+            new_names = []
             for name in (n for n in obj.names if '(' in n.name):
                 new_name = name.name.split('(')[0].strip()
                 if new_name:
                     new_names.append(name.clone(name=new_name))
 
-        obj.names.extend(new_names)
+            obj.names.extend(new_names)
 
     return _process
diff --git a/test/python/tokenizer/sanitizers/test_split_name_list.py b/test/python/tokenizer/sanitizers/test_split_name_list.py
new file mode 100644
index 00000000..ee745469
--- /dev/null
+++ b/test/python/tokenizer/sanitizers/test_split_name_list.py
@@ -0,0 +1,65 @@
+"""
+Tests for the sanitizer that splits multivalue lists.
+""" +import pytest + +from nominatim.tokenizer.place_sanitizer import PlaceSanitizer +from nominatim.indexer.place_info import PlaceInfo + +from nominatim.errors import UsageError + +def run_sanitizer_on(**kwargs): + place = PlaceInfo({'name': kwargs}) + name, _ = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place) + + return sorted([(p.name, p.kind, p.suffix) for p in name]) + + +def sanitize_with_delimiter(delimiter, name): + place = PlaceInfo({'name': {'name': name}}) + san = PlaceSanitizer([{'step': 'split-name-list', 'delimiters': delimiter}]) + name, _ = san.process_names(place) + + return sorted([p.name for p in name]) + + +def test_simple(): + assert run_sanitizer_on(name='ABC') == [('ABC', 'name', None)] + assert run_sanitizer_on(name='') == [('', 'name', None)] + + +def test_splits(): + assert run_sanitizer_on(name='A;B;C') == [('A', 'name', None), + ('B', 'name', None), + ('C', 'name', None)] + assert run_sanitizer_on(short_name=' House, boat ') == [('House', 'short_name', None), + ('boat', 'short_name', None)] + + +def test_empty_fields(): + assert run_sanitizer_on(name='A;;B') == [('A', 'name', None), + ('B', 'name', None)] + assert run_sanitizer_on(name='A; ,B') == [('A', 'name', None), + ('B', 'name', None)] + assert run_sanitizer_on(name=' ;B') == [('B', 'name', None)] + assert run_sanitizer_on(name='B,') == [('B', 'name', None)] + + +def test_custom_delimiters(): + assert sanitize_with_delimiter(':', '12:45,3') == ['12', '45,3'] + assert sanitize_with_delimiter('\\', 'a;\\b!#@ \\') == ['a;', 'b!#@'] + assert sanitize_with_delimiter('[]', 'foo[to]be') == ['be', 'foo', 'to'] + assert sanitize_with_delimiter(' ', 'morning sun') == ['morning', 'sun'] + + +def test_empty_delimiter_set(): + with pytest.raises(UsageError): + sanitize_with_delimiter('', 'abc') + + +def test_no_name_list(): + place = PlaceInfo({'address': {'housenumber': '3'}}) + name, address = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place) + + assert not name + assert len(address) == 1 diff --git a/test/python/tokenizer/sanitizers/test_strip_brace_terms.py b/test/python/tokenizer/sanitizers/test_strip_brace_terms.py new file mode 100644 index 00000000..50af2449 --- /dev/null +++ b/test/python/tokenizer/sanitizers/test_strip_brace_terms.py @@ -0,0 +1,44 @@ +""" +Tests for the sanitizer that handles braced suffixes. 
+""" +import pytest + +from nominatim.tokenizer.place_sanitizer import PlaceSanitizer +from nominatim.indexer.place_info import PlaceInfo + +def run_sanitizer_on(**kwargs): + place = PlaceInfo({'name': kwargs}) + name, _ = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place) + + return sorted([(p.name, p.kind, p.suffix) for p in name]) + + +def test_no_braces(): + assert run_sanitizer_on(name='foo', ref='23') == [('23', 'ref', None), + ('foo', 'name', None)] + + +def test_simple_braces(): + assert run_sanitizer_on(name='Halle (Saale)', ref='3')\ + == [('3', 'ref', None), ('Halle', 'name', None), ('Halle (Saale)', 'name', None)] + assert run_sanitizer_on(name='ack ( bar')\ + == [('ack', 'name', None), ('ack ( bar', 'name', None)] + + +def test_only_braces(): + assert run_sanitizer_on(name='(maybe)') == [('(maybe)', 'name', None)] + + +def test_double_braces(): + assert run_sanitizer_on(name='a((b))') == [('a', 'name', None), + ('a((b))', 'name', None)] + assert run_sanitizer_on(name='a (b) (c)') == [('a', 'name', None), + ('a (b) (c)', 'name', None)] + + +def test_no_names(): + place = PlaceInfo({'address': {'housenumber': '3'}}) + name, address = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place) + + assert not name + assert len(address) == 1 diff --git a/test/python/tokenizer/test_place_sanitizer.py b/test/python/tokenizer/test_place_sanitizer.py new file mode 100644 index 00000000..389b068c --- /dev/null +++ b/test/python/tokenizer/test_place_sanitizer.py @@ -0,0 +1,71 @@ +""" +Tests for execution of the sanitztion step. +""" +import pytest + +from nominatim.errors import UsageError +import nominatim.tokenizer.place_sanitizer as sanitizer +from nominatim.indexer.place_info import PlaceInfo + + +def test_placeinfo_clone_new_name(): + place = sanitizer.PlaceName('foo', 'ki', 'su') + + newplace = place.clone(name='bar') + + assert place.name == 'foo' + assert newplace.name == 'bar' + assert newplace.kind == 'ki' + assert newplace.suffix == 'su' + + +def test_placeinfo_clone_merge_attr(): + place = sanitizer.PlaceName('foo', 'ki', 'su') + place.set_attr('a1', 'v1') + place.set_attr('a2', 'v2') + + newplace = place.clone(attr={'a2': 'new', 'b2': 'foo'}) + + assert place.get_attr('a2') == 'v2' + assert place.get_attr('b2') is None + assert newplace.get_attr('a1') == 'v1' + assert newplace.get_attr('a2') == 'new' + assert newplace.get_attr('b2') == 'foo' + + +def test_placeinfo_has_attr(): + place = sanitizer.PlaceName('foo', 'ki', 'su') + place.set_attr('a1', 'v1') + + assert place.has_attr('a1') + assert not place.has_attr('whatever') + + +def test_sanitizer_default(): + san = sanitizer.PlaceSanitizer([{'step': 'split-name-list'}]) + + name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}, + 'address': {'street': 'Bald'}})) + + assert len(name) == 3 + assert all(isinstance(n, sanitizer.PlaceName) for n in name) + assert all(n.kind == 'name' for n in name) + assert all(n.suffix == 'de:de' for n in name) + + assert len(address) == 1 + assert all(isinstance(n, sanitizer.PlaceName) for n in address) + + +@pytest.mark.parametrize('rules', [None, []]) +def test_sanitizer_empty_list(rules): + san = sanitizer.PlaceSanitizer(rules) + + name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}})) + + assert len(name) == 1 + assert all(isinstance(n, sanitizer.PlaceName) for n in name) + + +def test_sanitizer_missing_step_definition(): + with pytest.raises(UsageError): + san = sanitizer.PlaceSanitizer([{'id': 'split-name-list'}]) 
From 6b348d43c61bc825c8d9cd208b1ef34c8d7951fa Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 1 Oct 2021 10:51:41 +0200 Subject: [PATCH 7/7] replace test variable for PG env tests 'tty' was removed in PG14 and causes an error. --- test/python/test_db_connection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/python/test_db_connection.py b/test/python/test_db_connection.py index 41978e59..00c29a43 100644 --- a/test/python/test_db_connection.py +++ b/test/python/test_db_connection.py @@ -100,6 +100,6 @@ def test_get_pg_env_overwrite_variable(monkeypatch): def test_get_pg_env_ignore_unknown(): - env = get_pg_env('tty=stuff', base_env={}) + env = get_pg_env('client_encoding=stuff', base_env={}) assert env == {}