From 231250f2eb272b77d54e4b4b18bd85a80413ac34 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 29 Sep 2021 10:37:54 +0200 Subject: [PATCH 1/7] add wrapper class for place data passed to tokenizer This is mostly for convenience and documentation purposes. --- nominatim/indexer/place_info.py | 44 +++++++++++++++++++++++++ nominatim/indexer/runners.py | 8 +++-- nominatim/tokenizer/base.py | 12 ++----- nominatim/tokenizer/icu_tokenizer.py | 6 ++-- nominatim/tokenizer/legacy_tokenizer.py | 6 ++-- nominatim/tools/tiger_data.py | 5 ++- test/python/dummy_tokenizer.py | 2 ++ test/python/test_tokenizer_icu.py | 20 +++++++---- test/python/test_tokenizer_legacy.py | 15 +++++---- 9 files changed, 84 insertions(+), 34 deletions(-) create mode 100644 nominatim/indexer/place_info.py diff --git a/nominatim/indexer/place_info.py b/nominatim/indexer/place_info.py new file mode 100644 index 00000000..fd179fef --- /dev/null +++ b/nominatim/indexer/place_info.py @@ -0,0 +1,44 @@ +""" +Wrapper around place information the indexer gets from the database and hands to +the tokenizer. +""" + +import psycopg2.extras + +class PlaceInfo: + """ Data class containing all information the tokenizer gets about a + place it should process the names for. + """ + + def __init__(self, info): + self._info = info + + + def analyze(self, analyzer): + """ Process this place with the given tokenizer and return the + result in psycopg2-compatible Json. + """ + return psycopg2.extras.Json(analyzer.process_place(self)) + + + @property + def name(self): + """ A dictionary with the names of the place or None if the place + has no names. + """ + return self._info.get('name') + + + @property + def address(self): + """ A dictionary with the address elements of the place + or None if no address information is available. + """ + return self._info.get('address') + + + @property + def country_feature(self): + """ Return the country code if the place is a valid country boundary. + """ + return self._info.get('country_feature') diff --git a/nominatim/indexer/runners.py b/nominatim/indexer/runners.py index 29261ee5..43966419 100644 --- a/nominatim/indexer/runners.py +++ b/nominatim/indexer/runners.py @@ -4,14 +4,16 @@ tasks. """ import functools -import psycopg2.extras from psycopg2 import sql as pysql +from nominatim.indexer.place_info import PlaceInfo + # pylint: disable=C0111 def _mk_valuelist(template, num): return pysql.SQL(',').join([pysql.SQL(template)] * num) + class AbstractPlacexRunner: """ Returns SQL commands for indexing of the placex table. 
""" @@ -47,7 +49,7 @@ class AbstractPlacexRunner: for place in places: for field in ('place_id', 'name', 'address', 'linked_place_id'): values.append(place[field]) - values.append(psycopg2.extras.Json(self.analyzer.process_place(place))) + values.append(PlaceInfo(place).analyze(self.analyzer)) worker.perform(self._index_sql(len(places)), values) @@ -141,7 +143,7 @@ class InterpolationRunner: values = [] for place in places: values.extend((place[x] for x in ('place_id', 'address'))) - values.append(psycopg2.extras.Json(self.analyzer.process_place(place))) + values.append(PlaceInfo(place).analyze(self.analyzer)) worker.perform(self._index_sql(len(places)), values) diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py index 00ecae44..d827f813 100644 --- a/nominatim/tokenizer/base.py +++ b/nominatim/tokenizer/base.py @@ -6,6 +6,7 @@ from abc import ABC, abstractmethod from typing import List, Tuple, Dict, Any from nominatim.config import Configuration +from nominatim.indexer.place_info import PlaceInfo # pylint: disable=unnecessary-pass @@ -105,20 +106,13 @@ class AbstractAnalyzer(ABC): @abstractmethod - def process_place(self, place: Dict) -> Any: + def process_place(self, place: PlaceInfo) -> Any: """ Extract tokens for the given place and compute the information to be handed to the PL/pgSQL processor for building the search index. Arguments: - place: Dictionary with the information about the place. Currently - the following fields may be present: - - - *name* is a dictionary of names for the place together - with the designation of the name. - - *address* is a dictionary of address terms. - - *country_feature* is set to a country code when the - place describes a country. + place: Place information retrived from the database. Returns: A JSON-serialisable structure that will be handed into diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 5768fd35..81b07568 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -390,18 +390,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): """ token_info = _TokenInfo(self._cache) - names = place.get('name') + names = place.name if names: fulls, partials = self._compute_name_tokens(names) token_info.add_names(fulls, partials) - country_feature = place.get('country_feature') + country_feature = place.country_feature if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): self.add_country_names(country_feature.lower(), names) - address = place.get('address') + address = place.address if address: self._process_place_address(token_info, address) diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index 8957426b..8bfb309d 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -405,16 +405,16 @@ class LegacyNameAnalyzer(AbstractAnalyzer): """ token_info = _TokenInfo(self._cache) - names = place.get('name') + names = place.name if names: token_info.add_names(self.conn, names) - country_feature = place.get('country_feature') + country_feature = place.country_feature if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): self.add_country_names(country_feature.lower(), names) - address = place.get('address') + address = place.address if address: self._process_place_address(token_info, address) diff --git a/nominatim/tools/tiger_data.py b/nominatim/tools/tiger_data.py index ff498f77..19a12682 100644 --- a/nominatim/tools/tiger_data.py +++ 
b/nominatim/tools/tiger_data.py @@ -7,12 +7,11 @@ import logging import os import tarfile -import psycopg2.extras - from nominatim.db.connection import connect from nominatim.db.async_connection import WorkerPool from nominatim.db.sql_preprocessor import SQLPreprocessor from nominatim.errors import UsageError +from nominatim.indexer.place_info import PlaceInfo LOG = logging.getLogger() @@ -58,7 +57,7 @@ def handle_threaded_sql_statements(pool, fd, analyzer): address = dict(street=row['street'], postcode=row['postcode']) args = ('SRID=4326;' + row['geometry'], int(row['from']), int(row['to']), row['interpolation'], - psycopg2.extras.Json(analyzer.process_place(dict(address=address))), + PlaceInfo({'address': address}).analyze(analyzer), analyzer.normalize_postcode(row['postcode'])) except ValueError: continue diff --git a/test/python/dummy_tokenizer.py b/test/python/dummy_tokenizer.py index 69202bc3..db0f32cd 100644 --- a/test/python/dummy_tokenizer.py +++ b/test/python/dummy_tokenizer.py @@ -1,6 +1,7 @@ """ Tokenizer for testing. """ +from nominatim.indexer.place_info import PlaceInfo def create(dsn, data_dir): """ Create a new instance of the tokenizer provided by this module. @@ -68,4 +69,5 @@ class DummyNameAnalyzer: @staticmethod def process_place(place): + assert isinstance(place, PlaceInfo) return {} diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py index ed079269..28c6ef7a 100644 --- a/test/python/test_tokenizer_icu.py +++ b/test/python/test_tokenizer_icu.py @@ -11,6 +11,7 @@ from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules from nominatim.tokenizer.icu_rule_loader import ICURuleLoader from nominatim.db import properties from nominatim.db.sql_preprocessor import SQLPreprocessor +from nominatim.indexer.place_info import PlaceInfo from mock_icu_word_table import MockIcuWordTable @@ -322,30 +323,37 @@ class TestPlaceNames: assert eval(info['names']) == set((t[2] for t in tokens)) + def process_named_place(self, names, country_feature=None): + place = {'name': names} + if country_feature: + place['country_feature'] = country_feature + + return self.analyzer.process_place(PlaceInfo(place)) + + def test_simple_names(self): - info = self.analyzer.process_place({'name': {'name': 'Soft bAr', 'ref': '34'}}) + info = self.process_named_place({'name': 'Soft bAr', 'ref': '34'}) self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34') @pytest.mark.parametrize('sep', [',' , ';']) def test_names_with_separator(self, sep): - info = self.analyzer.process_place({'name': {'name': sep.join(('New York', 'Big Apple'))}}) + info = self.process_named_place({'name': sep.join(('New York', 'Big Apple'))}) self.expect_name_terms(info, '#New York', '#Big Apple', 'new', 'york', 'big', 'apple') def test_full_names_with_bracket(self): - info = self.analyzer.process_place({'name': {'name': 'Houseboat (left)'}}) + info = self.process_named_place({'name': 'Houseboat (left)'}) self.expect_name_terms(info, '#Houseboat (left)', '#Houseboat', 'houseboat', 'left') def test_country_name(self, word_table): - info = self.analyzer.process_place({'name': {'name': 'Norge'}, - 'country_feature': 'no'}) + info = self.process_named_place({'name': 'Norge'}, country_feature='no') self.expect_name_terms(info, '#norge', 'norge') assert word_table.get_country() == {('no', 'NORGE')} @@ -361,7 +369,7 @@ class TestPlaceAddress: def process_address(self, **kwargs): - return self.analyzer.process_place({'address': kwargs}) + return 
self.analyzer.process_place(PlaceInfo({'address': kwargs})) def name_token_set(self, *expected_terms): diff --git a/test/python/test_tokenizer_legacy.py b/test/python/test_tokenizer_legacy.py index 4dd3a141..2545c2db 100644 --- a/test/python/test_tokenizer_legacy.py +++ b/test/python/test_tokenizer_legacy.py @@ -5,6 +5,7 @@ import shutil import pytest +from nominatim.indexer.place_info import PlaceInfo from nominatim.tokenizer import legacy_tokenizer from nominatim.db import properties from nominatim.errors import UsageError @@ -284,21 +285,21 @@ def test_add_more_country_names(analyzer, word_table, make_standard_name): def test_process_place_names(analyzer, make_keywords): - info = analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}}) + info = analyzer.process_place(PlaceInfo({'name' : {'name' : 'Soft bAr', 'ref': '34'}})) assert info['names'] == '{1,2,3}' @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345']) def test_process_place_postcode(analyzer, create_postcode_id, word_table, pcode): - analyzer.process_place({'address': {'postcode' : pcode}}) + analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}})) assert word_table.get_postcodes() == {pcode, } @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836']) def test_process_place_bad_postcode(analyzer, create_postcode_id, word_table, pcode): - analyzer.process_place({'address': {'postcode' : pcode}}) + analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}})) assert not word_table.get_postcodes() @@ -319,7 +320,7 @@ class TestHousenumberName: @staticmethod @pytest.mark.parametrize('hnr', ['123a', '1', '101']) def test_process_place_housenumbers_simple(analyzer, hnr): - info = analyzer.process_place({'address': {'housenumber' : hnr}}) + info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : hnr}})) assert info['hnr'] == hnr assert info['hnr_tokens'].startswith("{") @@ -327,15 +328,15 @@ class TestHousenumberName: @staticmethod def test_process_place_housenumbers_lists(analyzer): - info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}}) + info = analyzer.process_place(PlaceInfo({'address': {'conscriptionnumber' : '1; 2;3'}})) assert set(info['hnr'].split(';')) == set(('1', '2', '3')) @staticmethod def test_process_place_housenumbers_duplicates(analyzer): - info = analyzer.process_place({'address': {'housenumber' : '134', + info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : '134', 'conscriptionnumber' : '134', - 'streetnumber' : '99a'}}) + 'streetnumber' : '99a'}})) assert set(info['hnr'].split(';')) == set(('134', '99a')) From be65c8303f18d0f92bbf5bc9558f8789d33f21d9 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 29 Sep 2021 11:54:14 +0200 Subject: [PATCH 2/7] export more data for the tokenizer name preparation Adds class, type, country and rank to the exported information and removes the rather odd hack for countries. Whether a place represents a country boundary can now be computed by the tokenizer. 
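As an illustration (not part of this patch itself), a tokenizer can now derive
the country check purely from the exported fields; the values below mirror the
updated test case:

    from nominatim.indexer.place_info import PlaceInfo

    place = PlaceInfo({'name': {'name': 'Norge'},
                       'country_code': 'no',
                       'rank_address': 4,
                       'class': 'boundary',
                       'type': 'administrative'})

    # is_country() checks the address rank, the boundary/administrative tagging
    # and the presence of a country code - no country_feature hack required.
    assert place.is_country() and place.country_code == 'no'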
--- lib-sql/functions/placex_triggers.sql | 50 +++++++++++++------------ nominatim/indexer/place_info.py | 30 +++++++++++++-- nominatim/indexer/runners.py | 2 +- nominatim/tokenizer/icu_tokenizer.py | 5 +-- nominatim/tokenizer/legacy_tokenizer.py | 5 +-- test/python/test_indexing.py | 26 +++++++++---- test/python/test_tokenizer_icu.py | 12 ++++-- 7 files changed, 85 insertions(+), 45 deletions(-) diff --git a/lib-sql/functions/placex_triggers.sql b/lib-sql/functions/placex_triggers.sql index 9c2a67a1..8ae8cf39 100644 --- a/lib-sql/functions/placex_triggers.sql +++ b/lib-sql/functions/placex_triggers.sql @@ -1,30 +1,33 @@ -- Trigger functions for the placex table. +-- Information returned by update preparation. +DROP TYPE IF EXISTS prepare_update_info CASCADE; +CREATE TYPE prepare_update_info AS ( + name HSTORE, + address HSTORE, + rank_address SMALLINT, + country_code TEXT, + class TEXT, + type TEXT, + linked_place_id BIGINT +); + -- Retrieve the data needed by the indexer for updating the place. --- --- Return parameters: --- name list of names --- address list of address tags, either from the object or a surrounding --- building --- country_feature If the place is a country feature, this contains the --- country code, otherwise it is null. -CREATE OR REPLACE FUNCTION placex_prepare_update(p placex, - OUT name HSTORE, - OUT address HSTORE, - OUT country_feature VARCHAR, - OUT linked_place_id BIGINT) +CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex) + RETURNS prepare_update_info AS $$ DECLARE location RECORD; + result prepare_update_info; BEGIN -- For POI nodes, check if the address should be derived from a surrounding -- building. IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN - address := p.address; + result.address := p.address; ELSE -- The additional && condition works around the misguided query -- planner of postgis 3.0. - SELECT placex.address || hstore('_inherited', '') INTO address + SELECT placex.address || hstore('_inherited', '') INTO result.address FROM placex WHERE ST_Covers(geometry, p.centroid) and geometry && p.centroid @@ -34,27 +37,26 @@ BEGIN LIMIT 1; END IF; - address := address - '_unlisted_place'::TEXT; - name := p.name; + result.address := result.address - '_unlisted_place'::TEXT; + result.name := p.name; + result.class := p.class; + result.type := p.type; + result.country_code := p.country_code; + result.rank_address := p.rank_address; -- Names of linked places need to be merged in, so search for a linkable -- place already here. SELECT * INTO location FROM find_linked_place(p); IF location.place_id is not NULL THEN - linked_place_id := location.place_id; + result.linked_place_id := location.place_id; IF NOT location.name IS NULL THEN - name := location.name || name; + result.name := location.name || result.name; END IF; END IF; - country_feature := CASE WHEN p.admin_level = 2 - and p.class = 'boundary' and p.type = 'administrative' - and p.osm_type = 'R' - THEN p.country_code - ELSE null - END; + RETURN result; END; $$ LANGUAGE plpgsql STABLE; diff --git a/nominatim/indexer/place_info.py b/nominatim/indexer/place_info.py index fd179fef..06d730e0 100644 --- a/nominatim/indexer/place_info.py +++ b/nominatim/indexer/place_info.py @@ -38,7 +38,31 @@ class PlaceInfo: @property - def country_feature(self): - """ Return the country code if the place is a valid country boundary. + def country_code(self): + """ The country code of the country the place is in. 
Guaranteed + to be a two-letter lower-case string or None, if no country + could be found. """ - return self._info.get('country_feature') + return self._info.get('country_code') + + + @property + def rank_address(self): + """ The computed rank address before rank correction. + """ + return self._info.get('rank_address') + + + def is_a(self, key, value): + """ Check if the place's primary tag corresponds to the given + key and value. + """ + return self._info.get('class') == key and self._info.get('type') == value + + + def is_country(self): + """ Check if the place is a valid country boundary. + """ + return self.rank_address == 4 \ + and self.is_a('boundary', 'administrative') \ + and self.country_code is not None diff --git a/nominatim/indexer/runners.py b/nominatim/indexer/runners.py index 43966419..70536a71 100644 --- a/nominatim/indexer/runners.py +++ b/nominatim/indexer/runners.py @@ -39,7 +39,7 @@ class AbstractPlacexRunner: @staticmethod def get_place_details(worker, ids): - worker.perform("""SELECT place_id, (placex_prepare_update(placex)).* + worker.perform("""SELECT place_id, (placex_indexing_prepare(placex)).* FROM placex WHERE place_id IN %s""", (tuple((p[0] for p in ids)), )) diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 81b07568..fbaa2596 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -397,9 +397,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): token_info.add_names(fulls, partials) - country_feature = place.country_feature - if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): - self.add_country_names(country_feature.lower(), names) + if place.is_country(): + self.add_country_names(place.country_code, names) address = place.address if address: diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index 8bfb309d..dc6972dc 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -410,9 +410,8 @@ class LegacyNameAnalyzer(AbstractAnalyzer): if names: token_info.add_names(self.conn, names) - country_feature = place.country_feature - if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): - self.add_country_names(country_feature.lower(), names) + if place.is_country(): + self.add_country_names(place.country_code, names) address = place.address if address: diff --git a/test/python/test_indexing.py b/test/python/test_indexing.py index 60ad0bc4..4c9d940d 100644 --- a/test/python/test_indexing.py +++ b/test/python/test_indexing.py @@ -29,6 +29,7 @@ class IndexerTestDB: indexed_date TIMESTAMP, partition SMALLINT, admin_level SMALLINT, + country_code TEXT, address HSTORE, token_info JSONB, geometry_sector INTEGER)""") @@ -54,15 +55,26 @@ class IndexerTestDB: END IF; RETURN NEW; END; $$ LANGUAGE plpgsql;""") - cur.execute("""CREATE OR REPLACE FUNCTION placex_prepare_update(p placex, - OUT name HSTORE, - OUT address HSTORE, - OUT country_feature VARCHAR, - OUT linked_place_id BIGINT) + cur.execute("DROP TYPE IF EXISTS prepare_update_info CASCADE") + cur.execute("""CREATE TYPE prepare_update_info AS ( + name HSTORE, + address HSTORE, + rank_address SMALLINT, + country_code TEXT, + class TEXT, + type TEXT, + linked_place_id BIGINT + )""") + cur.execute("""CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex, + OUT result prepare_update_info) AS $$ BEGIN - address := p.address; - name := p.name; + result.address := p.address; + result.name := p.name; + 
result.class := p.class; + result.type := p.type; + result.country_code := p.country_code; + result.rank_address := p.rank_address; END; $$ LANGUAGE plpgsql STABLE; """) diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py index 28c6ef7a..bbfc0b12 100644 --- a/test/python/test_tokenizer_icu.py +++ b/test/python/test_tokenizer_icu.py @@ -323,10 +323,8 @@ class TestPlaceNames: assert eval(info['names']) == set((t[2] for t in tokens)) - def process_named_place(self, names, country_feature=None): + def process_named_place(self, names): place = {'name': names} - if country_feature: - place['country_feature'] = country_feature return self.analyzer.process_place(PlaceInfo(place)) @@ -353,7 +351,13 @@ class TestPlaceNames: def test_country_name(self, word_table): - info = self.process_named_place({'name': 'Norge'}, country_feature='no') + place = PlaceInfo({'name' : {'name': 'Norge'}, + 'country_code': 'no', + 'rank_address': 4, + 'class': 'boundary', + 'type': 'administrative'}) + + info = self.analyzer.process_place(place) self.expect_name_terms(info, '#norge', 'norge') assert word_table.get_country() == {('no', 'NORGE')} From 5e5addcdbf20022b3fd76dc8c2275a1eecf12a3c Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 29 Sep 2021 14:16:09 +0200 Subject: [PATCH 3/7] fix typo --- nominatim/tokenizer/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py index d827f813..e126507b 100644 --- a/nominatim/tokenizer/base.py +++ b/nominatim/tokenizer/base.py @@ -136,7 +136,7 @@ class AbstractTokenizer(ABC): the tokenizer remains stable over updates. Arguments: - config: Read-only object with configuration obtions. + config: Read-only object with configuration options. init_db: When set to False, then initialisation of database tables should be skipped. This option is only required for @@ -166,7 +166,7 @@ class AbstractTokenizer(ABC): during query time. Arguments: - config: Read-only object with configuration obtions. + config: Read-only object with configuration options. """ pass @@ -181,7 +181,7 @@ class AbstractTokenizer(ABC): data structures or data itself must not be changed by this function. Arguments: - config: Read-only object with configuration obtions. + config: Read-only object with configuration options. """ pass From 16daa57e4757e4daeffec1e61630f989727dc563 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 29 Sep 2021 17:37:04 +0200 Subject: [PATCH 4/7] unify ICUNameProcessorRules and ICURuleLoader There is no need for the additional layer of indirection that the ICUNameProcessorRules class adds. The ICURuleLoader can fill the database properties directly. 
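Sketch of the intended flow after this change (illustration only; `config` and
`conn` stand for a Configuration object and an open database connection):

    from nominatim.tokenizer.icu_rule_loader import ICURuleLoader

    def setup_rules(config, conn):
        # At import time: read the rules from icu_tokenizer.yaml and persist
        # the relevant parts directly in the database property table.
        loader = ICURuleLoader(config)
        loader.save_config_to_db(conn)
        return loader.make_token_analysis()

    def restore_rules(config, conn):
        # On later runs: restore the saved rules and rebuild the name
        # processor without going through ICUNameProcessorRules.
        loader = ICURuleLoader(config)
        loader.load_config_from_db(conn)
        return loader.make_token_analysis()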
--- nominatim/tokenizer/base.py | 10 +++- nominatim/tokenizer/factory.py | 2 +- nominatim/tokenizer/icu_name_processor.py | 54 +++---------------- nominatim/tokenizer/icu_rule_loader.py | 47 ++++++++++++++-- nominatim/tokenizer/icu_tokenizer.py | 29 +++++----- nominatim/tokenizer/icu_variants.py | 32 ----------- nominatim/tokenizer/legacy_tokenizer.py | 4 +- nominatim/tools/check_database.py | 2 +- test/python/dummy_tokenizer.py | 4 +- test/python/test_tokenizer_icu.py | 8 +-- .../test_tokenizer_icu_name_processor.py | 27 +++++----- test/python/test_tokenizer_icu_rule_loader.py | 35 +++++++----- test/python/test_tokenizer_legacy.py | 4 +- test/python/test_tools_check_database.py | 2 +- 14 files changed, 123 insertions(+), 137 deletions(-) diff --git a/nominatim/tokenizer/base.py b/nominatim/tokenizer/base.py index e126507b..53289c78 100644 --- a/nominatim/tokenizer/base.py +++ b/nominatim/tokenizer/base.py @@ -149,11 +149,14 @@ class AbstractTokenizer(ABC): @abstractmethod - def init_from_project(self) -> None: + def init_from_project(self, config: Configuration) -> None: """ Initialise the tokenizer from an existing database setup. The function should load all previously saved configuration from the project directory and/or the property table. + + Arguments: + config: Read-only object with configuration options. """ pass @@ -187,7 +190,7 @@ class AbstractTokenizer(ABC): @abstractmethod - def check_database(self) -> str: + def check_database(self, config: Configuration) -> str: """ Check that the database is set up correctly and ready for being queried. @@ -196,6 +199,9 @@ class AbstractTokenizer(ABC): description of the issue as well as hints for the user on how to resolve the issue. + Arguments: + config: Read-only object with configuration options. + Return `None`, if no issue was found. """ pass diff --git a/nominatim/tokenizer/factory.py b/nominatim/tokenizer/factory.py index 069672d4..dc3e7411 100644 --- a/nominatim/tokenizer/factory.py +++ b/nominatim/tokenizer/factory.py @@ -85,6 +85,6 @@ def get_tokenizer_for_db(config): tokenizer_module = _import_tokenizer(name) tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir) - tokenizer.init_from_project() + tokenizer.init_from_project(config) return tokenizer diff --git a/nominatim/tokenizer/icu_name_processor.py b/nominatim/tokenizer/icu_name_processor.py index 93d2b0ff..544f5ebc 100644 --- a/nominatim/tokenizer/icu_name_processor.py +++ b/nominatim/tokenizer/icu_name_processor.py @@ -8,67 +8,25 @@ import itertools from icu import Transliterator import datrie -from nominatim.db.properties import set_property, get_property -from nominatim.tokenizer import icu_variants as variants - -DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation" -DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration" -DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements" -DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization" - - -class ICUNameProcessorRules: - """ Data object that saves the rules needed for the name processor. - - The rules can either be initialised through an ICURuleLoader or - be loaded from a database when a connection is given. 
- """ - def __init__(self, loader=None, conn=None): - if loader is not None: - self.norm_rules = loader.get_normalization_rules() - self.trans_rules = loader.get_transliteration_rules() - self.replacements = loader.get_replacement_pairs() - self.search_rules = loader.get_search_rules() - elif conn is not None: - self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES) - self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES) - self.replacements = \ - variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS)) - self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES) - else: - assert False, "Parameter loader or conn required." - - - def save_rules(self, conn): - """ Save the rules in the property table of the given database. - the rules can be loaded again by handing in a connection into - the constructor of the class. - """ - set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules) - set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules) - set_property(conn, DBCFG_IMPORT_REPLACEMENTS, - variants.pickle_variant_set(self.replacements)) - set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules) - class ICUNameProcessor: """ Collects the different transformation rules for normalisation of names - and provides the functions to aply the transformations. + and provides the functions to apply the transformations. """ - def __init__(self, rules): + def __init__(self, norm_rules, trans_rules, replacements): self.normalizer = Transliterator.createFromRules("icu_normalization", - rules.norm_rules) + norm_rules) self.to_ascii = Transliterator.createFromRules("icu_to_ascii", - rules.trans_rules + + trans_rules + ";[:Space:]+ > ' '") self.search = Transliterator.createFromRules("icu_search", - rules.search_rules) + norm_rules + trans_rules) # Intermediate reorder by source. Also compute required character set. immediate = defaultdict(list) chars = set() - for variant in rules.replacements: + for variant in replacements: if variant.source[-1] == ' ' and variant.replacement[-1] == ' ': replstr = variant.replacement[:-1] else: diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py index 0e6e40b4..bd0739f2 100644 --- a/nominatim/tokenizer/icu_rule_loader.py +++ b/nominatim/tokenizer/icu_rule_loader.py @@ -2,17 +2,25 @@ Helper class to create ICU rules from a configuration file. """ import io +import json import logging import itertools import re from icu import Transliterator +from nominatim.db.properties import set_property, get_property from nominatim.errors import UsageError +from nominatim.tokenizer.icu_name_processor import ICUNameProcessor import nominatim.tokenizer.icu_variants as variants LOG = logging.getLogger() +DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation" +DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration" +DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules" + + def _flatten_config_list(content): if not content: return [] @@ -46,12 +54,43 @@ class ICURuleLoader: """ Compiler for ICU rules from a tokenizer configuration file. 
""" - def __init__(self, rules): + def __init__(self, config): + rules = config.load_sub_configuration('icu_tokenizer.yaml', + config='TOKENIZER_CONFIG') + self.variants = set() self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization') self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration') - self._parse_variant_list(self._get_section(rules, 'variants')) + self.analysis_rules = self._get_section(rules, 'variants') + self._parse_variant_list() + + + def load_config_from_db(self, conn): + """ Get previously saved parts of the configuration from the + database. + """ + self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES) + self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES) + self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES)) + self._parse_variant_list() + + + def save_config_to_db(self, conn): + """ Save the part of the configuration that cannot be changed into + the database. + """ + set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules) + set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules) + set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules)) + + + def make_token_analysis(self): + """ Create a token analyser from the reviouly loaded rules. + """ + return ICUNameProcessor(self.normalization_rules, + self.transliteration_rules, + self.variants) def get_search_rules(self): @@ -112,7 +151,9 @@ class ICURuleLoader: return ';'.join(_flatten_config_list(content)) + ';' - def _parse_variant_list(self, rules): + def _parse_variant_list(self): + rules = self.analysis_rules + self.variants.clear() if not rules: diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index fbaa2596..87906d71 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -14,7 +14,6 @@ from nominatim.db.properties import set_property, get_property from nominatim.db.utils import CopyBuffer from nominatim.db.sql_preprocessor import SQLPreprocessor from nominatim.tokenizer.icu_rule_loader import ICURuleLoader -from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization" @@ -36,7 +35,7 @@ class LegacyICUTokenizer(AbstractTokenizer): def __init__(self, dsn, data_dir): self.dsn = dsn self.data_dir = data_dir - self.naming_rules = None + self.loader = None self.term_normalization = None @@ -46,9 +45,8 @@ class LegacyICUTokenizer(AbstractTokenizer): This copies all necessary data in the project directory to make sure the tokenizer remains stable even over updates. """ - loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml', - config='TOKENIZER_CONFIG')) - self.naming_rules = ICUNameProcessorRules(loader=loader) + self.loader = ICURuleLoader(config) + self.term_normalization = config.TERM_NORMALIZATION self._install_php(config.lib_dir.php) @@ -59,11 +57,13 @@ class LegacyICUTokenizer(AbstractTokenizer): self._init_db_tables(config) - def init_from_project(self): + def init_from_project(self, config): """ Initialise the tokenizer from the project directory. 
""" + self.loader = ICURuleLoader(config) + with connect(self.dsn) as conn: - self.naming_rules = ICUNameProcessorRules(conn=conn) + self.loader.load_config_from_db(conn) self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION) @@ -81,12 +81,12 @@ class LegacyICUTokenizer(AbstractTokenizer): sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql') - def check_database(self): + def check_database(self, config): """ Check that the tokenizer is set up correctly. """ - self.init_from_project() + self.init_from_project(config) - if self.naming_rules is None: + if self.term_normalization is None: return "Configuration for tokenizer 'icu' are missing." return None @@ -107,7 +107,7 @@ class LegacyICUTokenizer(AbstractTokenizer): Analyzers are not thread-safe. You need to instantiate one per thread. """ - return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules)) + return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis()) def _install_php(self, phpdir): @@ -118,7 +118,7 @@ class LegacyICUTokenizer(AbstractTokenizer): 🜵', 'street -> st') + config = cfgfile('saint -> 🜵', 'street -> st') - rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath)) - proc = ICUNameProcessor(rules) + proc = ICURuleLoader(config).make_token_analysis() assert get_normalized_variants(proc, '🜵') == [] assert get_normalized_variants(proc, '🜳') == [] @@ -83,8 +86,8 @@ VARIANT_TESTS = [ @pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS) def test_variants(cfgfile, rules, name, variants): - fpath = cfgfile(*rules) - proc = ICUNameProcessor(ICUNameProcessorRules(loader=ICURuleLoader(fpath))) + config = cfgfile(*rules) + proc = ICURuleLoader(config).make_token_analysis() result = get_normalized_variants(proc, name) @@ -93,10 +96,8 @@ def test_variants(cfgfile, rules, name, variants): def test_search_normalized(cfgfile): - fpath = cfgfile('~street => s,st', 'master => mstr') - - rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath)) - proc = ICUNameProcessor(rules) + config = cfgfile('~street => s,st', 'master => mstr') + proc = ICURuleLoader(config).make_token_analysis() assert proc.get_search_normalized('Master Street') == 'master street' assert proc.get_search_normalized('Earnes St') == 'earnes st' diff --git a/test/python/test_tokenizer_icu_rule_loader.py b/test/python/test_tokenizer_icu_rule_loader.py index c3480de8..6ec53edc 100644 --- a/test/python/test_tokenizer_icu_rule_loader.py +++ b/test/python/test_tokenizer_icu_rule_loader.py @@ -12,7 +12,16 @@ from nominatim.errors import UsageError from icu import Transliterator @pytest.fixture -def cfgrules(): +def test_config(def_config, tmp_path): + project_dir = tmp_path / 'project_dir' + project_dir.mkdir() + def_config.project_dir = project_dir + + return def_config + + +@pytest.fixture +def cfgrules(test_config): def _create_config(*variants, **kwargs): content = dedent("""\ normalization: @@ -29,19 +38,21 @@ def cfgrules(): content += '\n'.join((" - " + s for s in variants)) + '\n' for k, v in kwargs: content += " {}: {}\n".format(k, v) - return yaml.safe_load(content) + (test_config.project_dir / 'icu_tokenizer.yaml').write_text(content) + + return test_config return _create_config -def test_empty_rule_set(): - rule_cfg = yaml.safe_load(dedent("""\ +def test_empty_rule_set(test_config): + (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\ normalization: transliteration: variants: """)) - rules = ICURuleLoader(rule_cfg) + rules = ICURuleLoader(test_config) assert rules.get_search_rules() == '' 
assert rules.get_normalization_rules() == '' assert rules.get_transliteration_rules() == '' @@ -50,11 +61,12 @@ def test_empty_rule_set(): CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants') @pytest.mark.parametrize("section", CONFIG_SECTIONS) -def test_missing_section(section): +def test_missing_section(section, test_config): rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section} + (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg)) with pytest.raises(UsageError): - ICURuleLoader(rule_cfg) + ICURuleLoader(test_config) def test_get_search_rules(cfgrules): @@ -88,9 +100,8 @@ def test_get_transliteration_rules(cfgrules): assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt " -def test_transliteration_rules_from_file(def_config, tmp_path): - def_config.project_dir = tmp_path - cfgpath = tmp_path / ('test_config.yaml') +def test_transliteration_rules_from_file(test_config): + cfgpath = test_config.project_dir / ('icu_tokenizer.yaml') cfgpath.write_text(dedent("""\ normalization: transliteration: @@ -98,10 +109,10 @@ def test_transliteration_rules_from_file(def_config, tmp_path): - !include transliteration.yaml variants: """)) - transpath = tmp_path / ('transliteration.yaml') + transpath = test_config.project_dir / ('transliteration.yaml') transpath.write_text('- "x > y"') - loader = ICURuleLoader(def_config.load_sub_configuration('test_config.yaml')) + loader = ICURuleLoader(test_config) rules = loader.get_transliteration_rules() trans = Transliterator.createFromRules("test", rules) diff --git a/test/python/test_tokenizer_legacy.py b/test/python/test_tokenizer_legacy.py index 2545c2db..53d45c1c 100644 --- a/test/python/test_tokenizer_legacy.py +++ b/test/python/test_tokenizer_legacy.py @@ -132,10 +132,10 @@ def test_init_module_custom(tokenizer_factory, test_config, assert not (test_config.project_dir / 'module').exists() -def test_init_from_project(tokenizer_setup, tokenizer_factory): +def test_init_from_project(tokenizer_setup, tokenizer_factory, test_config): tok = tokenizer_factory() - tok.init_from_project() + tok.init_from_project(test_config) assert tok.normalization is not None diff --git a/test/python/test_tools_check_database.py b/test/python/test_tools_check_database.py index aed5cb7e..edba3236 100644 --- a/test/python/test_tools_check_database.py +++ b/test/python/test_tools_check_database.py @@ -53,7 +53,7 @@ def test_check_tokenizer(temp_db_conn, def_config, monkeypatch, check_result, state): class _TestTokenizer: @staticmethod - def check_database(): + def check_database(_): return check_result monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db', From 8171fe4571a57bf8e5b2a8f676989e973897e2e7 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Thu, 30 Sep 2021 21:30:13 +0200 Subject: [PATCH 5/7] introduce sanitizer step before token analysis Sanatizer functions allow to transform name and address tags before they are handed to the tokenizer. Theses transformations are visible only for the tokenizer and thus only have an influence on the search terms and address match terms for a place. Currently two sanitizers are implemented which are responsible for splitting names with multiple values and removing bracket additions. Both was previously hard-coded in the tokenizer. 
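A custom sanitizer follows the same minimal convention as the two built-in
ones (hypothetical example, not included in this patch): a module under
nominatim/tokenizer/sanitizers/ exposes a create() function that returns a
callable which may modify the name and address lists of a place.

    # nominatim/tokenizer/sanitizers/uppercase_names.py (hypothetical)
    def create(_):
        """ Return a handler that upper-cases all name values
            (for illustration only).
        """
        def _process(obj):
            # obj.names and obj.address are lists of PlaceName objects;
            # handlers may modify the lists or replace them entirely.
            obj.names = [name.clone(name=name.name.upper()) for name in obj.names]

        return _process

Such a module would be enabled with 'step: uppercase-names' in the sanitizers
section of icu_tokenizer.yaml, analogous to the two steps added below.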
--- nominatim/tokenizer/icu_rule_loader.py | 10 ++ nominatim/tokenizer/icu_tokenizer.py | 110 ++++++++------- nominatim/tokenizer/place_sanitizer.py | 127 ++++++++++++++++++ nominatim/tokenizer/sanitizers/__init__.py | 0 .../tokenizer/sanitizers/split_name_list.py | 28 ++++ .../tokenizer/sanitizers/strip_brace_terms.py | 22 +++ settings/icu_tokenizer.yaml | 3 + test/python/test_tokenizer_icu.py | 17 +-- 8 files changed, 259 insertions(+), 58 deletions(-) create mode 100644 nominatim/tokenizer/place_sanitizer.py create mode 100644 nominatim/tokenizer/sanitizers/__init__.py create mode 100644 nominatim/tokenizer/sanitizers/split_name_list.py create mode 100644 nominatim/tokenizer/sanitizers/strip_brace_terms.py diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py index bd0739f2..330179bb 100644 --- a/nominatim/tokenizer/icu_rule_loader.py +++ b/nominatim/tokenizer/icu_rule_loader.py @@ -12,6 +12,7 @@ from icu import Transliterator from nominatim.db.properties import set_property, get_property from nominatim.errors import UsageError from nominatim.tokenizer.icu_name_processor import ICUNameProcessor +from nominatim.tokenizer.place_sanitizer import PlaceSanitizer import nominatim.tokenizer.icu_variants as variants LOG = logging.getLogger() @@ -65,6 +66,9 @@ class ICURuleLoader: self.analysis_rules = self._get_section(rules, 'variants') self._parse_variant_list() + # Load optional sanitizer rule set. + self.sanitizer_rules = rules.get('sanitizers', []) + def load_config_from_db(self, conn): """ Get previously saved parts of the configuration from the @@ -85,6 +89,12 @@ class ICURuleLoader: set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules)) + def make_sanitizer(self): + """ Create a place sanitizer from the configured rules. + """ + return PlaceSanitizer(self.sanitizer_rules) + + def make_token_analysis(self): """ Create a token analyser from the reviouly loaded rules. """ diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py index 87906d71..2ece10f2 100644 --- a/nominatim/tokenizer/icu_tokenizer.py +++ b/nominatim/tokenizer/icu_tokenizer.py @@ -13,6 +13,7 @@ from nominatim.db.connection import connect from nominatim.db.properties import set_property, get_property from nominatim.db.utils import CopyBuffer from nominatim.db.sql_preprocessor import SQLPreprocessor +from nominatim.indexer.place_info import PlaceInfo from nominatim.tokenizer.icu_rule_loader import ICURuleLoader from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer @@ -107,7 +108,8 @@ class LegacyICUTokenizer(AbstractTokenizer): Analyzers are not thread-safe. You need to instantiate one per thread. """ - return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis()) + return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(), + self.loader.make_token_analysis()) def _install_php(self, phpdir): @@ -187,10 +189,11 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): normalization. """ - def __init__(self, dsn, name_proc): + def __init__(self, dsn, sanitizer, token_analysis): self.conn = connect(dsn).connection self.conn.autocommit = True - self.name_processor = name_proc + self.sanitizer = sanitizer + self.token_analysis = token_analysis self._cache = _TokenCache() @@ -203,6 +206,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): self.conn = None + def _search_normalized(self, name): + """ Return the search token transliteration of the given name. 
+ """ + return self.token_analysis.get_search_normalized(name) + + + def _normalized(self, name): + """ Return the normalized version of the given name with all + non-relevant information removed. + """ + return self.token_analysis.get_normalized(name) + + def get_word_token_info(self, words): """ Return token information for the given list of words. If a word starts with # it is assumed to be a full name @@ -218,9 +234,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): partial_tokens = {} for word in words: if word.startswith('#'): - full_tokens[word] = self.name_processor.get_search_normalized(word[1:]) + full_tokens[word] = self._search_normalized(word[1:]) else: - partial_tokens[word] = self.name_processor.get_search_normalized(word) + partial_tokens[word] = self._search_normalized(word) with self.conn.cursor() as cur: cur.execute("""SELECT word_token, word_id @@ -251,7 +267,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): This function takes minor shortcuts on transliteration. """ - return self.name_processor.get_search_normalized(hnr) + return self._search_normalized(hnr) def update_postcodes_from_db(self): """ Update postcode tokens in the word table from the location_postcode @@ -274,7 +290,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): if postcode is None: to_delete.append(word) else: - copystr.add(self.name_processor.get_search_normalized(postcode), + copystr.add(self._search_normalized(postcode), 'P', postcode) if to_delete: @@ -292,7 +308,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): completely replaced. Otherwise the phrases are added to the already existing ones. """ - norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3]) + norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3]) for p in phrases)) with self.conn.cursor() as cur: @@ -322,7 +338,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): added = 0 with CopyBuffer() as copystr: for word, cls, typ, oper in to_add: - term = self.name_processor.get_search_normalized(word) + term = self._search_normalized(word) if term: copystr.add(term, 'S', word, json.dumps({'class': cls, 'type': typ, @@ -356,9 +372,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): def add_country_names(self, country_code, names): """ Add names for the given country to the search index. """ + # Make sure any name preprocessing for country names applies. + info = PlaceInfo({'name': names, 'country_code': country_code, + 'rank_address': 4, 'class': 'boundary', + 'type': 'administrative'}) + self._add_country_full_names(country_code, + self.sanitizer.process_names(info)[0]) + + + def _add_country_full_names(self, country_code, names): + """ Add names for the given country from an already sanitized + name list. + """ word_tokens = set() - for name in self._compute_full_names(names): - norm_name = self.name_processor.get_search_normalized(name) + for name in names: + norm_name = self._search_normalized(name.name) if norm_name: word_tokens.add(norm_name) @@ -384,12 +412,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): def process_place(self, place): """ Determine tokenizer information about the given place. - Returns a JSON-serialisable structure that will be handed into + Returns a JSON-serializable structure that will be handed into the database via the token_info field. 
""" token_info = _TokenInfo(self._cache) - names = place.name + names, address = self.sanitizer.process_names(place) if names: fulls, partials = self._compute_name_tokens(names) @@ -397,9 +425,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): token_info.add_names(fulls, partials) if place.is_country(): - self.add_country_names(place.country_code, names) + self._add_country_full_names(place.country_code, names) - address = place.address if address: self._process_place_address(token_info, address) @@ -409,18 +436,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): def _process_place_address(self, token_info, address): hnrs = [] addr_terms = [] - for key, value in address.items(): - if key == 'postcode': - self._add_postcode(value) - elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'): - hnrs.append(value) - elif key == 'street': - token_info.add_street(self._compute_partial_tokens(value)) - elif key == 'place': - token_info.add_place(self._compute_partial_tokens(value)) - elif not key.startswith('_') and \ - key not in ('country', 'full'): - addr_terms.append((key, self._compute_partial_tokens(value))) + for item in address: + if item.kind == 'postcode': + self._add_postcode(item.name) + elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'): + hnrs.append(item.name) + elif item.kind == 'street': + token_info.add_street(self._compute_partial_tokens(item.name)) + elif item.kind == 'place': + token_info.add_place(self._compute_partial_tokens(item.name)) + elif not item.kind.startswith('_') and \ + item.kind not in ('country', 'full'): + addr_terms.append((item.kind, self._compute_partial_tokens(item.name))) if hnrs: hnrs = self._split_housenumbers(hnrs) @@ -433,7 +460,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): """ Normalize the given term, split it into partial words and return then token list for them. """ - norm_name = self.name_processor.get_search_normalized(name) + norm_name = self._search_normalized(name) tokens = [] need_lookup = [] @@ -456,19 +483,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): return tokens + def _compute_name_tokens(self, names): """ Computes the full name and partial name tokens for the given dictionary of names. """ - full_names = self._compute_full_names(names) full_tokens = set() partial_tokens = set() - for name in full_names: - norm_name = self.name_processor.get_normalized(name) + for name in names: + norm_name = self._normalized(name.name) full, part = self._cache.names.get(norm_name, (None, None)) if full is None: - variants = self.name_processor.get_variants_ascii(norm_name) + variants = self.token_analysis.get_variants_ascii(norm_name) if not variants: continue @@ -485,23 +512,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): return full_tokens, partial_tokens - @staticmethod - def _compute_full_names(names): - """ Return the set of all full name word ids to be used with the - given dictionary of names. - """ - full_names = set() - for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)): - if name: - full_names.add(name) - - brace_idx = name.find('(') - if brace_idx >= 0: - full_names.add(name[:brace_idx].strip()) - - return full_names - - def _add_postcode(self, postcode): """ Make sure the normalized postcode is present in the word table. 
""" @@ -509,7 +519,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer): postcode = self.normalize_postcode(postcode) if postcode not in self._cache.postcodes: - term = self.name_processor.get_search_normalized(postcode) + term = self._search_normalized(postcode) if not term: return diff --git a/nominatim/tokenizer/place_sanitizer.py b/nominatim/tokenizer/place_sanitizer.py new file mode 100644 index 00000000..5961dcf0 --- /dev/null +++ b/nominatim/tokenizer/place_sanitizer.py @@ -0,0 +1,127 @@ +""" +Handler for cleaning name and address tags in place information before it +is handed to the token analysis. +""" +import importlib + +from nominatim.errors import UsageError + +class PlaceName: + """ A searchable name for a place together with properties. + Every name object saves the name proper and two basic properties: + * 'kind' describes the name of the OSM key used without any suffixes + (i.e. the part after the colon removed) + * 'suffix' contains the suffix of the OSM tag, if any. The suffix + is the part of the key after the first colon. + In addition to that, the name may have arbitrary additional attributes. + Which attributes are used, depends on the token analyser. + """ + + def __init__(self, name, kind, suffix): + self.name = name + self.kind = kind + self.suffix = suffix + self.attr = {} + + + def __repr__(self): + return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')" + + + def clone(self, name=None, kind=None, suffix=None, attr=None): + """ Create a deep copy of the place name, optionally with the + given parameters replaced. In the attribute list only the given + keys are updated. The list is not replaced completely. + In particular, the function cannot to be used to remove an + attribute from a place name. + """ + newobj = PlaceName(name or self.name, + kind or self.kind, + suffix or self.suffix) + + newobj.attr.update(self.attr) + if attr: + newobj.attr.update(attr) + + return newobj + + + def set_attr(self, key, value): + """ Add the given property to the name. If the property was already + set, then the value is overwritten. + """ + self.attr[key] = value + + + def get_attr(self, key, default=None): + """ Return the given property or the value of 'default' if it + is not set. + """ + return self.attr.get(key, default) + + + def has_attr(self, key): + """ Check if the given attribute is set. + """ + return key in self.attr + + +class _ProcessInfo: + """ Container class for information handed into to handler functions. + The 'names' and 'address' members are mutable. A handler must change + them by either modifying the lists place or replacing the old content + with a new list. + """ + + def __init__(self, place): + self.place = place + self.names = self._convert_name_dict(place.name) + self.address = self._convert_name_dict(place.address) + + + @staticmethod + def _convert_name_dict(names): + """ Convert a dictionary of names into a list of PlaceNames. + The dictionary key is split into the primary part of the key + and the suffix (the part after an optional colon). + """ + out = [] + + if names: + for key, value in names.items(): + parts = key.split(':', 1) + out.append(PlaceName(value.strip(), + parts[0].strip(), + parts[1].strip() if len(parts) > 1 else None)) + + return out + + +class PlaceSanitizer: + """ Controller class which applies sanitizer functions on the place + names and address before they are used by the token analysers. 
+ """ + + def __init__(self, rules): + self.handlers = [] + + if rules: + for func in rules: + if 'step' not in func: + raise UsageError("Sanitizer rule is missing the 'step' attribute.") + module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_') + handler_module = importlib.import_module(module_name) + self.handlers.append(handler_module.create(func)) + + + def process_names(self, place): + """ Extract a sanitized list of names and address parts from the + given place. The function returns a tuple + (list of names, list of address names) + """ + obj = _ProcessInfo(place) + + for func in self.handlers: + func(obj) + + return obj.names, obj.address diff --git a/nominatim/tokenizer/sanitizers/__init__.py b/nominatim/tokenizer/sanitizers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nominatim/tokenizer/sanitizers/split_name_list.py b/nominatim/tokenizer/sanitizers/split_name_list.py new file mode 100644 index 00000000..93651f3e --- /dev/null +++ b/nominatim/tokenizer/sanitizers/split_name_list.py @@ -0,0 +1,28 @@ +""" +Name processor that splits name values with multiple values into their components. +""" +import re + +def create(func): + """ Create a name processing function that splits name values with + multiple values into their components. The optional parameter + 'delimiters' can be used to define the characters that should be used + for splitting. The default is ',;'. + """ + regexp = re.compile('[{}]'.format(func.get('delimiters', ',;'))) + + def _process(obj): + if not obj.names: + return + + new_names = [] + for name in obj.names: + split_names = regexp.split(name.name) + if len(split_names) == 1: + new_names.append(name) + else: + new_names.extend(name.clone(name=n) for n in split_names) + + obj.names = new_names + + return _process diff --git a/nominatim/tokenizer/sanitizers/strip_brace_terms.py b/nominatim/tokenizer/sanitizers/strip_brace_terms.py new file mode 100644 index 00000000..4423d305 --- /dev/null +++ b/nominatim/tokenizer/sanitizers/strip_brace_terms.py @@ -0,0 +1,22 @@ +""" +Sanitizer handling names with addendums in braces. +""" + +def create(_): + """ Create a name processing function that creates additional name variants + when a name has an addendum in brackets (e.g. "Halle (Saale)"). The + additional variant only contains the main name without the bracket part. + """ + def _process(obj): + """ Add variants for names that have a bracket extension. 
+        """
+        new_names = []
+        if obj.names:
+            for name in (n for n in obj.names if '(' in n.name):
+                new_name = name.name.split('(')[0].strip()
+                if new_name:
+                    new_names.append(name.clone(name=new_name))
+
+        obj.names.extend(new_names)
+
+    return _process
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index c0c8c043..08b7a7ff 100644
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -24,6 +24,9 @@ transliteration:
     - "[^[:Ascii:]] >"
     - ":: lower ()"
     - ":: NFC ()"
+sanitizers:
+    - step: split-name-list
+    - step: strip-brace-terms
 variants:
     - !include icu-rules/variants-bg.yaml
     - !include icu-rules/variants-ca.yaml
diff --git a/test/python/test_tokenizer_icu.py b/test/python/test_tokenizer_icu.py
index 4b7c56d5..9a6f5a94 100644
--- a/test/python/test_tokenizer_icu.py
+++ b/test/python/test_tokenizer_icu.py
@@ -67,10 +67,12 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
     monkeypatch.undo()
 
     def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
-                     variants=('~gasse -> gasse', 'street => st', )):
+                     variants=('~gasse -> gasse', 'street => st', ),
+                     sanitizers=[]):
         cfgstr = {'normalization' : list(norm),
-                   'transliteration' : list(trans),
-                   'variants' : [ {'words': list(variants)}]}
+                  'sanitizers' : sanitizers,
+                  'transliteration' : list(trans),
+                  'variants' : [ {'words': list(variants)}]}
         (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
         tok.loader = ICURuleLoader(test_config)
 
@@ -309,14 +311,15 @@ class TestPlaceNames:
 
     @pytest.fixture(autouse=True)
     def setup(self, analyzer, sql_functions):
-        with analyzer() as anl:
+        sanitizers = [{'step': 'split-name-list'},
+                      {'step': 'strip-brace-terms'}]
+        with analyzer(sanitizers=sanitizers) as anl:
             self.analyzer = anl
             yield anl
 
 
     def expect_name_terms(self, info, *expected_terms):
         tokens = self.analyzer.get_word_token_info(expected_terms)
-        print (tokens)
         for token in tokens:
             assert token[2] is not None, "No token for {0}".format(token)
 
@@ -324,9 +327,7 @@ class TestPlaceNames:
 
 
     def process_named_place(self, names):
-        place = {'name': names}
-
-        return self.analyzer.process_place(PlaceInfo(place))
+        return self.analyzer.process_place(PlaceInfo({'name': names}))
 
 
     def test_simple_names(self):

From 732cd27d2e3033c11eca5b51cc2e6204c5737def Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Fri, 1 Oct 2021 09:50:17 +0200
Subject: [PATCH 6/7] add unit tests for new sanitizer functions

---
 .../tokenizer/sanitizers/split_name_list.py   | 11 ++-
 .../tokenizer/sanitizers/strip_brace_terms.py |  4 +-
 .../sanitizers/test_split_name_list.py        | 65 +++++++++++++++++
 .../sanitizers/test_strip_brace_terms.py      | 44 ++++++++++++
 test/python/tokenizer/test_place_sanitizer.py | 71 +++++++++++++++++++
 5 files changed, 191 insertions(+), 4 deletions(-)
 create mode 100644 test/python/tokenizer/sanitizers/test_split_name_list.py
 create mode 100644 test/python/tokenizer/sanitizers/test_strip_brace_terms.py
 create mode 100644 test/python/tokenizer/test_place_sanitizer.py

diff --git a/nominatim/tokenizer/sanitizers/split_name_list.py b/nominatim/tokenizer/sanitizers/split_name_list.py
index 93651f3e..f1514203 100644
--- a/nominatim/tokenizer/sanitizers/split_name_list.py
+++ b/nominatim/tokenizer/sanitizers/split_name_list.py
@@ -3,13 +3,19 @@ Name processor that splits name values with multiple values into their componen
 """
 import re
 
+from nominatim.errors import UsageError
+
 def create(func):
     """ Create a name processing function that splits name values with
         multiple values into their components. The optional parameter
         'delimiters' can be used to define the characters that should be used
         for splitting. The default is ',;'.
     """
-    regexp = re.compile('[{}]'.format(func.get('delimiters', ',;')))
+    delimiter_set = set(func.get('delimiters', ',;'))
+    if not delimiter_set:
+        raise UsageError("Set of delimiters in split-name-list sanitizer is empty.")
+
+    regexp = re.compile('\\s*[{}]\\s*'.format(''.join('\\' + d for d in delimiter_set)))
 
     def _process(obj):
         if not obj.names:
@@ -18,10 +24,10 @@ def create(func):
         new_names = []
         for name in obj.names:
             split_names = regexp.split(name.name)
             if len(split_names) == 1:
                 new_names.append(name)
             else:
-                new_names.extend(name.clone(name=n) for n in split_names)
+                new_names.extend(name.clone(name=n) for n in split_names if n)
 
         obj.names = new_names
 
diff --git a/nominatim/tokenizer/sanitizers/strip_brace_terms.py b/nominatim/tokenizer/sanitizers/strip_brace_terms.py
index 4423d305..ec91bac9 100644
--- a/nominatim/tokenizer/sanitizers/strip_brace_terms.py
+++ b/nominatim/tokenizer/sanitizers/strip_brace_terms.py
@@ -10,13 +10,13 @@ def create(_):
     def _process(obj):
         """ Add variants for names that have a bracket extension.
         """
-        new_names = []
         if obj.names:
+            new_names = []
             for name in (n for n in obj.names if '(' in n.name):
                 new_name = name.name.split('(')[0].strip()
                 if new_name:
                     new_names.append(name.clone(name=new_name))
 
-        obj.names.extend(new_names)
+            obj.names.extend(new_names)
 
     return _process
diff --git a/test/python/tokenizer/sanitizers/test_split_name_list.py b/test/python/tokenizer/sanitizers/test_split_name_list.py
new file mode 100644
index 00000000..ee745469
--- /dev/null
+++ b/test/python/tokenizer/sanitizers/test_split_name_list.py
@@ -0,0 +1,65 @@
+"""
+Tests for the sanitizer that splits multivalue lists.
+""" +import pytest + +from nominatim.tokenizer.place_sanitizer import PlaceSanitizer +from nominatim.indexer.place_info import PlaceInfo + +from nominatim.errors import UsageError + +def run_sanitizer_on(**kwargs): + place = PlaceInfo({'name': kwargs}) + name, _ = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place) + + return sorted([(p.name, p.kind, p.suffix) for p in name]) + + +def sanitize_with_delimiter(delimiter, name): + place = PlaceInfo({'name': {'name': name}}) + san = PlaceSanitizer([{'step': 'split-name-list', 'delimiters': delimiter}]) + name, _ = san.process_names(place) + + return sorted([p.name for p in name]) + + +def test_simple(): + assert run_sanitizer_on(name='ABC') == [('ABC', 'name', None)] + assert run_sanitizer_on(name='') == [('', 'name', None)] + + +def test_splits(): + assert run_sanitizer_on(name='A;B;C') == [('A', 'name', None), + ('B', 'name', None), + ('C', 'name', None)] + assert run_sanitizer_on(short_name=' House, boat ') == [('House', 'short_name', None), + ('boat', 'short_name', None)] + + +def test_empty_fields(): + assert run_sanitizer_on(name='A;;B') == [('A', 'name', None), + ('B', 'name', None)] + assert run_sanitizer_on(name='A; ,B') == [('A', 'name', None), + ('B', 'name', None)] + assert run_sanitizer_on(name=' ;B') == [('B', 'name', None)] + assert run_sanitizer_on(name='B,') == [('B', 'name', None)] + + +def test_custom_delimiters(): + assert sanitize_with_delimiter(':', '12:45,3') == ['12', '45,3'] + assert sanitize_with_delimiter('\\', 'a;\\b!#@ \\') == ['a;', 'b!#@'] + assert sanitize_with_delimiter('[]', 'foo[to]be') == ['be', 'foo', 'to'] + assert sanitize_with_delimiter(' ', 'morning sun') == ['morning', 'sun'] + + +def test_empty_delimiter_set(): + with pytest.raises(UsageError): + sanitize_with_delimiter('', 'abc') + + +def test_no_name_list(): + place = PlaceInfo({'address': {'housenumber': '3'}}) + name, address = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place) + + assert not name + assert len(address) == 1 diff --git a/test/python/tokenizer/sanitizers/test_strip_brace_terms.py b/test/python/tokenizer/sanitizers/test_strip_brace_terms.py new file mode 100644 index 00000000..50af2449 --- /dev/null +++ b/test/python/tokenizer/sanitizers/test_strip_brace_terms.py @@ -0,0 +1,44 @@ +""" +Tests for the sanitizer that handles braced suffixes. 
+""" +import pytest + +from nominatim.tokenizer.place_sanitizer import PlaceSanitizer +from nominatim.indexer.place_info import PlaceInfo + +def run_sanitizer_on(**kwargs): + place = PlaceInfo({'name': kwargs}) + name, _ = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place) + + return sorted([(p.name, p.kind, p.suffix) for p in name]) + + +def test_no_braces(): + assert run_sanitizer_on(name='foo', ref='23') == [('23', 'ref', None), + ('foo', 'name', None)] + + +def test_simple_braces(): + assert run_sanitizer_on(name='Halle (Saale)', ref='3')\ + == [('3', 'ref', None), ('Halle', 'name', None), ('Halle (Saale)', 'name', None)] + assert run_sanitizer_on(name='ack ( bar')\ + == [('ack', 'name', None), ('ack ( bar', 'name', None)] + + +def test_only_braces(): + assert run_sanitizer_on(name='(maybe)') == [('(maybe)', 'name', None)] + + +def test_double_braces(): + assert run_sanitizer_on(name='a((b))') == [('a', 'name', None), + ('a((b))', 'name', None)] + assert run_sanitizer_on(name='a (b) (c)') == [('a', 'name', None), + ('a (b) (c)', 'name', None)] + + +def test_no_names(): + place = PlaceInfo({'address': {'housenumber': '3'}}) + name, address = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place) + + assert not name + assert len(address) == 1 diff --git a/test/python/tokenizer/test_place_sanitizer.py b/test/python/tokenizer/test_place_sanitizer.py new file mode 100644 index 00000000..389b068c --- /dev/null +++ b/test/python/tokenizer/test_place_sanitizer.py @@ -0,0 +1,71 @@ +""" +Tests for execution of the sanitztion step. +""" +import pytest + +from nominatim.errors import UsageError +import nominatim.tokenizer.place_sanitizer as sanitizer +from nominatim.indexer.place_info import PlaceInfo + + +def test_placeinfo_clone_new_name(): + place = sanitizer.PlaceName('foo', 'ki', 'su') + + newplace = place.clone(name='bar') + + assert place.name == 'foo' + assert newplace.name == 'bar' + assert newplace.kind == 'ki' + assert newplace.suffix == 'su' + + +def test_placeinfo_clone_merge_attr(): + place = sanitizer.PlaceName('foo', 'ki', 'su') + place.set_attr('a1', 'v1') + place.set_attr('a2', 'v2') + + newplace = place.clone(attr={'a2': 'new', 'b2': 'foo'}) + + assert place.get_attr('a2') == 'v2' + assert place.get_attr('b2') is None + assert newplace.get_attr('a1') == 'v1' + assert newplace.get_attr('a2') == 'new' + assert newplace.get_attr('b2') == 'foo' + + +def test_placeinfo_has_attr(): + place = sanitizer.PlaceName('foo', 'ki', 'su') + place.set_attr('a1', 'v1') + + assert place.has_attr('a1') + assert not place.has_attr('whatever') + + +def test_sanitizer_default(): + san = sanitizer.PlaceSanitizer([{'step': 'split-name-list'}]) + + name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}, + 'address': {'street': 'Bald'}})) + + assert len(name) == 3 + assert all(isinstance(n, sanitizer.PlaceName) for n in name) + assert all(n.kind == 'name' for n in name) + assert all(n.suffix == 'de:de' for n in name) + + assert len(address) == 1 + assert all(isinstance(n, sanitizer.PlaceName) for n in address) + + +@pytest.mark.parametrize('rules', [None, []]) +def test_sanitizer_empty_list(rules): + san = sanitizer.PlaceSanitizer(rules) + + name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}})) + + assert len(name) == 1 + assert all(isinstance(n, sanitizer.PlaceName) for n in name) + + +def test_sanitizer_missing_step_definition(): + with pytest.raises(UsageError): + san = sanitizer.PlaceSanitizer([{'id': 'split-name-list'}]) 
From 6b348d43c61bc825c8d9cd208b1ef34c8d7951fa Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 1 Oct 2021 10:51:41 +0200 Subject: [PATCH 7/7] replace test variable for PG env tests 'tty' was removed in PG14 and causes an error. --- test/python/test_db_connection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/python/test_db_connection.py b/test/python/test_db_connection.py index 41978e59..00c29a43 100644 --- a/test/python/test_db_connection.py +++ b/test/python/test_db_connection.py @@ -100,6 +100,6 @@ def test_get_pg_env_overwrite_variable(monkeypatch): def test_get_pg_env_ignore_unknown(): - env = get_pg_env('tty=stuff', base_env={}) + env = get_pg_env('client_encoding=stuff', base_env={}) assert env == {}