Merge pull request #2346 from lonvia/words-vs-tokens

Cleanup use of partial words in legacy tokenizers
2026-02-15 02:47:59 +00:00 · 2021-05-24 17:41:38 +02:00
parent 10143e0ac7 24c986c842
commit 29b02f9e56
7 changed files with 85 additions and 38 deletions
--- a/lib-php/SearchDescription.php
+++ b/lib-php/SearchDescription.php
@@ -333,7 +333,9 @@ class SearchDescription
    public function extendWithPartialTerm($sToken, $oSearchTerm, $bStructuredPhrases, $iPhrase, $aFullTokens)
    {
        // Only allow name terms.
-        if (!(is_a($oSearchTerm, '\Nominatim\Token\Word'))) {
+        if (!(is_a($oSearchTerm, '\Nominatim\Token\Word'))
+            || strpos($sToken, ' ') !== false
+        ) {
            return array();
        }

@@ -361,7 +363,6 @@ class SearchDescription

        if ((!$this->sPostcode && !$this->aAddress && !$this->aAddressNonSearch)
            && ((empty($this->aName) && empty($this->aNameNonSearch)) || $this->iNamePhrase == $iPhrase)
-            && strpos($sToken, ' ') === false
        ) {
            $oSearch = clone $this;
            $oSearch->iSearchRank++;
--- a/lib-sql/tokenizer/legacy_tokenizer.sql
+++ b/lib-sql/tokenizer/legacy_tokenizer.sql
@@ -287,26 +287,21 @@ DECLARE
  s TEXT;
  w INTEGER;
  words TEXT[];
-  item RECORD;
+  value TEXT;
  j INTEGER;
 BEGIN
  result := '{}'::INTEGER[];

-  FOR item IN SELECT (each(src)).* LOOP
-
-    s := make_standard_name(item.value);
-    w := getorcreate_name_id(s, item.value);
+  FOR value IN SELECT unnest(regexp_split_to_array(svals(src), E'[,;]')) LOOP
+    -- full name
+    s := make_standard_name(value);
+    w := getorcreate_name_id(s, value);

    IF not(ARRAY[w] <@ result) THEN
      result := result || w;
    END IF;

-    w := getorcreate_word_id(s);
-
-    IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
-      result := result || w;
-    END IF;
-
+    -- partial single-word terms
    words := string_to_array(s, ' ');
    IF array_upper(words, 1) IS NOT NULL THEN
      FOR j IN 1..array_upper(words, 1) LOOP
@@ -319,24 +314,23 @@ BEGIN
      END LOOP;
    END IF;

-    words := regexp_split_to_array(item.value, E'[,;()]');
-    IF array_upper(words, 1) != 1 THEN
-      FOR j IN 1..array_upper(words, 1) LOOP
-        s := make_standard_name(words[j]);
-        IF s != '' THEN
-          w := getorcreate_word_id(s);
-          IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
-            result := result || w;
-          END IF;
+    -- consider parts before an opening bracket a full word as well
+    words := regexp_split_to_array(value, E'[(]');
+    IF array_upper(words, 1) > 1 THEN
+      s := make_standard_name(words[1]);
+      IF s != '' THEN
+        w := getorcreate_name_id(s, words[1]);
+        IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
+          result := result || w;
        END IF;
-      END LOOP;
+      END IF;
    END IF;

-    s := regexp_replace(item.value, '市$', '');
-    IF s != item.value THEN
+    s := regexp_replace(value, '市$', '');
+    IF s != value THEN
      s := make_standard_name(s);
      IF s != '' THEN
-        w := getorcreate_name_id(s, item.value);
+        w := getorcreate_name_id(s, value);
        IF NOT (ARRAY[w] <@ result) THEN
          result := result || w;
        END IF;
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -423,8 +423,7 @@ class LegacyICUNameAnalyzer:
        names = place.get('name')

        if names:
-            full_names = set((self.make_standard_word(name) for name in names.values()))
-            full_names.discard('')
+            full_names = self._compute_full_names(names)

            token_info.add_names(self.conn, full_names)

@@ -461,6 +460,25 @@ class LegacyICUNameAnalyzer:
        return token_info.data


+    def _compute_full_names(self, names):
+        """ Return the set of full names, in make_standard_word() form (not word
+            ids), for a dictionary of names; values are split at ';' and ','.
+        """
+        full_names = set()
+        for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
+            word = self.make_standard_word(name)
+            if word:
+                full_names.add(word)
+                # the part before the first '(' is added as a full name of its own
+                brace_split = name.split('(', 2)
+                if len(brace_split) > 1:
+                    word = self.make_standard_word(brace_split[0])
+                    if word:
+                        full_names.add(word)
+
+        return full_names
+
+
    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
@@ -519,8 +537,6 @@ class _TokenInfo:
        """
        # Start with all partial names
        terms = set((part for ns in names for part in ns.split()))
-        # Add partials for the full terms (TO BE REMOVED)
-        terms.update((n for n in names))
        # Add the full names
        terms.update((' ' + n for n in names))

--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -513,10 +513,9 @@ class _TokenInfo:
        """
        def _get_place(name):
            with conn.cursor() as cur:
-                cur.execute("""SELECT (addr_ids_from_name(%s)
-                                       || getorcreate_name_id(make_standard_name(%s), ''))::text,
+                cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
                                      word_ids_from_name(%s)::text""",
-                            (name, name, name))
+                            (name, name))
                return cur.fetchone()

        self.data['place_search'], self.data['place_match'] = \
--- a/test/Makefile
+++ b/test/Makefile
@@ -4,8 +4,8 @@ no-test-db: bdd-no-test-db php
 bdd:
 	cd bdd && behave -DREMOVE_TEMPLATE=1

-bdd-no-test-db:
-	cd bdd && behave -DREMOVE_TEMPLATE=1 db osm2pgsql
+icu:
+	cd bdd && behave -DREMOVE_TEMPLATE=1 -DTOKENIZER=legacy_icu

 php:
 	cd php && phpunit ./
--- a/test/bdd/db/import/search_name.feature
+++ b/test/bdd/db/import/search_name.feature
@@ -2,6 +2,29 @@
 Feature: Creation of search terms
    Tests that search_name table is filled correctly

+    Scenario Outline: Comma- and semicolon separated names appear as full names
+        Given the places
+         | osm | class   | type | name+alt_name |
+         | N1  | place   | city | New York<sep>Big Apple |
+        When importing
+        Then search_name contains
+         | object | name_vector |
+         | N1     | #New York, #Big Apple |
+
+    Examples:
+         | sep |
+         | ,   |
+         | ;   |
+
+    Scenario: Name parts before brackets appear as full names
+        Given the places
+         | osm | class   | type | name+name |
+         | N1  | place   | city | Halle (Saale) |
+        When importing
+        Then search_name contains
+         | object | name_vector |
+         | N1     | #Halle Saale, #Halle |
+
    Scenario: Unnamed POIs have no search entry
        Given the scene roads-with-pois
        And the places
@@ -49,7 +72,7 @@ Feature: Creation of search terms
        When importing
        Then search_name contains
         | object | nameaddress_vector |
-         | N1     | Rose Street, Little, Big, Town |
+         | N1     | #Rose Street, rose, Little, Big, Town |
        When searching for "23 Rose Street, Little Big Town"
        Then results contain
         | osm_type | osm_id | name |
--- a/test/python/test_tokenizer_legacy_icu.py
+++ b/test/python/test_tokenizer_legacy_icu.py
@@ -223,11 +223,25 @@ def test_update_special_phrase_modify(analyzer, word_table):


 def test_process_place_names(analyzer, getorcreate_term_id):
-
    with analyzer() as anl:
        info = anl.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})

-    assert info['names'] == '{1,2,3,4,5,6}'
+    assert info['names'] == '{1,2,3,4,5}'
+
+
+@pytest.mark.parametrize('sep', [',' , ';'])
+def test_full_names_with_separator(analyzer, getorcreate_term_id, sep):
+    with analyzer() as anl:
+        names = anl._compute_full_names({'name' : sep.join(('New York', 'Big Apple'))})
+
+    assert names == set(('NEW YORK', 'BIG APPLE'))
+
+
+def test_full_names_with_bracket(analyzer, getorcreate_term_id):
+    with analyzer() as anl:
+        names = anl._compute_full_names({'name' : 'Houseboat (left)'})
+
+    assert names == set(('HOUSEBOAT (LEFT)', 'HOUSEBOAT'))


@pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])