reorganize keyword creation for legacy tokenizer

- only save partial words without internal spaces
- consider comma and semicolon a separator of full words
- consider parts before an opening bracket a full word
  (but not the part after the bracket)

Fixes #244.
2021-05-23 23:58:58 +02:00
parent fa3e48c59f
commit 4f4d15c28a
4 changed files with 85 additions and 29 deletions
--- a/test/bdd/db/import/search_name.feature
+++ b/test/bdd/db/import/search_name.feature
@@ -2,6 +2,29 @@
 Feature: Creation of search terms
    Tests that search_name table is filled correctly

+    Scenario Outline: Comma- and semicolon separated names appear as full names
+        Given the places
+         | osm | class   | type | name+alt_name |
+         | N1  | place   | city | New York<sep>Big Apple |
+        When importing
+        Then search_name contains
+         | object | name_vector |
+         | N1     | #New York, #Big Apple |
+
+    Examples:
+         | sep |
+         | ,   |
+         | ;   |
+
+    Scenario Outline: Name parts before brackets appear as full names
+        Given the places
+         | osm | class   | type | name+name |
+         | N1  | place   | city | Halle (Saale) |
+        When importing
+        Then search_name contains
+         | object | name_vector |
+         | N1     | #Halle Saale, #Halle |
+
    Scenario: Unnamed POIs have no search entry
        Given the scene roads-with-pois
        And the places
@@ -49,7 +72,7 @@ Feature: Creation of search terms
        When importing
        Then search_name contains
         | object | nameaddress_vector |
-         | N1     | Rose Street, Little, Big, Town |
+         | N1     | #Rose Street, rose, Little, Big, Town |
        When searching for "23 Rose Street, Little Big Town"
        Then results contain
         | osm_type | osm_id | name |
--- a/test/python/test_tokenizer_legacy_icu.py
+++ b/test/python/test_tokenizer_legacy_icu.py
@@ -223,11 +223,32 @@ def test_update_special_phrase_modify(analyzer, word_table):


 def test_process_place_names(analyzer, getorcreate_term_id):
-
    with analyzer() as anl:
        info = anl.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})

-    assert info['names'] == '{1,2,3,4,5,6}'
+    assert info['names'] == '{1,2,3,4,5}'
+
+
+@pytest.mark.parametrize('sep', [',' , ';'])
+def test_full_names_with_separator(analyzer, getorcreate_term_id, sep):
+    with analyzer() as anl:
        full_names = \
            anl._compute_full_names({'name' : sep.join(('New York', 'Big Apple'))})
+
+        expect = set((anl.make_standard_word(w) for w in ('New York', 'Big Apple')))
+
+    assert full_names == expect
+
+
+def test_process_place_names_with_bracket(analyzer, getorcreate_term_id):
+    with analyzer() as anl:
+        info = anl.process_place({'name' :
+                                   {'name' : 'Houseboat (left)'}})
+
+        expect = set((anl.make_standard_word(w) for w in
+                       (' houseboat', ' houseboat left', 'houseboat', 'left')))
+
+    assert eval(info['names']) == expect


@pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])