mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-15 02:47:59 +00:00
Merge pull request #2346 from lonvia/words-vs-tokens
Cleanup use of partial words in legacy tokenizers
This commit is contained in:
@@ -333,7 +333,9 @@ class SearchDescription
|
||||
public function extendWithPartialTerm($sToken, $oSearchTerm, $bStructuredPhrases, $iPhrase, $aFullTokens)
|
||||
{
|
||||
// Only allow name terms.
|
||||
if (!(is_a($oSearchTerm, '\Nominatim\Token\Word'))) {
|
||||
if (!(is_a($oSearchTerm, '\Nominatim\Token\Word'))
|
||||
|| strpos($sToken, ' ') !== false
|
||||
) {
|
||||
return array();
|
||||
}
|
||||
|
||||
@@ -361,7 +363,6 @@ class SearchDescription
|
||||
|
||||
if ((!$this->sPostcode && !$this->aAddress && !$this->aAddressNonSearch)
|
||||
&& ((empty($this->aName) && empty($this->aNameNonSearch)) || $this->iNamePhrase == $iPhrase)
|
||||
&& strpos($sToken, ' ') === false
|
||||
) {
|
||||
$oSearch = clone $this;
|
||||
$oSearch->iSearchRank++;
|
||||
|
||||
@@ -287,26 +287,21 @@ DECLARE
|
||||
s TEXT;
|
||||
w INTEGER;
|
||||
words TEXT[];
|
||||
item RECORD;
|
||||
value TEXT;
|
||||
j INTEGER;
|
||||
BEGIN
|
||||
result := '{}'::INTEGER[];
|
||||
|
||||
FOR item IN SELECT (each(src)).* LOOP
|
||||
|
||||
s := make_standard_name(item.value);
|
||||
w := getorcreate_name_id(s, item.value);
|
||||
FOR value IN SELECT unnest(regexp_split_to_array(svals(src), E'[,;]')) LOOP
|
||||
-- full name
|
||||
s := make_standard_name(value);
|
||||
w := getorcreate_name_id(s, value);
|
||||
|
||||
IF not(ARRAY[w] <@ result) THEN
|
||||
result := result || w;
|
||||
END IF;
|
||||
|
||||
w := getorcreate_word_id(s);
|
||||
|
||||
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
||||
result := result || w;
|
||||
END IF;
|
||||
|
||||
-- partial single-word terms
|
||||
words := string_to_array(s, ' ');
|
||||
IF array_upper(words, 1) IS NOT NULL THEN
|
||||
FOR j IN 1..array_upper(words, 1) LOOP
|
||||
@@ -319,24 +314,23 @@ BEGIN
|
||||
END LOOP;
|
||||
END IF;
|
||||
|
||||
words := regexp_split_to_array(item.value, E'[,;()]');
|
||||
IF array_upper(words, 1) != 1 THEN
|
||||
FOR j IN 1..array_upper(words, 1) LOOP
|
||||
s := make_standard_name(words[j]);
|
||||
IF s != '' THEN
|
||||
w := getorcreate_word_id(s);
|
||||
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
||||
result := result || w;
|
||||
END IF;
|
||||
-- consider parts before an opening bracket a full word as well
|
||||
words := regexp_split_to_array(value, E'[(]');
|
||||
IF array_upper(words, 1) > 1 THEN
|
||||
s := make_standard_name(words[1]);
|
||||
IF s != '' THEN
|
||||
w := getorcreate_name_id(s, words[1]);
|
||||
IF w IS NOT NULL AND NOT (ARRAY[w] <@ result) THEN
|
||||
result := result || w;
|
||||
END IF;
|
||||
END LOOP;
|
||||
END IF;
|
||||
END IF;
|
||||
|
||||
s := regexp_replace(item.value, '市$', '');
|
||||
IF s != item.value THEN
|
||||
s := regexp_replace(value, '市$', '');
|
||||
IF s != value THEN
|
||||
s := make_standard_name(s);
|
||||
IF s != '' THEN
|
||||
w := getorcreate_name_id(s, item.value);
|
||||
w := getorcreate_name_id(s, value);
|
||||
IF NOT (ARRAY[w] <@ result) THEN
|
||||
result := result || w;
|
||||
END IF;
|
||||
|
||||
@@ -423,8 +423,7 @@ class LegacyICUNameAnalyzer:
|
||||
names = place.get('name')
|
||||
|
||||
if names:
|
||||
full_names = set((self.make_standard_word(name) for name in names.values()))
|
||||
full_names.discard('')
|
||||
full_names = self._compute_full_names(names)
|
||||
|
||||
token_info.add_names(self.conn, full_names)
|
||||
|
||||
@@ -461,6 +460,25 @@ class LegacyICUNameAnalyzer:
|
||||
return token_info.data
|
||||
|
||||
|
||||
def _compute_full_names(self, names):
    """ Return the set of all full name word ids to be used with the
        given dictionary of names.

        Each name value may contain several names separated by ',' or ';';
        every such part is standardized and collected. For parts containing
        an opening bracket, the portion before the bracket is added as a
        full name as well.
    """
    collected = set()
    # Flatten all name values and split each on comma/semicolon separators.
    parts = (piece for value in names.values() for piece in re.split('[;,]', value))
    for part in parts:
        standardized = self.make_standard_word(part)
        if standardized:
            collected.add(standardized)

        # Text before an opening bracket also counts as a full name,
        # e.g. "Halle (Saale)" additionally yields "Halle".
        before_bracket, sep, _ = part.partition('(')
        if sep:
            standardized = self.make_standard_word(before_bracket)
            if standardized:
                collected.add(standardized)

    return collected
|
||||
|
||||
|
||||
def _add_postcode(self, postcode):
|
||||
""" Make sure the normalized postcode is present in the word table.
|
||||
"""
|
||||
@@ -519,8 +537,6 @@ class _TokenInfo:
|
||||
"""
|
||||
# Start with all partial names
|
||||
terms = set((part for ns in names for part in ns.split()))
|
||||
# Add partials for the full terms (TO BE REMOVED)
|
||||
terms.update((n for n in names))
|
||||
# Add the full names
|
||||
terms.update((' ' + n for n in names))
|
||||
|
||||
|
||||
@@ -513,10 +513,9 @@ class _TokenInfo:
|
||||
"""
|
||||
def _get_place(name):
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""SELECT (addr_ids_from_name(%s)
|
||||
|| getorcreate_name_id(make_standard_name(%s), ''))::text,
|
||||
cur.execute("""SELECT make_keywords(hstore('name' , %s))::text,
|
||||
word_ids_from_name(%s)::text""",
|
||||
(name, name, name))
|
||||
(name, name))
|
||||
return cur.fetchone()
|
||||
|
||||
self.data['place_search'], self.data['place_match'] = \
|
||||
|
||||
@@ -4,8 +4,8 @@ no-test-db: bdd-no-test-db php
|
||||
bdd:
|
||||
cd bdd && behave -DREMOVE_TEMPLATE=1
|
||||
|
||||
bdd-no-test-db:
|
||||
cd bdd && behave -DREMOVE_TEMPLATE=1 db osm2pgsql
|
||||
icu:
|
||||
cd bdd && behave -DREMOVE_TEMPLATE=1 -DTOKENIZER=legacy_icu
|
||||
|
||||
php:
|
||||
cd php && phpunit ./
|
||||
|
||||
@@ -2,6 +2,29 @@
|
||||
Feature: Creation of search terms
|
||||
Tests that search_name table is filled correctly
|
||||
|
||||
Scenario Outline: Comma- and semicolon separated names appear as full names
|
||||
Given the places
|
||||
| osm | class | type | name+alt_name |
|
||||
| N1 | place | city | New York<sep>Big Apple |
|
||||
When importing
|
||||
Then search_name contains
|
||||
| object | name_vector |
|
||||
| N1 | #New York, #Big Apple |
|
||||
|
||||
Examples:
|
||||
| sep |
|
||||
| , |
|
||||
| ; |
|
||||
|
||||
Scenario Outline: Name parts before brackets appear as full names
|
||||
Given the places
|
||||
| osm | class | type | name+name |
|
||||
| N1 | place | city | Halle (Saale) |
|
||||
When importing
|
||||
Then search_name contains
|
||||
| object | name_vector |
|
||||
| N1 | #Halle Saale, #Halle |
|
||||
|
||||
Scenario: Unnamed POIs have no search entry
|
||||
Given the scene roads-with-pois
|
||||
And the places
|
||||
@@ -49,7 +72,7 @@ Feature: Creation of search terms
|
||||
When importing
|
||||
Then search_name contains
|
||||
| object | nameaddress_vector |
|
||||
| N1 | Rose Street, Little, Big, Town |
|
||||
| N1 | #Rose Street, rose, Little, Big, Town |
|
||||
When searching for "23 Rose Street, Little Big Town"
|
||||
Then results contain
|
||||
| osm_type | osm_id | name |
|
||||
|
||||
@@ -223,11 +223,25 @@ def test_update_special_phrase_modify(analyzer, word_table):
|
||||
|
||||
|
||||
def test_process_place_names(analyzer, getorcreate_term_id):
|
||||
|
||||
with analyzer() as anl:
|
||||
info = anl.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
|
||||
|
||||
assert info['names'] == '{1,2,3,4,5,6}'
|
||||
assert info['names'] == '{1,2,3,4,5}'
|
||||
|
||||
|
||||
@pytest.mark.parametrize('sep', [',' , ';'])
def test_full_names_with_separator(analyzer, getorcreate_term_id, sep):
    # Comma- and semicolon-separated name values must each yield a full name.
    with analyzer() as anl:
        joined = sep.join(('New York', 'Big Apple'))
        names = anl._compute_full_names({'name' : joined})

    assert names == {'NEW YORK', 'BIG APPLE'}
|
||||
|
||||
|
||||
def test_full_names_with_bracket(analyzer, getorcreate_term_id):
    # A bracketed suffix keeps the full form and adds the pre-bracket part.
    with analyzer() as anl:
        names = anl._compute_full_names({'name' : 'Houseboat (left)'})

    assert names == {'HOUSEBOAT (LEFT)', 'HOUSEBOAT'}
|
||||
|
||||
|
||||
@pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
|
||||
|
||||
Reference in New Issue
Block a user