mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
add framework for analysing housenumbers
This lays the groundwork for adding variants for housenumbers. When analysis is enabled, the 'word' field in the word table is used as usual, so that variants can be created. Only one analyser is allowed, and it must have the fixed name '@housenumber'.
This commit is contained in:
@@ -157,7 +157,8 @@ class Tokenizer
|
|||||||
$sSQL = 'SELECT word_id, word_token, type, word,';
|
$sSQL = 'SELECT word_id, word_token, type, word,';
|
||||||
$sSQL .= " info->>'op' as operator,";
|
$sSQL .= " info->>'op' as operator,";
|
||||||
$sSQL .= " info->>'class' as class, info->>'type' as ctype,";
|
$sSQL .= " info->>'class' as class, info->>'type' as ctype,";
|
||||||
$sSQL .= " info->>'count' as count";
|
$sSQL .= " info->>'count' as count,";
|
||||||
|
$sSQL .= " info->>'lookup' as lookup";
|
||||||
$sSQL .= ' FROM word WHERE word_token in (';
|
$sSQL .= ' FROM word WHERE word_token in (';
|
||||||
$sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
|
$sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
|
||||||
|
|
||||||
@@ -179,7 +180,8 @@ class Tokenizer
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 'H': // house number tokens
|
case 'H': // house number tokens
|
||||||
$oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $aWord['word_token']));
|
$sLookup = $aWord['lookup'] ?? $aWord['word_token'];
|
||||||
|
$oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $sLookup));
|
||||||
break;
|
break;
|
||||||
case 'P': // postcode tokens
|
case 'P': // postcode tokens
|
||||||
// Postcodes are not normalized, so they may have content
|
// Postcodes are not normalized, so they may have content
|
||||||
|
|||||||
@@ -200,3 +200,26 @@ BEGIN
|
|||||||
END;
|
END;
|
||||||
$$
|
$$
|
||||||
LANGUAGE plpgsql;
|
LANGUAGE plpgsql;
|
||||||
|
|
||||||
|
|
||||||
|
-- Return the word id for an analyzed housenumber, creating the word
-- entries when none exist yet.
--
--   norm_term     normalized housenumber; stored in the 'word' column and
--                 used to find already existing entries of type 'H'.
--   lookup_terms  variants of the housenumber; one row is inserted per
--                 array element, with the element as 'word_token'.
--
-- All rows created by one call share the same word id, and each carries
-- the first array element under the 'lookup' key of its 'info' column.
CREATE OR REPLACE FUNCTION create_analyzed_hnr_id(norm_term TEXT, lookup_terms TEXT[])
  RETURNS INTEGER
  AS $$
DECLARE
  hnr_id INTEGER;
BEGIN
  -- min() yields NULL when no matching row exists.
  SELECT min(word_id) INTO hnr_id
    FROM word
   WHERE word = norm_term AND type = 'H';

  -- Reuse the existing entry if there is one.
  IF hnr_id IS NOT NULL THEN
    RETURN hnr_id;
  END IF;

  -- Otherwise allocate a fresh id and add one row per variant.
  hnr_id := nextval('seq_word');

  INSERT INTO word (word_id, word_token, type, word, info)
    SELECT hnr_id, variant, 'H', norm_term,
           json_build_object('lookup', lookup_terms[1])
      FROM unnest(lookup_terms) AS variant;

  RETURN hnr_id;
END;
$$
LANGUAGE plpgsql;
|
||||||
|
|||||||
@@ -28,6 +28,10 @@ CREATE INDEX idx_word_postcodes ON word
|
|||||||
CREATE INDEX idx_word_full_word ON word
|
CREATE INDEX idx_word_full_word ON word
|
||||||
USING btree(word) {{db.tablespace.address_index}}
|
USING btree(word) {{db.tablespace.address_index}}
|
||||||
WHERE type = 'W';
|
WHERE type = 'W';
|
||||||
|
-- Used when inserting analyzed housenumbers. The partial condition skips
-- housenumber rows without a 'word' value (old-style entries).
CREATE INDEX idx_word_housenumbers ON word
    USING BTREE (word) {{db.tablespace.address_index}}
    WHERE type = 'H' AND word IS NOT NULL;
|
||||||
|
|
||||||
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
|
GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
|
||||||
|
|
||||||
|
|||||||
@@ -485,18 +485,36 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
""" Normalize the housenumber and return the word token and the
|
""" Normalize the housenumber and return the word token and the
|
||||||
canonical form.
|
canonical form.
|
||||||
"""
|
"""
|
||||||
norm_name = self._search_normalized(hnr.name)
|
analyzer = self.token_analysis.analysis.get('@housenumber')
|
||||||
if not norm_name:
|
result = None, None
|
||||||
return None, None
|
|
||||||
|
|
||||||
token = self._cache.housenumbers.get(norm_name)
|
if analyzer is None:
|
||||||
if token is None:
|
# When no custom analyzer is set, simply normalize and transliterate
|
||||||
with self.conn.cursor() as cur:
|
norm_name = self._search_normalized(hnr.name)
|
||||||
cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
|
if norm_name:
|
||||||
token = cur.fetchone()[0]
|
result = self._cache.housenumbers.get(norm_name, result)
|
||||||
self._cache.housenumbers[norm_name] = token
|
if result[0] is None:
|
||||||
|
with self.conn.cursor() as cur:
|
||||||
|
cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
|
||||||
|
result = cur.fetchone()[0], norm_name
|
||||||
|
self._cache.housenumbers[norm_name] = result
|
||||||
|
else:
|
||||||
|
# Otherwise use the analyzer to determine the canonical name.
|
||||||
|
# Per convention we use the first variant as the 'lookup name', the
|
||||||
|
# name that gets saved in the housenumber field of the place.
|
||||||
|
norm_name = analyzer.normalize(hnr.name)
|
||||||
|
if norm_name:
|
||||||
|
result = self._cache.housenumbers.get(norm_name, result)
|
||||||
|
if result[0] is None:
|
||||||
|
variants = analyzer.get_variants_ascii(norm_name)
|
||||||
|
if variants:
|
||||||
|
with self.conn.cursor() as cur:
|
||||||
|
cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
|
||||||
|
(norm_name, list(variants)))
|
||||||
|
result = cur.fetchone()[0], variants[0]
|
||||||
|
self._cache.housenumbers[norm_name] = result
|
||||||
|
|
||||||
return token, norm_name
|
return result
|
||||||
|
|
||||||
|
|
||||||
def _compute_partial_tokens(self, name):
|
def _compute_partial_tokens(self, name):
|
||||||
|
|||||||
Reference in New Issue
Block a user