move abbreviation computation into import phase

This adds precomputation of abbreviated terms for names and removes
abbreviation of terms in the query. Basic import works but still
needs some thorough testing as well as speed improvements during
import.

New dependency for python library datrie.
This commit is contained in:
Sarah Hoffmann
2021-05-28 22:06:13 +02:00
parent 6ba00e6aee
commit 8413075249
10 changed files with 665 additions and 206 deletions

View File

@@ -47,9 +47,7 @@ class Tokenizer
private function makeStandardWord($sTerm) private function makeStandardWord($sTerm)
{ {
$sNorm = ' '.$this->oTransliterator->transliterate($sTerm).' '; return trim($this->oTransliterator->transliterate(' '.$sTerm.' '));
return trim(str_replace(CONST_Abbreviations[0], CONST_Abbreviations[1], $sNorm));
} }
@@ -90,6 +88,7 @@ class Tokenizer
foreach ($aPhrases as $iPhrase => $oPhrase) { foreach ($aPhrases as $iPhrase => $oPhrase) {
$sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase()); $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
$sPhrase = $this->makeStandardWord($oPhrase->getPhrase()); $sPhrase = $this->makeStandardWord($oPhrase->getPhrase());
Debug::printVar('Phrase', $sPhrase);
if (strlen($sPhrase) > 0) { if (strlen($sPhrase) > 0) {
$aWords = explode(' ', $sPhrase); $aWords = explode(' ', $sPhrase);
Tokenizer::addTokens($aTokens, $aWords); Tokenizer::addTokens($aTokens, $aWords);

View File

@@ -87,25 +87,48 @@ $$ LANGUAGE SQL IMMUTABLE STRICT;
--------------- private functions ---------------------------------------------- --------------- private functions ----------------------------------------------
CREATE OR REPLACE FUNCTION getorcreate_term_id(lookup_term TEXT) CREATE OR REPLACE FUNCTION getorcreate_full_word(norm_term TEXT, lookup_terms TEXT[],
RETURNS INTEGER OUT full_token INT,
OUT partial_tokens INT[])
AS $$ AS $$
DECLARE DECLARE
return_id INTEGER; partial_terms TEXT[] = '{}'::TEXT[];
term TEXT;
term_id INTEGER;
term_count INTEGER; term_count INTEGER;
BEGIN BEGIN
SELECT min(word_id), max(search_name_count) INTO return_id, term_count SELECT min(word_id) INTO full_token
FROM word WHERE word_token = lookup_term and class is null and type is null; FROM word WHERE word = norm_term and class is null and country_code is null;
IF return_id IS NULL THEN IF full_token IS NULL THEN
return_id := nextval('seq_word'); full_token := nextval('seq_word');
INSERT INTO word (word_id, word_token, search_name_count) INSERT INTO word (word_id, word_token, word, search_name_count)
VALUES (return_id, lookup_term, 0); SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term;
ELSEIF left(lookup_term, 1) = ' ' and term_count > {{ max_word_freq }} THEN
return_id := 0;
END IF; END IF;
RETURN return_id; FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
term := trim(term);
IF NOT (ARRAY[term] <@ partial_terms) THEN
partial_terms := partial_terms || term;
END IF;
END LOOP;
partial_tokens := '{}'::INT[];
FOR term IN SELECT unnest(partial_terms) LOOP
SELECT min(word_id), max(search_name_count) INTO term_id, term_count
FROM word WHERE word_token = term and class is null and country_code is null;
IF term_id IS NULL THEN
term_id := nextval('seq_word');
term_count := 0;
INSERT INTO word (word_id, word_token, search_name_count)
VALUES (term_id, term, 0);
END IF;
IF term_count < {{ max_word_freq }} THEN
partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
END IF;
END LOOP;
END; END;
$$ $$
LANGUAGE plpgsql; LANGUAGE plpgsql;

View File

@@ -0,0 +1,111 @@
"""
Processor for names that are imported into the database based on the
ICU library.
"""
import json
import itertools
from icu import Transliterator
import datrie
from nominatim.db.properties import set_property, get_property
DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
class ICUNameProcessorRules:
    """ Container for all rules required by the ICU name processor.

        The rules are taken either from an ICURuleLoader instance or,
        when a database connection is supplied, read back from the
        property table of that database.
    """

    def __init__(self, loader=None, conn=None):
        if loader is not None:
            self.norm_rules = loader.get_normalization_rules()
            self.trans_rules = loader.get_transliteration_rules()
            self.replacements = loader.get_replacement_pairs()
            self.search_rules = loader.get_search_rules()
        elif conn is not None:
            self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
            self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
            self.replacements = json.loads(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
            self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
        else:
            assert False, "Parameter loader or conn required."

        # Collect every character that appears in the replacement pairs.
        # The character set is needed later when building the trie.
        self.replacement_charset = ''.join({char
                                            for full, repl in self.replacements
                                            for term in itertools.chain([full], repl)
                                            for char in term})

    def save_rules(self, conn):
        """ Persist the rules in the property table of the given database.
            They can be restored later by handing a connection to the
            constructor of this class.
        """
        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
        set_property(conn, DBCFG_IMPORT_REPLACEMENTS, json.dumps(self.replacements))
        set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
class ICUNameProcessor:
    """ Apply the configured normalization, transliteration and
        abbreviation replacement rules to names.

        Built from an ICUNameProcessorRules object; the replacement
        pairs are loaded into a datrie.Trie for longest-prefix lookup.
    """

    def __init__(self, rules):
        # Normalization applied before variants are computed.
        self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                         rules.norm_rules)
        # Transliteration of a (normalized) name into its ascii form.
        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
                                                       rules.trans_rules)
        # Combined rule set applied to incoming search queries.
        self.search = Transliterator.createFromRules("icu_search",
                                                     rules.search_rules)

        # Trie mapping each replaceable sequence to its list of replacements.
        # The charset was precomputed from the replacement pairs.
        self.replacements = datrie.Trie(rules.replacement_charset)
        for full, repl in rules.replacements:
            self.replacements[full] = repl

    def get_normalized(self, name):
        """ Normalize the given name, i.e. remove all elements not relevant
            for search.
        """
        return self.normalizer.transliterate(name)

    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized name
            and transliterate the result.
        """
        # Pad with spaces so replacement keys that encode word boundaries
        # (leading/trailing blanks) can match at the ends of the name.
        baseform = ' ' + norm_name + ' '
        variants = ['']
        # startpos marks the end of the last applied replacement,
        # pos is the current scan position.
        startpos = 0
        pos = 0
        while pos < len(baseform):
            # Longest replacement key starting at pos, or (None, None).
            full, repl = self.replacements.longest_prefix_item(baseform[pos:],
                                                               (None, None))
            if full is not None:
                # Copy the unreplaced stretch, then fan out each existing
                # variant with every possible replacement (cartesian product).
                done = baseform[startpos:pos]
                variants = [v + done + r for v, r in itertools.product(variants, repl)]
                # Continue scanning after the matched sequence.
                startpos = pos + len(full)
                pos = startpos
            else:
                pos += 1

        # No replacement ever matched: the only variant is the name itself.
        if startpos == 0:
            return [self.to_ascii.transliterate(norm_name)]

        # Append the trailing unreplaced remainder and transliterate each variant.
        return [self.to_ascii.transliterate(v + baseform[startpos:pos]).strip() for v in variants]

    def get_search_normalized(self, name):
        """ Return the normalized version of the name (including transliteration)
            to be applied at search time.
        """
        return self.search.transliterate(name)

View File

@@ -0,0 +1,161 @@
"""
Helper class to create ICU rules from a configuration file.
"""
import io
import yaml
import logging
from collections import defaultdict
import itertools
from icu import Transliterator
from nominatim.errors import UsageError
LOG = logging.getLogger()
class ICURuleLoader:
    """ Compiler for ICU rules from a tokenizer configuration file.

        Reads normalization, transliteration, compound-suffix and
        abbreviation sections from a YAML configuration and exposes them
        in the forms needed by the ICU name processor.
    """

    def __init__(self, configfile):
        self.configfile = configfile

        if configfile.suffix == '.yaml':
            self._load_from_yaml()
        else:
            raise UsageError("Unknown format of tokenizer configuration.")

    def get_search_rules(self):
        """ Returns the ICU rules to be used during search.
            The rules combine normalization, compound decomposition (including
            abbreviated compounds) and transliteration.
        """
        # First apply the normalization rules.
        rules = io.StringIO()
        rules.write(self.normalization_rules)

        # For all compound suffixes: add them in their full and any
        # abbreviated form.
        suffixes = set()
        for suffix in self.compound_suffixes:
            suffixes.add(suffix)
            suffixes.update(self.abbreviations.get(suffix, []))

        # Longer suffixes first, so that the longest one wins a match.
        for suffix in sorted(suffixes, key=len, reverse=True):
            rules.write("'{0} ' > ' {0} ';".format(suffix))

        # Finally add transliteration.
        rules.write(self.transliteration_rules)
        return rules.getvalue()

    def get_normalization_rules(self):
        """ Return rules for normalisation of a term.
        """
        return self.normalization_rules

    def get_transliteration_rules(self):
        """ Return the rules for converting a string into its ascii
            representation.
        """
        return self.transliteration_rules

    def get_replacement_pairs(self):
        """ Returns the list of possible compound decompositions with
            application of abbreviations included.
            The result is a list of pairs: the first item is the sequence to
            replace, the second is a list of replacements.
        """
        synonyms = defaultdict(set)

        for full, abbr in self.abbreviations.items():
            key = ' ' + full + ' '
            # Entries in the abbreviation list always apply to full words:
            synonyms[key].update(' ' + a + ' ' for a in abbr)
            # Replacements are optional, so add a noop.
            synonyms[key].add(key)

        # Entries in the compound list expand to themselves and to
        # abbreviations.
        for suffix in self.compound_suffixes:
            keyset = synonyms[suffix + ' ']
            keyset.add(' ' + suffix + ' ')
            keyset.update(' ' + a + ' ' for a in self.abbreviations.get(suffix, []))
            # The terms the entries are shortened to need to be
            # decompounded as well.
            for abbr in self.abbreviations.get(suffix, []):
                synonyms[abbr + ' '].add(' ' + abbr + ' ')

        # Sort the resulting list by descending length (longer matches
        # are preferred).
        sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)

        return [(k, list(synonyms[k])) for k in sorted_keys]

    def _load_from_yaml(self):
        # safe_load: the configuration is plain data; the full loader would
        # allow instantiation of arbitrary Python objects from the file.
        rules = yaml.safe_load(self.configfile.read_text())

        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
        self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
        self._parse_compound_suffix_list(self._get_section(rules, 'compound_suffixes'))
        self._parse_abbreviation_list(self._get_section(rules, 'abbreviations'))

    def _get_section(self, rules, section):
        """ Get the section named 'section' from the rules. If the section does
            not exist, raise a usage error with a meaningful message.
        """
        if section not in rules:
            LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
                      section, str(self.configfile))
            raise UsageError("Syntax error in tokenizer configuration file.")

        return rules[section]

    def _cfg_to_icu_rules(self, rules, section):
        """ Load an ICU ruleset from the given section. If the section is a
            simple string, it is interpreted as a file name and the rules are
            loaded verbatim from the given file. The filename is expected to be
            relative to the tokenizer rule file. If the section is a list then
            each line is assumed to be a rule. All rules are concatenated
            and returned.
        """
        content = self._get_section(rules, section)

        if isinstance(content, str):
            return (self.configfile.parent / content).read_text().replace('\n', ' ')

        return ';'.join(content) + ';'

    def _parse_compound_suffix_list(self, rules):
        """ Load the compound suffix list, normalising each entry.
        """
        if not rules:
            self.compound_suffixes = set()
            return

        norm = Transliterator.createFromRules("rule_loader_normalization",
                                              self.normalization_rules)

        # Make sure all suffixes are in their normalised form.
        self.compound_suffixes = {norm.transliterate(s) for s in rules}

    def _parse_abbreviation_list(self, rules):
        """ Parse 'full => abbreviation' rules into a lookup table mapping
            each full term to its list of abbreviations.
        """
        self.abbreviations = defaultdict(list)

        if not rules:
            return

        norm = Transliterator.createFromRules("rule_loader_normalization",
                                              self.normalization_rules)

        for rule in rules:
            parts = rule.split('=>')
            if len(parts) != 2:
                LOG.fatal("Syntax error in abbreviation section, line: %s", rule)
                raise UsageError("Syntax error in tokenizer configuration file.")

            # Make sure all terms match the normalised version.
            fullterms = (norm.transliterate(t.strip()) for t in parts[0].split(','))
            abbrterms = (norm.transliterate(t.strip()) for t in parts[1].split(','))

            for full, abbr in itertools.product(fullterms, abbrterms):
                self.abbreviations[full].append(abbr)

View File

@@ -18,11 +18,11 @@ import psycopg2.extras
from nominatim.db.connection import connect from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property from nominatim.db.properties import set_property, get_property
from nominatim.db.sql_preprocessor import SQLPreprocessor from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq" DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TRANSLITERATION = "tokenizer_transliteration" DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"
LOG = logging.getLogger() LOG = logging.getLogger()
@@ -41,9 +41,9 @@ class LegacyICUTokenizer:
def __init__(self, dsn, data_dir): def __init__(self, dsn, data_dir):
self.dsn = dsn self.dsn = dsn
self.data_dir = data_dir self.data_dir = data_dir
self.normalization = None self.naming_rules = None
self.transliteration = None self.term_normalization = None
self.abbreviations = None self.max_word_frequency = None
def init_new_db(self, config, init_db=True): def init_new_db(self, config, init_db=True):
@@ -55,14 +55,14 @@ class LegacyICUTokenizer:
if config.TOKENIZER_CONFIG: if config.TOKENIZER_CONFIG:
cfgfile = Path(config.TOKENIZER_CONFIG) cfgfile = Path(config.TOKENIZER_CONFIG)
else: else:
cfgfile = config.config_dir / 'legacy_icu_tokenizer.json' cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
rules = json.loads(cfgfile.read_text()) loader = ICURuleLoader(cfgfile)
self._load_transliteration(rules['normalization'], cfgfile.parent) self.naming_rules = ICUNameProcessorRules(loader=loader)
self.abbreviations = rules["abbreviations"] self.term_normalization = config.TERM_NORMALIZATION
self.normalization = config.TERM_NORMALIZATION self.max_word_frequency = config.MAX_WORD_FREQUENCY
self._install_php(config) self._install_php(config.lib_dir.php)
self._save_config(config) self._save_config(config)
if init_db: if init_db:
@@ -70,19 +70,13 @@ class LegacyICUTokenizer:
self._init_db_tables(config) self._init_db_tables(config)
def _load_transliteration(self, rules, cfg_path):
if isinstance(rules, str):
self.transliteration = (cfg_path / rules).read_text().replace('\n', ' ')
else:
self.transliteration = ';'.join(rules) + ';'
def init_from_project(self): def init_from_project(self):
""" Initialise the tokenizer from the project directory. """ Initialise the tokenizer from the project directory.
""" """
with connect(self.dsn) as conn: with connect(self.dsn) as conn:
self.normalization = get_property(conn, DBCFG_NORMALIZATION) self.naming_rules = ICUNameProcessorRules(conn=conn)
self.transliteration = get_property(conn, DBCFG_TRANSLITERATION) self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS)) self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
def finalize_import(self, config): def finalize_import(self, config):
@@ -132,26 +126,20 @@ class LegacyICUTokenizer:
Analyzers are not thread-safe. You need to instantiate one per thread. Analyzers are not thread-safe. You need to instantiate one per thread.
""" """
norm = Transliterator.createFromRules("normalizer", self.normalization) return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
trans = Transliterator.createFromRules("trans", self.transliteration)
return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
def _install_php(self, config): def _install_php(self, phpdir):
""" Install the php script for the tokenizer. """ Install the php script for the tokenizer.
""" """
abbr_inverse = list(zip(*self.abbreviations))
php_file = self.data_dir / "tokenizer.php" php_file = self.data_dir / "tokenizer.php"
php_file.write_text(dedent("""\ php_file.write_text(dedent("""\
<?php <?php
@define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY}); @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
@define('CONST_Term_Normalization_Rules', "{0.normalization}"); @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
@define('CONST_Transliteration', "{0.transliteration}"); @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
@define('CONST_Abbreviations', array(array('{2}'), array('{3}'))); require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php'); """.format(self, phpdir)))
""".format(self, config,
"','".join(abbr_inverse[0]),
"','".join(abbr_inverse[1]))))
def _save_config(self, config): def _save_config(self, config):
@@ -159,10 +147,10 @@ class LegacyICUTokenizer:
database as database properties. database as database properties.
""" """
with connect(self.dsn) as conn: with connect(self.dsn) as conn:
set_property(conn, DBCFG_NORMALIZATION, self.normalization) self.naming_rules.save_rules(conn)
set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY) set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
set_property(conn, DBCFG_TRANSLITERATION, self.transliteration) set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
def _init_db_tables(self, config): def _init_db_tables(self, config):
@@ -178,15 +166,14 @@ class LegacyICUTokenizer:
# get partial words and their frequencies # get partial words and their frequencies
words = Counter() words = Counter()
with self.name_analyzer() as analyzer: name_proc = ICUNameProcessor(self.naming_rules)
with conn.cursor(name="words") as cur: with conn.cursor(name="words") as cur:
cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v") cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
for name, cnt in cur: for name, cnt in cur:
term = analyzer.make_standard_word(name) for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
if term: for term in word.split():
for word in term.split(): words[term] += cnt
words[word] += cnt
# copy them back into the word table # copy them back into the word table
copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items()))) copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
@@ -208,12 +195,10 @@ class LegacyICUNameAnalyzer:
normalization. normalization.
""" """
def __init__(self, dsn, normalizer, transliterator, abbreviations): def __init__(self, dsn, name_proc):
self.conn = connect(dsn).connection self.conn = connect(dsn).connection
self.conn.autocommit = True self.conn.autocommit = True
self.normalizer = normalizer self.name_processor = name_proc
self.transliterator = transliterator
self.abbreviations = abbreviations
self._cache = _TokenCache() self._cache = _TokenCache()
@@ -248,9 +233,9 @@ class LegacyICUNameAnalyzer:
tokens = {} tokens = {}
for word in words: for word in words:
if word.startswith('#'): if word.startswith('#'):
tokens[word] = ' ' + self.make_standard_word(word[1:]) tokens[word] = ' ' + self.name_processor.get_normalized(word[1:])
else: else:
tokens[word] = self.make_standard_word(word) tokens[word] = self.name_processor.get_normalized(word)
with conn.cursor() as cur: with conn.cursor() as cur:
cur.execute("""SELECT word_token, word_id cur.execute("""SELECT word_token, word_id
@@ -263,12 +248,6 @@ class LegacyICUNameAnalyzer:
return [(k, v, ids[v]) for k, v in tokens.items()] return [(k, v, ids[v]) for k, v in tokens.items()]
def normalize(self, phrase):
""" Normalize the given phrase, i.e. remove all properties that
are irrelevant for search.
"""
return self.normalizer.transliterate(phrase)
@staticmethod @staticmethod
def normalize_postcode(postcode): def normalize_postcode(postcode):
""" Convert the postcode to a standardized form. """ Convert the postcode to a standardized form.
@@ -279,27 +258,12 @@ class LegacyICUNameAnalyzer:
return postcode.strip().upper() return postcode.strip().upper()
@functools.lru_cache(maxsize=1024)
def make_standard_word(self, name):
""" Create the normalised version of the input.
"""
norm = ' ' + self.transliterator.transliterate(name) + ' '
for full, abbr in self.abbreviations:
if full in norm:
norm = norm.replace(full, abbr)
return norm.strip()
def _make_standard_hnr(self, hnr): def _make_standard_hnr(self, hnr):
""" Create a normalised version of a housenumber. """ Create a normalised version of a housenumber.
This function takes minor shortcuts on transliteration. This function takes minor shortcuts on transliteration.
""" """
if hnr.isdigit(): return self.name_processor.get_search_normalized(hnr)
return hnr
return self.transliterator.transliterate(hnr)
def update_postcodes_from_db(self): def update_postcodes_from_db(self):
""" Update postcode tokens in the word table from the location_postcode """ Update postcode tokens in the word table from the location_postcode
@@ -325,7 +289,7 @@ class LegacyICUNameAnalyzer:
else: else:
copystr.write(postcode) copystr.write(postcode)
copystr.write('\t ') copystr.write('\t ')
copystr.write(self.transliterator.transliterate(postcode)) copystr.write(self.name_processor.get_search_normalized(postcode))
copystr.write('\tplace\tpostcode\t0\n') copystr.write('\tplace\tpostcode\t0\n')
if to_delete: if to_delete:
@@ -344,7 +308,7 @@ class LegacyICUNameAnalyzer:
def update_special_phrases(self, phrases, should_replace): def update_special_phrases(self, phrases, should_replace):
""" Replace the search index for special phrases with the new phrases. """ Replace the search index for special phrases with the new phrases.
""" """
norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3]) norm_phrases = set(((self.name_processor.get_search_normalized(p[0]), p[1], p[2], p[3])
for p in phrases)) for p in phrases))
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
@@ -362,7 +326,7 @@ class LegacyICUNameAnalyzer:
if to_add: if to_add:
copystr = io.StringIO() copystr = io.StringIO()
for word, cls, typ, oper in to_add: for word, cls, typ, oper in to_add:
term = self.make_standard_word(word) term = self.name_processor.get_search_normalized(word)
if term: if term:
copystr.write(word) copystr.write(word)
copystr.write('\t ') copystr.write('\t ')
@@ -395,15 +359,11 @@ class LegacyICUNameAnalyzer:
def add_country_names(self, country_code, names): def add_country_names(self, country_code, names):
""" Add names for the given country to the search index. """ Add names for the given country to the search index.
""" """
full_names = set((self.make_standard_word(n) for n in names)) word_tokens = set()
full_names.discard('') for name in self._compute_full_names(names):
self._add_normalized_country_names(country_code, full_names) if name:
word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
def _add_normalized_country_names(self, country_code, names):
""" Add names for the given country to the search index.
"""
word_tokens = set((' ' + name for name in names))
with self.conn.cursor() as cur: with self.conn.cursor() as cur:
# Get existing names # Get existing names
cur.execute("SELECT word_token FROM word WHERE country_code = %s", cur.execute("SELECT word_token FROM word WHERE country_code = %s",
@@ -429,14 +389,13 @@ class LegacyICUNameAnalyzer:
names = place.get('name') names = place.get('name')
if names: if names:
full_names = self._compute_full_names(names) fulls, partials = self._compute_name_tokens(names)
token_info.add_names(self.conn, full_names) token_info.add_names(fulls, partials)
country_feature = place.get('country_feature') country_feature = place.get('country_feature')
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
self._add_normalized_country_names(country_feature.lower(), self.add_country_names(country_feature.lower(), names)
full_names)
address = place.get('address') address = place.get('address')
@@ -449,38 +408,60 @@ class LegacyICUNameAnalyzer:
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'): elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(value) hnrs.append(value)
elif key == 'street': elif key == 'street':
token_info.add_street(self.conn, self.make_standard_word(value)) token_info.add_street(*self._compute_name_tokens({'name': value}))
elif key == 'place': elif key == 'place':
token_info.add_place(self.conn, self.make_standard_word(value)) token_info.add_place(*self._compute_name_tokens({'name': value}))
elif not key.startswith('_') and \ elif not key.startswith('_') and \
key not in ('country', 'full'): key not in ('country', 'full'):
addr_terms.append((key, self.make_standard_word(value))) addr_terms.append((key, *self._compute_name_tokens({'name': value})))
if hnrs: if hnrs:
hnrs = self._split_housenumbers(hnrs) hnrs = self._split_housenumbers(hnrs)
token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs]) token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
if addr_terms: if addr_terms:
token_info.add_address_terms(self.conn, addr_terms) token_info.add_address_terms(addr_terms)
return token_info.data return token_info.data
def _compute_name_tokens(self, names):
""" Computes the full name and partial name tokens for the given
dictionary of names.
"""
full_names = self._compute_full_names(names)
full_tokens = set()
partial_tokens = set()
for name in full_names:
norm_name = self.name_processor.get_normalized(name)
full, part = self._cache.names.get(norm_name, (None, None))
if full is None:
variants = self.name_processor.get_variants_ascii(norm_name)
with self.conn.cursor() as cur:
cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
(norm_name, variants))
full, part = cur.fetchone()
self._cache.names[norm_name] = (full, part)
full_tokens.add(full)
partial_tokens.update(part)
return full_tokens, partial_tokens
def _compute_full_names(self, names): def _compute_full_names(self, names):
""" Return the set of all full name word ids to be used with the """ Return the set of all full name word ids to be used with the
given dictionary of names. given dictionary of names.
""" """
full_names = set() full_names = set()
for name in (n for ns in names.values() for n in re.split('[;,]', ns)): for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
word = self.make_standard_word(name) full_names.add(name.strip())
if word:
full_names.add(word)
brace_split = name.split('(', 2) brace_idx = name.find('(')
if len(brace_split) > 1: if brace_idx >= 0:
word = self.make_standard_word(brace_split[0]) full_names.add(name[:brace_idx].strip())
if word:
full_names.add(word)
return full_names return full_names
@@ -492,7 +473,7 @@ class LegacyICUNameAnalyzer:
postcode = self.normalize_postcode(postcode) postcode = self.normalize_postcode(postcode)
if postcode not in self._cache.postcodes: if postcode not in self._cache.postcodes:
term = self.make_standard_word(postcode) term = self.name_processor.get_search_normalized(postcode)
if not term: if not term:
return return
@@ -508,6 +489,7 @@ class LegacyICUNameAnalyzer:
""", (' ' + term, postcode)) """, (' ' + term, postcode))
self._cache.postcodes.add(postcode) self._cache.postcodes.add(postcode)
@staticmethod @staticmethod
def _split_housenumbers(hnrs): def _split_housenumbers(hnrs):
if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]: if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
@@ -530,7 +512,7 @@ class _TokenInfo:
""" Collect token information to be sent back to the database. """ Collect token information to be sent back to the database.
""" """
def __init__(self, cache): def __init__(self, cache):
self.cache = cache self._cache = cache
self.data = {} self.data = {}
@staticmethod @staticmethod
@@ -538,86 +520,44 @@ class _TokenInfo:
return '{%s}' % ','.join((str(s) for s in tokens)) return '{%s}' % ','.join((str(s) for s in tokens))
def add_names(self, conn, names): def add_names(self, fulls, partials):
""" Adds token information for the normalised names. """ Adds token information for the normalised names.
""" """
# Start with all partial names self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
terms = set((part for ns in names for part in ns.split()))
# Add the full names
terms.update((' ' + n for n in names))
self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
def add_housenumbers(self, conn, hnrs): def add_housenumbers(self, conn, hnrs):
""" Extract housenumber information from a list of normalised """ Extract housenumber information from a list of normalised
housenumbers. housenumbers.
""" """
self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs)) self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
self.data['hnr'] = ';'.join(hnrs) self.data['hnr'] = ';'.join(hnrs)
def add_street(self, conn, street): def add_street(self, fulls, partials):
""" Add addr:street match terms. """ Add addr:street match terms.
""" """
if not street: if fulls:
return self.data['street'] = self._mk_array(fulls)
term = ' ' + street
tid = self.cache.names.get(term)
if tid is None:
with conn.cursor() as cur:
cur.execute("""SELECT word_id FROM word
WHERE word_token = %s
and class is null and type is null""",
(term, ))
if cur.rowcount > 0:
tid = cur.fetchone()[0]
self.cache.names[term] = tid
if tid is not None:
self.data['street'] = '{%d}' % tid
def add_place(self, conn, place): def add_place(self, fulls, partials):
""" Add addr:place search and match terms. """ Add addr:place search and match terms.
""" """
if not place: if fulls:
return self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
self.data['place_match'] = self._mk_array(fulls)
partial_ids = self.cache.get_term_tokens(conn, place.split())
tid = self.cache.get_term_tokens(conn, [' ' + place])
self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
self.data['place_match'] = '{%s}' % tid[0]
def add_address_terms(self, conn, terms): def add_address_terms(self, terms):
""" Add additional address terms. """ Add additional address terms.
""" """
tokens = {} tokens = {}
for key, value in terms: for key, fulls, partials in terms:
if not value: if fulls:
continue tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
partial_ids = self.cache.get_term_tokens(conn, value.split()) self._mk_array(fulls)]
term = ' ' + value
tid = self.cache.names.get(term)
if tid is None:
with conn.cursor() as cur:
cur.execute("""SELECT word_id FROM word
WHERE word_token = %s
and class is null and type is null""",
(term, ))
if cur.rowcount > 0:
tid = cur.fetchone()[0]
self.cache.names[term] = tid
tokens[key] = [self._mk_array(partial_ids),
'{%s}' % ('' if tid is None else str(tid))]
if tokens: if tokens:
self.data['addr'] = tokens self.data['addr'] = tokens
@@ -635,32 +575,6 @@ class _TokenCache:
self.housenumbers = {} self.housenumbers = {}
def get_term_tokens(self, conn, terms):
""" Get token ids for a list of terms, looking them up in the database
if necessary.
"""
tokens = []
askdb = []
for term in terms:
token = self.names.get(term)
if token is None:
askdb.append(term)
elif token != 0:
tokens.append(token)
if askdb:
with conn.cursor() as cur:
cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
(askdb, ))
for term, tid in cur:
self.names[term] = tid
if tid != 0:
tokens.append(tid)
return tokens
def get_hnr_tokens(self, conn, terms): def get_hnr_tokens(self, conn, terms):
""" Get token ids for a list of housenumbers, looking them up in the """ Get token ids for a list of housenumbers, looking them up in the
database if necessary. database if necessary.

View File

@@ -404,7 +404,7 @@ class LegacyNameAnalyzer:
FROM unnest(%s)n) y FROM unnest(%s)n) y
WHERE NOT EXISTS(SELECT * FROM word WHERE NOT EXISTS(SELECT * FROM word
WHERE word_token = lookup_token and country_code = %s)) WHERE word_token = lookup_token and country_code = %s))
""", (country_code, names, country_code)) """, (country_code, list(names.values()), country_code))
def process_place(self, place): def process_place(self, place):
@@ -422,7 +422,7 @@ class LegacyNameAnalyzer:
country_feature = place.get('country_feature') country_feature = place.get('country_feature')
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
self.add_country_names(country_feature.lower(), list(names.values())) self.add_country_names(country_feature.lower(), names)
address = place.get('address') address = place.get('address')

View File

@@ -272,15 +272,15 @@ def create_country_names(conn, tokenizer, languages=None):
with tokenizer.name_analyzer() as analyzer: with tokenizer.name_analyzer() as analyzer:
for code, name in cur: for code, name in cur:
names = [code] names = {'countrycode' : code}
if code == 'gb': if code == 'gb':
names.append('UK') names['short_name'] = 'UK'
if code == 'us': if code == 'us':
names.append('United States') names['short_name'] = 'United States'
# country names (only in languages as provided) # country names (only in languages as provided)
if name: if name:
names.extend((v for k, v in name.items() if _include_key(k))) names.update(((k, v) for k, v in name.items() if _include_key(k)))
analyzer.add_country_names(code, names) analyzer.add_country_names(code, names)

View File

@@ -0,0 +1,116 @@
normalization:
- ":: NFD ()"
- "[[:Nonspacing Mark:] [:Cf:]] >"
- ":: lower ()"
- "ß > 'ss'" # German szet is unimbigiously equal to double ss
- "[[:Punctuation:][:Space:]]+ > ' '"
- ":: NFC ()"
transliteration: icu_transliteration.rules
compound_suffixes:
# Danish
- hal
- hallen
- hallerne
# German
- berg
- brücke
- fabrik
- gasse
- graben
- haus
- höhle
- hütte
- kapelle
- kogel
- pfad
- platz
- quelle
- spitze
- stiege
- strasse
- teich
- universität
- wald
- weg
- wiese
# Dutch
- gracht
- laan
- markt
- plein
- straat
- vliet
- weg
# Norwegian
- vei
- veien
- veg
- vegen
- gate
- gaten
- gata
- plass
- plassen
- sving
- svingen
# Finnish
- alue
- asema
- aukio
- kaari
- katu
- kuja
- kylä
- penger
- polku
- puistikko
- puisto
- raitti
- ranta
- rinne
- taival
- tie
- tori
- väylä
# Swedish
- väg
- vägen
- gatan
- gata
- gränd
- gränden
- stig
- stigen
- plats
- platsen
abbreviations:
# German
- am => a
- an der => a d
- allgemeines krankenhaus => akh
- altstoffsammelzentrum => asz
- auf der => a d
- bach => b
- bad => b
- bahnhof => bhf,bf
- berg => bg
- bezirk => bez
- brücke => br
- burg => bg
- chaussee => ch
- deutsche,deutscher,deutsches => dt
- dorf => df
- doktor => dr
- fachhochschule => fh
- Freiwillige Feuerwehr => ff
- sankt => st
- strasse => str
- weg => wg
# English
- alley => al
- beach => bch
- street => st
- road => rd
- bridge => brdg

View File

@@ -0,0 +1,60 @@
"""
Tests for import name normalisation and variant generation.
"""
from textwrap import dedent
import pytest
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
from nominatim.errors import UsageError
@pytest.fixture
def cfgfile(tmp_path, suffix='.yaml'):
    """ Fixture returning a factory that writes a tokenizer config file
        with the given compound suffixes and abbreviation rules and
        returns its path.
    """
    def _make_config(suffixes, abbr):
        # Fixed normalization/transliteration preamble shared by all tests.
        preamble = dedent("""\
            normalization:
                - ":: NFD ()"
                - "[[:Nonspacing Mark:] [:Cf:]] >"
                - ":: lower ()"
                - "[[:Punctuation:][:Space:]]+ > ' '"
                - ":: NFC ()"
            transliteration:
                - ":: Latin ()"
            """)
        parts = [preamble,
                 "compound_suffixes:\n",
                 '\n'.join(" - " + s for s in suffixes) + '\n',
                 "abbreviations:\n",
                 '\n'.join(" - " + a for a in abbr) + '\n']

        cfgpath = tmp_path / ('test_config' + suffix)
        cfgpath.write_text(dedent(''.join(parts)))
        return cfgpath

    return _make_config
def test_simple_variants(cfgfile):
    # Suffix decomposition plus abbreviation expansion on single names.
    config = cfgfile(['strasse', 'straße', 'weg'],
                     ['strasse,straße => str',
                      'prospekt => pr'])

    proc = ICUNameProcessor(ICUNameProcessorRules(loader=ICURuleLoader(config)))

    assert set(proc.get_normalized_variants("Bauwegstraße")) \
            == {'bauweg straße', 'bauweg str'}
    for name, variants in (("Bauwegstr", ['bauweg str']),
                           ("holzweg", ['holz weg']),
                           ("hallo", ['hallo'])):
        assert proc.get_normalized_variants(name) == variants
def test_multiple_replacements(cfgfile):
    # Two abbreviation rules applying to the same phrase must yield the
    # full cross product of variants.
    config = cfgfile([], ['saint => s,st', 'street => st'])

    proc = ICUNameProcessor(ICUNameProcessorRules(loader=ICURuleLoader(config)))

    variants = proc.get_normalized_variants("Saint Johns Street")
    assert set(variants) == {'saint johns street', 's johns street',
                             'st johns street', 'saint johns st',
                             's johns st', 'st johns st'}

View File

@@ -0,0 +1,75 @@
"""
Tests for converting a config file to ICU rules.
"""
import pytest
from textwrap import dedent
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.errors import UsageError
from icu import Transliterator
@pytest.fixture
def cfgfile(tmp_path, suffix='.yaml'):
    """ Fixture returning a factory that writes a rule-loader config file
        with the given compound suffixes and abbreviation rules and
        returns its path.
    """
    def _make_config(suffixes, abbr):
        # Fixed normalization/transliteration preamble shared by all tests.
        preamble = dedent("""\
            normalization:
                - ":: NFD ()"
                - "[[:Nonspacing Mark:] [:Cf:]] >"
                - ":: lower ()"
                - "[[:Punctuation:][:Space:]]+ > ' '"
                - ":: NFC ()"
            transliteration:
                - ":: Latin ()"
            """)
        parts = [preamble,
                 "compound_suffixes:\n",
                 '\n'.join(" - " + s for s in suffixes) + '\n',
                 "abbreviations:\n",
                 '\n'.join(" - " + a for a in abbr) + '\n']

        cfgpath = tmp_path / ('test_config' + suffix)
        cfgpath.write_text(dedent(''.join(parts)))
        return cfgpath

    return _make_config
def test_missing_normalization(tmp_path):
    # The config deliberately misspells the mandatory 'normalization'
    # section; the loader must reject it.
    bad_cfg = tmp_path / 'test_config.yaml'
    bad_cfg.write_text(dedent("""\
        normalizatio:
            - ":: NFD ()"
        """))

    with pytest.raises(UsageError):
        ICURuleLoader(bad_cfg)
def test_get_search_rules(cfgfile):
    # Search rules must split compound suffixes but keep abbreviations
    # untouched, on top of the Latin transliteration.
    config = cfgfile(['strasse', 'straße', 'weg'],
                     ['strasse,straße => str',
                      'prospekt => pr'])

    rules = ICURuleLoader(config).get_search_rules()
    trans = Transliterator.createFromRules("test", rules)

    expected = ((" Baumstraße ", " baum straße "),
                (" Baumstrasse ", " baum strasse "),
                (" Baumstr ", " baum str "),
                (" Baumwegstr ", " baumweg str "),
                (" Αθήνα ", " athēna "),
                (" проспект ", " prospekt "))
    for given, transliterated in expected:
        assert trans.transliterate(given) == transliterated
def test_get_synonym_pairs(cfgfile):
    # Replacement pairs cover the suffix both as a separate word and
    # glued to the previous word; each maps to all abbreviation variants.
    config = cfgfile(['Weg', 'Strasse'],
                     ['Strasse => str,st'])

    pairs = ICURuleLoader(config).get_replacement_pairs()

    strasse_variants = {' strasse ', ' str ', ' st '}
    assert pairs == [(' strasse ', strasse_variants),
                     ('strasse ', strasse_variants),
                     ('weg ', {' weg '})]