diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php
index 09cfe70f..92dd7272 100644
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
@@ -47,9 +47,7 @@ class Tokenizer
 
     private function makeStandardWord($sTerm)
     {
-        $sNorm = ' '.$this->oTransliterator->transliterate($sTerm).' ';
-
-        return trim(str_replace(CONST_Abbreviations[0], CONST_Abbreviations[1], $sNorm));
+        return trim($this->oTransliterator->transliterate(' '.$sTerm.' '));
     }
 
 
@@ -90,6 +88,7 @@ class Tokenizer
         foreach ($aPhrases as $iPhrase => $oPhrase) {
             $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase());
             $sPhrase = $this->makeStandardWord($oPhrase->getPhrase());
+            Debug::printVar('Phrase', $sPhrase);
             if (strlen($sPhrase) > 0) {
                 $aWords = explode(' ', $sPhrase);
                 Tokenizer::addTokens($aTokens, $aWords);
diff --git a/lib-sql/tokenizer/legacy_icu_tokenizer.sql b/lib-sql/tokenizer/legacy_icu_tokenizer.sql
index 8fd0ede4..686137de 100644
--- a/lib-sql/tokenizer/legacy_icu_tokenizer.sql
+++ b/lib-sql/tokenizer/legacy_icu_tokenizer.sql
@@ -87,25 +87,48 @@ $$ LANGUAGE SQL IMMUTABLE STRICT;
 
 --------------- private functions ----------------------------------------------
 
-CREATE OR REPLACE FUNCTION getorcreate_term_id(lookup_term TEXT)
-  RETURNS INTEGER
+CREATE OR REPLACE FUNCTION getorcreate_full_word(norm_term TEXT, lookup_terms TEXT[],
+                                                 OUT full_token INT,
+                                                 OUT partial_tokens INT[])
   AS $$
 DECLARE
-  return_id INTEGER;
+  partial_terms TEXT[] = '{}'::TEXT[];
+  term TEXT;
+  term_id INTEGER;
   term_count INTEGER;
 BEGIN
-  SELECT min(word_id), max(search_name_count) INTO return_id, term_count
-    FROM word WHERE word_token = lookup_term and class is null and type is null;
+  SELECT min(word_id) INTO full_token
+    FROM word WHERE word = norm_term and class is null and country_code is null;
 
-  IF return_id IS NULL THEN
-    return_id := nextval('seq_word');
-    INSERT INTO word (word_id, word_token, search_name_count)
-      VALUES (return_id, lookup_term, 0);
-  ELSEIF left(lookup_term, 1) = ' ' and term_count > {{ max_word_freq }} THEN
-    return_id := 0;
+  IF full_token IS NULL THEN
+    full_token := nextval('seq_word');
+    INSERT INTO word (word_id, word_token, word, search_name_count)
+      SELECT full_token, ' ' || lookup_term, norm_term, 0 FROM unnest(lookup_terms) as lookup_term;
   END IF;
 
-  RETURN return_id;
+  FOR term IN SELECT unnest(string_to_array(unnest(lookup_terms), ' ')) LOOP
+    term := trim(term);
+    IF NOT (ARRAY[term] <@ partial_terms) THEN
+      partial_terms := partial_terms || term;
+    END IF;
+  END LOOP;
+
+  partial_tokens := '{}'::INT[];
+  FOR term IN SELECT unnest(partial_terms) LOOP
+    SELECT min(word_id), max(search_name_count) INTO term_id, term_count
+      FROM word WHERE word_token = term and class is null and country_code is null;
+
+    IF term_id IS NULL THEN
+      term_id := nextval('seq_word');
+      term_count := 0;
+      INSERT INTO word (word_id, word_token, search_name_count)
+        VALUES (term_id, term, 0);
+    END IF;
+
+    IF term_count < {{ max_word_freq }} THEN
+      partial_tokens := array_merge(partial_tokens, ARRAY[term_id]);
+    END IF;
+  END LOOP;
 END;
 $$ LANGUAGE plpgsql;
diff --git a/nominatim/tokenizer/icu_name_processor.py b/nominatim/tokenizer/icu_name_processor.py
new file mode 100644
index 00000000..0e717995
--- /dev/null
+++ b/nominatim/tokenizer/icu_name_processor.py
@@ -0,0 +1,111 @@
+"""
+Processor for names that are imported into the database based on the
+ICU library.
+""" +import json +import itertools + +from icu import Transliterator +import datrie + +from nominatim.db.properties import set_property, get_property + +DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation" +DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration" +DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements" +DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization" + + +class ICUNameProcessorRules: + """ Data object that saves the rules needed for the name processor. + + The rules can either be initialised through an ICURuleLoader or + be loaded from a database when a connection is given. + """ + def __init__(self, loader=None, conn=None): + if loader is not None: + self.norm_rules = loader.get_normalization_rules() + self.trans_rules = loader.get_transliteration_rules() + self.replacements = loader.get_replacement_pairs() + self.search_rules = loader.get_search_rules() + elif conn is not None: + self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES) + self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES) + self.replacements = json.loads(get_property(conn, DBCFG_IMPORT_REPLACEMENTS)) + self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES) + else: + assert False, "Parameter loader or conn required." + + # Compute the set of characters used in the replacement list. + # We need this later when computing the tree. + chars = set() + for full, repl in self.replacements: + chars.update(full) + for word in repl: + chars.update(word) + self.replacement_charset = ''.join(chars) + + + def save_rules(self, conn): + """ Save the rules in the property table of the given database. + the rules can be loaded again by handing in a connection into + the constructor of the class. + """ + set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules) + set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules) + set_property(conn, DBCFG_IMPORT_REPLACEMENTS, json.dumps(self.replacements)) + set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules) + + +class ICUNameProcessor: + + def __init__(self, rules): + self.normalizer = Transliterator.createFromRules("icu_normalization", + rules.norm_rules) + self.to_ascii = Transliterator.createFromRules("icu_to_ascii", + rules.trans_rules) + self.search = Transliterator.createFromRules("icu_search", + rules.search_rules) + + self.replacements = datrie.Trie(rules.replacement_charset) + for full, repl in rules.replacements: + self.replacements[full] = repl + + + def get_normalized(self, name): + """ Normalize the given name, i.e. remove all elements not relevant + for search. + """ + return self.normalizer.transliterate(name) + + def get_variants_ascii(self, norm_name): + """ Compute the spelling variants for the given normalized name + and transliterate the result. + """ + baseform = ' ' + norm_name + ' ' + variants = [''] + + startpos = 0 + pos = 0 + while pos < len(baseform): + full, repl = self.replacements.longest_prefix_item(baseform[pos:], + (None, None)) + if full is not None: + done = baseform[startpos:pos] + variants = [v + done + r for v, r in itertools.product(variants, repl)] + startpos = pos + len(full) + pos = startpos + else: + pos += 1 + + if startpos == 0: + return [self.to_ascii.transliterate(norm_name)] + + return [self.to_ascii.transliterate(v + baseform[startpos:pos]).strip() for v in variants] + + + def get_search_normalized(self, name): + """ Return the normalized version of the name (including transliteration) + to be applied at search time. 
+ """ + return self.search.transliterate(name) diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py new file mode 100644 index 00000000..3b721169 --- /dev/null +++ b/nominatim/tokenizer/icu_rule_loader.py @@ -0,0 +1,161 @@ +""" +Helper class to create ICU rules from a configuration file. +""" +import io +import yaml +import logging +from collections import defaultdict +import itertools + +from icu import Transliterator + +from nominatim.errors import UsageError + +LOG = logging.getLogger() + + +class ICURuleLoader: + """ Compiler for ICU rules from a tokenizer configuration file. + """ + + def __init__(self, configfile): + self.configfile = configfile + + if configfile.suffix == '.yaml': + self._load_from_yaml() + else: + raise UsageError("Unknown format of tokenizer configuration.") + + + def get_search_rules(self): + """ Returns the ICU rules to be used during search. + The rules combine normalization, compound decomposition (including + abbreviated compounds) and transliteration. + """ + # First apply the normalization rules. + rules = io.StringIO() + rules.write(self.normalization_rules) + + # For all compound suffixes: add them in their full and any abbreviated form. + suffixes = set() + for suffix in self.compound_suffixes: + suffixes.add(suffix) + suffixes.update(self.abbreviations.get(suffix, [])) + + for suffix in sorted(suffixes, key=lambda x:len(x), reverse=True): + rules.write("'{0} ' > ' {0} ';".format(suffix)) + + # Finally add transliteration. + rules.write(self.transliteration_rules) + return rules.getvalue() + + def get_normalization_rules(self): + """ Return rules for normalisation of a term. + """ + return self.normalization_rules + + def get_transliteration_rules(self): + """ Return the rules for converting a string into its asciii representation. + """ + return self.transliteration_rules + + def get_replacement_pairs(self): + """ Returns the list of possible compound decompositions with + application of abbreviations included. + The result is a list of pairs: the first item is the sequence to + replace, the second is a list of replacements. + """ + synonyms = defaultdict(set) + + for full, abbr in self.abbreviations.items(): + key = ' ' + full + ' ' + # Entries in the abbreviation list always apply to full words: + synonyms[key].update((' ' + a + ' ' for a in abbr)) + # Replacements are optional, so add a noop + synonyms[key].add(key) + + # Entries in the compound list expand to themselves and to + # abbreviations. + for suffix in self.compound_suffixes: + keyset = synonyms[suffix + ' '] + keyset.add(' ' + suffix + ' ') + keyset.update((' ' + a + ' ' for a in self.abbreviations.get(suffix, []))) + # The terms the entries are shortended to, need to be decompunded as well. + for abbr in self.abbreviations.get(suffix, []): + synonyms[abbr + ' '].add(' ' + abbr + ' ') + + # sort the resulting list by descending length (longer matches are prefered). 
+        sorted_keys = sorted(synonyms.keys(), key=lambda x: len(x), reverse=True)
+
+        return [(k, list(synonyms[k])) for k in sorted_keys]
+
+
+    def _load_from_yaml(self):
+        rules = yaml.load(self.configfile.read_text())
+
+        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
+        self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
+        self._parse_compound_suffix_list(self._get_section(rules, 'compound_suffixes'))
+        self._parse_abbreviation_list(self._get_section(rules, 'abbreviations'))
+
+
+    def _get_section(self, rules, section):
+        """ Get the section named 'section' from the rules. If the section does
+            not exist, raise a usage error with a meaningful message.
+        """
+        if section not in rules:
+            LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
+                      section, str(self.configfile))
+            raise UsageError("Syntax error in tokenizer configuration file.")
+
+        return rules[section]
+
+
+    def _cfg_to_icu_rules(self, rules, section):
+        """ Load an ICU ruleset from the given section. If the section is a
+            simple string, it is interpreted as a file name and the rules are
+            loaded verbatim from the given file. The filename is expected to be
+            relative to the tokenizer rule file. If the section is a list then
+            each line is assumed to be a rule. All rules are concatenated and returned.
+        """
+        content = self._get_section(rules, section)
+
+        if isinstance(content, str):
+            return (self.configfile.parent / content).read_text().replace('\n', ' ')
+
+        return ';'.join(content) + ';'
+
+
+    def _parse_compound_suffix_list(self, rules):
+        if not rules:
+            self.compound_suffixes = set()
+            return
+
+        norm = Transliterator.createFromRules("rule_loader_normalization",
+                                              self.normalization_rules)
+
+        # Make sure all suffixes are in their normalised form.
+        self.compound_suffixes = set((norm.transliterate(s) for s in rules))
+
+
+    def _parse_abbreviation_list(self, rules):
+        self.abbreviations = defaultdict(list)
+
+        if not rules:
+            return
+
+        norm = Transliterator.createFromRules("rule_loader_normalization",
+                                              self.normalization_rules)
+
+        for rule in rules:
+            parts = rule.split('=>')
+            if len(parts) != 2:
+                LOG.fatal("Syntax error in abbreviation section, line: %s", rule)
+                raise UsageError("Syntax error in tokenizer configuration file.")
+
+            # Make sure all terms match the normalised version.
+            fullterms = (norm.transliterate(t.strip()) for t in parts[0].split(','))
+            abbrterms = (norm.transliterate(t.strip()) for t in parts[1].split(','))
+
+            for full, abbr in itertools.product(fullterms, abbrterms):
+                self.abbreviations[full].append(abbr)
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 689318d7..eb850237 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -18,11 +18,11 @@ import psycopg2.extras
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 
-DBCFG_NORMALIZATION = "tokenizer_normalization"
 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
-DBCFG_TRANSLITERATION = "tokenizer_transliteration"
-DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"
+DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
 
 LOG = logging.getLogger()
 
@@ -41,9 +41,9 @@ class LegacyICUTokenizer:
     def __init__(self, dsn, data_dir):
         self.dsn = dsn
         self.data_dir = data_dir
-        self.normalization = None
-        self.transliteration = None
-        self.abbreviations = None
+        self.naming_rules = None
+        self.term_normalization = None
+        self.max_word_frequency = None
 
 
     def init_new_db(self, config, init_db=True):
@@ -55,14 +55,14 @@ class LegacyICUTokenizer:
         if config.TOKENIZER_CONFIG:
             cfgfile = Path(config.TOKENIZER_CONFIG)
         else:
-            cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
+            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
 
-        rules = json.loads(cfgfile.read_text())
-        self._load_transliteration(rules['normalization'], cfgfile.parent)
-        self.abbreviations = rules["abbreviations"]
-        self.normalization = config.TERM_NORMALIZATION
+        loader = ICURuleLoader(cfgfile)
+        self.naming_rules = ICUNameProcessorRules(loader=loader)
+        self.term_normalization = config.TERM_NORMALIZATION
+        self.max_word_frequency = config.MAX_WORD_FREQUENCY
 
-        self._install_php(config)
+        self._install_php(config.lib_dir.php)
         self._save_config(config)
 
         if init_db:
@@ -70,19 +70,13 @@ class LegacyICUTokenizer:
             self._init_db_tables(config)
 
 
-    def _load_transliteration(self, rules, cfg_path):
-        if isinstance(rules, str):
-            self.transliteration = (cfg_path / rules).read_text().replace('\n', ' ')
-        else:
-            self.transliteration = ';'.join(rules) + ';'
-
     def init_from_project(self):
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
-            self.normalization = get_property(conn, DBCFG_NORMALIZATION)
-            self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
-            self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
+            self.naming_rules = ICUNameProcessorRules(conn=conn)
+            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
+            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
 
 
     def finalize_import(self, config):
@@ -132,26 +126,20 @@ class LegacyICUTokenizer:
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
""" - norm = Transliterator.createFromRules("normalizer", self.normalization) - trans = Transliterator.createFromRules("trans", self.transliteration) - return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations) + return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules)) - def _install_php(self, config): + def _install_php(self, phpdir): """ Install the php script for the tokenizer. """ - abbr_inverse = list(zip(*self.abbreviations)) php_file = self.data_dir / "tokenizer.php" php_file.write_text(dedent("""\ 1: - word = self.make_standard_word(brace_split[0]) - if word: - full_names.add(word) + brace_idx = name.find('(') + if brace_idx >= 0: + full_names.add(name[:brace_idx].strip()) return full_names @@ -492,7 +473,7 @@ class LegacyICUNameAnalyzer: postcode = self.normalize_postcode(postcode) if postcode not in self._cache.postcodes: - term = self.make_standard_word(postcode) + term = self.name_processor.get_search_normalized(postcode) if not term: return @@ -508,6 +489,7 @@ class LegacyICUNameAnalyzer: """, (' ' + term, postcode)) self._cache.postcodes.add(postcode) + @staticmethod def _split_housenumbers(hnrs): if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]: @@ -530,7 +512,7 @@ class _TokenInfo: """ Collect token information to be sent back to the database. """ def __init__(self, cache): - self.cache = cache + self._cache = cache self.data = {} @staticmethod @@ -538,86 +520,44 @@ class _TokenInfo: return '{%s}' % ','.join((str(s) for s in tokens)) - def add_names(self, conn, names): + def add_names(self, fulls, partials): """ Adds token information for the normalised names. """ - # Start with all partial names - terms = set((part for ns in names for part in ns.split())) - # Add the full names - terms.update((' ' + n for n in names)) - - self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms)) + self.data['names'] = self._mk_array(itertools.chain(fulls, partials)) def add_housenumbers(self, conn, hnrs): """ Extract housenumber information from a list of normalised housenumbers. """ - self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs)) + self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs)) self.data['hnr'] = ';'.join(hnrs) - def add_street(self, conn, street): + def add_street(self, fulls, partials): """ Add addr:street match terms. """ - if not street: - return - - term = ' ' + street - - tid = self.cache.names.get(term) - - if tid is None: - with conn.cursor() as cur: - cur.execute("""SELECT word_id FROM word - WHERE word_token = %s - and class is null and type is null""", - (term, )) - if cur.rowcount > 0: - tid = cur.fetchone()[0] - self.cache.names[term] = tid - - if tid is not None: - self.data['street'] = '{%d}' % tid + if fulls: + self.data['street'] = self._mk_array(fulls) - def add_place(self, conn, place): + def add_place(self, fulls, partials): """ Add addr:place search and match terms. """ - if not place: - return - - partial_ids = self.cache.get_term_tokens(conn, place.split()) - tid = self.cache.get_term_tokens(conn, [' ' + place]) - - self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid)) - self.data['place_match'] = '{%s}' % tid[0] + if fulls: + self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials)) + self.data['place_match'] = self._mk_array(fulls) - def add_address_terms(self, conn, terms): + def add_address_terms(self, terms): """ Add additional address terms. 
""" tokens = {} - for key, value in terms: - if not value: - continue - partial_ids = self.cache.get_term_tokens(conn, value.split()) - term = ' ' + value - tid = self.cache.names.get(term) - - if tid is None: - with conn.cursor() as cur: - cur.execute("""SELECT word_id FROM word - WHERE word_token = %s - and class is null and type is null""", - (term, )) - if cur.rowcount > 0: - tid = cur.fetchone()[0] - self.cache.names[term] = tid - - tokens[key] = [self._mk_array(partial_ids), - '{%s}' % ('' if tid is None else str(tid))] + for key, fulls, partials in terms: + if fulls: + tokens[key] = [self._mk_array(itertools.chain(fulls, partials)), + self._mk_array(fulls)] if tokens: self.data['addr'] = tokens @@ -635,32 +575,6 @@ class _TokenCache: self.housenumbers = {} - def get_term_tokens(self, conn, terms): - """ Get token ids for a list of terms, looking them up in the database - if necessary. - """ - tokens = [] - askdb = [] - - for term in terms: - token = self.names.get(term) - if token is None: - askdb.append(term) - elif token != 0: - tokens.append(token) - - if askdb: - with conn.cursor() as cur: - cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term", - (askdb, )) - for term, tid in cur: - self.names[term] = tid - if tid != 0: - tokens.append(tid) - - return tokens - - def get_hnr_tokens(self, conn, terms): """ Get token ids for a list of housenumbers, looking them up in the database if necessary. diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index d6fbc2cd..bb37115b 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -404,7 +404,7 @@ class LegacyNameAnalyzer: FROM unnest(%s)n) y WHERE NOT EXISTS(SELECT * FROM word WHERE word_token = lookup_token and country_code = %s)) - """, (country_code, names, country_code)) + """, (country_code, list(names.values()), country_code)) def process_place(self, place): @@ -422,7 +422,7 @@ class LegacyNameAnalyzer: country_feature = place.get('country_feature') if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature): - self.add_country_names(country_feature.lower(), list(names.values())) + self.add_country_names(country_feature.lower(), names) address = place.get('address') diff --git a/nominatim/tools/database_import.py b/nominatim/tools/database_import.py index 28a10ebe..efbf2ec8 100644 --- a/nominatim/tools/database_import.py +++ b/nominatim/tools/database_import.py @@ -272,15 +272,15 @@ def create_country_names(conn, tokenizer, languages=None): with tokenizer.name_analyzer() as analyzer: for code, name in cur: - names = [code] + names = {'countrycode' : code} if code == 'gb': - names.append('UK') + names['short_name'] = 'UK' if code == 'us': - names.append('United States') + names['short_name'] = 'United States' # country names (only in languages as provided) if name: - names.extend((v for k, v in name.items() if _include_key(k))) + names.update(((k, v) for k, v in name.items() if _include_key(k))) analyzer.add_country_names(code, names) diff --git a/settings/legacy_icu_tokenizer.yaml b/settings/legacy_icu_tokenizer.yaml new file mode 100644 index 00000000..34cd8b0b --- /dev/null +++ b/settings/legacy_icu_tokenizer.yaml @@ -0,0 +1,116 @@ +normalization: + - ":: NFD ()" + - "[[:Nonspacing Mark:] [:Cf:]] >" + - ":: lower ()" + - "ß > 'ss'" # German szet is unimbigiously equal to double ss + - "[[:Punctuation:][:Space:]]+ > ' '" + - ":: NFC ()" +transliteration: icu_transliteration.rules +compound_suffixes: + # 
+    - hal
+    - hallen
+    - hallerne
+    # German
+    - berg
+    - brücke
+    - fabrik
+    - gasse
+    - graben
+    - haus
+    - höhle
+    - hütte
+    - kapelle
+    - kogel
+    - pfad
+    - platz
+    - quelle
+    - spitze
+    - stiege
+    - strasse
+    - teich
+    - universität
+    - wald
+    - weg
+    - wiese
+    # Dutch
+    - gracht
+    - laan
+    - markt
+    - plein
+    - straat
+    - vliet
+    - weg
+    # Norwegian
+    - vei
+    - veien
+    - veg
+    - vegen
+    - gate
+    - gaten
+    - gata
+    - plass
+    - plassen
+    - sving
+    - svingen
+    # Finnish
+    - alue
+    - asema
+    - aukio
+    - kaari
+    - katu
+    - kuja
+    - kylä
+    - penger
+    - polku
+    - puistikko
+    - puisto
+    - raitti
+    - ranta
+    - rinne
+    - taival
+    - tie
+    - tori
+    - väylä
+    # Swedish
+    - väg
+    - vägen
+    - gatan
+    - gata
+    - gränd
+    - gränden
+    - stig
+    - stigen
+    - plats
+    - platsen
+abbreviations:
+    # German
+    - am => a
+    - an der => a d
+    - allgemeines krankenhaus => akh
+    - altstoffsammelzentrum => asz
+    - auf der => a d
+    - bach => b
+    - bad => b
+    - bahnhof => bhf,bf
+    - berg => bg
+    - bezirk => bez
+    - brücke => br
+    - burg => bg
+    - chaussee => ch
+    - deutsche,deutscher,deutsches => dt
+    - dorf => df
+    - doktor => dr
+    - fachhochschule => fh
+    - Freiwillige Feuerwehr => ff
+    - sankt => st
+    - strasse => str
+    - weg => wg
+    # English
+    - alley => al
+    - beach => bch
+    - street => st
+    - road => rd
+    - bridge => brdg
+
+
diff --git a/test/python/test_tokenizer_icu_name_processor.py b/test/python/test_tokenizer_icu_name_processor.py
new file mode 100644
index 00000000..9c09bccc
--- /dev/null
+++ b/test/python/test_tokenizer_icu_name_processor.py
@@ -0,0 +1,60 @@
+"""
+Tests for import name normalisation and variant generation.
+"""
+from textwrap import dedent
+
+import pytest
+
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
+
+from nominatim.errors import UsageError
+
+@pytest.fixture
+def cfgfile(tmp_path, suffix='.yaml'):
+    def _create_config(suffixes, abbr):
+        content = dedent("""\
+        normalization:
+            - ":: NFD ()"
+            - "[[:Nonspacing Mark:] [:Cf:]] >"
+            - ":: lower ()"
+            - "[[:Punctuation:][:Space:]]+ > ' '"
+            - ":: NFC ()"
+        transliteration:
+            - ":: Latin ()"
+        """)
+        content += "compound_suffixes:\n"
+        content += '\n'.join((" - " + s for s in suffixes)) + '\n'
+        content += "abbreviations:\n"
+        content += '\n'.join((" - " + s for s in abbr)) + '\n'
+        fpath = tmp_path / ('test_config' + suffix)
+        fpath.write_text(dedent(content))
+        return fpath
+
+    return _create_config
+
+
+def test_simple_variants(cfgfile):
+    fpath = cfgfile(['strasse', 'straße', 'weg'],
+                    ['strasse,straße => str',
+                     'prospekt => pr'])
+
+    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
+    proc = ICUNameProcessor(rules)
+
+    # Normalise first, then expand variants (ICUNameProcessor offers no combined helper).
+    assert set(proc.get_variants_ascii(proc.get_normalized("Bauwegstraße"))) \
+            == {'bauweg straße', 'bauweg str'}
+    assert proc.get_variants_ascii(proc.get_normalized("Bauwegstr")) == ['bauweg str']
+    assert proc.get_variants_ascii(proc.get_normalized("holzweg")) == ['holz weg']
+    assert proc.get_variants_ascii(proc.get_normalized("hallo")) == ['hallo']
+
+
+def test_multiple_replacements(cfgfile):
+    fpath = cfgfile([], ['saint => s,st', 'street => st'])
+
+    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
+    proc = ICUNameProcessor(rules)
+
+    assert set(proc.get_variants_ascii(proc.get_normalized("Saint Johns Street"))) == \
+           {'saint johns street', 's johns street', 'st johns street',
+            'saint johns st', 's johns st', 'st johns st'}
diff --git a/test/python/test_tokenizer_icu_rule_loader.py b/test/python/test_tokenizer_icu_rule_loader.py
new file mode 100644
index 00000000..d89e13b5
--- /dev/null
+++ b/test/python/test_tokenizer_icu_rule_loader.py
@@ -0,0 +1,75 @@
+"""
+Tests for converting a config file to ICU rules.
+"""
+import pytest
+from textwrap import dedent
+
+from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
+from nominatim.errors import UsageError
+
+from icu import Transliterator
+
+@pytest.fixture
+def cfgfile(tmp_path, suffix='.yaml'):
+    def _create_config(suffixes, abbr):
+        content = dedent("""\
+        normalization:
+            - ":: NFD ()"
+            - "[[:Nonspacing Mark:] [:Cf:]] >"
+            - ":: lower ()"
+            - "[[:Punctuation:][:Space:]]+ > ' '"
+            - ":: NFC ()"
+        transliteration:
+            - ":: Latin ()"
+        """)
+        content += "compound_suffixes:\n"
+        content += '\n'.join((" - " + s for s in suffixes)) + '\n'
+        content += "abbreviations:\n"
+        content += '\n'.join((" - " + s for s in abbr)) + '\n'
+        fpath = tmp_path / ('test_config' + suffix)
+        fpath.write_text(dedent(content))
+        return fpath
+
+    return _create_config
+
+
+def test_missing_normalization(tmp_path):
+    fpath = tmp_path / ('test_config.yaml')
+    fpath.write_text(dedent("""\
+        normalizatio:
+            - ":: NFD ()"
+        """))
+
+    with pytest.raises(UsageError):
+        ICURuleLoader(fpath)
+
+
+def test_get_search_rules(cfgfile):
+    fpath = cfgfile(['strasse', 'straße', 'weg'],
+                    ['strasse,straße => str',
+                     'prospekt => pr'])
+
+    loader = ICURuleLoader(fpath)
+
+    rules = loader.get_search_rules()
+    trans = Transliterator.createFromRules("test", rules)
+
+    assert trans.transliterate(" Baumstraße ") == " baum straße "
+    assert trans.transliterate(" Baumstrasse ") == " baum strasse "
+    assert trans.transliterate(" Baumstr ") == " baum str "
+    assert trans.transliterate(" Baumwegstr ") == " baumweg str "
+    assert trans.transliterate(" Αθήνα ") == " athēna "
+    assert trans.transliterate(" проспект ") == " prospekt "
+
+
+def test_get_synonym_pairs(cfgfile):
+    fpath = cfgfile(['Weg', 'Strasse'],
+                    ['Strasse => str,st'])
+
+    loader = ICURuleLoader(fpath)
+
+    repl = loader.get_replacement_pairs()
+
+    # Longer sequences must come first so that they match with higher priority.
+    assert [k for k, _ in repl][:2] == [' strasse ', 'strasse ']
+
+    # get_replacement_pairs() returns lists of variants whose internal order is
+    # not defined, so compare keys and variant sets rather than the raw list.
+    assert dict((k, set(v)) for k, v in repl) == \
+           {' strasse ': {' strasse ', ' str ', ' st '},
+            'strasse ': {' strasse ', ' str ', ' st '},
+            'str ': {' str '},
+            'st ': {' st '},
+            'weg ': {' weg '}}
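
A quick usage sketch (not part of the patch) of how the new pieces fit together: ICURuleLoader compiles the YAML configuration, ICUNameProcessorRules bundles the resulting rules, and ICUNameProcessor produces the normalised name, the spelling variants that feed getorcreate_full_word(), and the search-time form used by the PHP side. The config path and the presence of the referenced icu_transliteration.rules file next to it are assumptions about the checkout.

from pathlib import Path

from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules

# Hypothetical path - any config laid out like settings/legacy_icu_tokenizer.yaml works.
loader = ICURuleLoader(Path('settings/legacy_icu_tokenizer.yaml'))
rules = ICUNameProcessorRules(loader=loader)    # could also be rebuilt later via conn=...
proc = ICUNameProcessor(rules)

norm = proc.get_normalized("Brückenstraße 3")         # import-side normalisation
variants = proc.get_variants_ascii(norm)              # decomposed/abbreviated spellings
query = proc.get_search_normalized("Brückenstr 3")    # form used at query time
print(norm, variants, query)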