diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py
index 361b67d4..b3e9c4c7 100644
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -11,6 +11,7 @@ from nominatim.db.properties import set_property, get_property
 from nominatim.errors import UsageError
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
+import nominatim.tools.country_info
 
 LOG = logging.getLogger()
 
@@ -38,6 +39,9 @@ class ICURuleLoader:
         rules = config.load_sub_configuration('icu_tokenizer.yaml',
                                               config='TOKENIZER_CONFIG')
 
+        # Make sure country information is available to analyzers and sanitizers.
+        nominatim.tools.country_info.setup_country_config(config)
+
         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
         self.analysis_rules = _get_section(rules, 'token-analysis')
diff --git a/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
new file mode 100644
index 00000000..c98c825d
--- /dev/null
+++ b/nominatim/tokenizer/sanitizers/tag_analyzer_by_language.py
@@ -0,0 +1,100 @@
+"""
+Name processor for tagging the language of the name.
+"""
+import re
+
+from nominatim.tools import country_info
+
+class _AnalyzerByLanguage:
+    """ Processor for tagging the language of names in a place.
+    """
+
+    def __init__(self, config):
+        if 'filter-kind' in config:
+            self.regexes = [re.compile(regex) for regex in config['filter-kind']]
+        else:
+            self.regexes = None
+
+        self.use_defaults = config.get('use-defaults', 'no')
+        if self.use_defaults not in ('mono', 'all'):
+            self.use_defaults = False
+
+        self.replace = config.get('mode', 'replace') != 'append'
+        self.whitelist = config.get('whitelist')
+
+        # Compute the languages to use when no suffix is given.
+        self.deflangs = {}
+        for ccode, prop in country_info.iterate():
+            clangs = prop['languages']
+            if len(clangs) == 1 or self.use_defaults == 'all':
+                if self.whitelist:
+                    self.deflangs[ccode] = [l for l in clangs if l in self.whitelist]
+                else:
+                    self.deflangs[ccode] = clangs
+
+
+    def _kind_matches(self, kind):
+        if self.regexes is None:
+            return True
+
+        return any(regex.search(kind) for regex in self.regexes)
+
+
+    def _suffix_matches(self, suffix):
+        if self.whitelist is None:
+            return len(suffix) in (2, 3) and suffix.islower()
+
+        return suffix in self.whitelist
+
+
+    def __call__(self, obj):
+        if not obj.names:
+            return
+
+        more_names = []
+
+        for name in (n for n in obj.names
+                     if not n.has_attr('analyzer') and self._kind_matches(n.kind)):
+            if name.suffix:
+                langs = [name.suffix] if self._suffix_matches(name.suffix) else None
+            else:
+                langs = None
+                if self.use_defaults:
+                    langs = self.deflangs.get(obj.place.country_code)
+                    if langs and self.use_defaults == 'mono' and len(langs) > 1:
+                        langs = None
+
+            if langs:
+                if self.replace:
+                    name.set_attr('analyzer', langs[0])
+                else:
+                    more_names.append(name.clone(attr={'analyzer': langs[0]}))
+
+                more_names.extend(name.clone(attr={'analyzer': l}) for l in langs[1:])
+
+        obj.names.extend(more_names)
+
+
+def create(config):
+    """ Create a function that sets the analyzer property depending on the
+        language of the tag. The language is taken from the suffix.
+
+        To restrict the set of languages that should be tagged, use
+        'whitelist', a list of acceptable suffixes. When unset, all
+        lower-case 2- and 3-letter codes are accepted.
+
+        'use-defaults' configures what happens when the name has no
+        language suffix. When set to 'all', a variant is created for
+        each of the languages spoken in the country the feature is in.
+        When set to 'mono', a variant is created only when a single
+        language is spoken in the country. The default is to do nothing
+        with the default languages of a country.
+
+        'mode' may be 'replace' (the default) or 'append' and configures
+        whether the original name (without any analyzer tagged) is retained.
+
+        With 'filter-kind' the set of names the sanitizer should be applied
+        to can be restricted to the given patterns of 'kind'. It expects a
+        list of regular expressions to be matched against 'kind'.
+    """
+    return _AnalyzerByLanguage(config)
diff --git a/nominatim/tokenizer/token_analysis/generic.py b/nominatim/tokenizer/token_analysis/generic.py
index c904d87d..b8cfde39 100644
--- a/nominatim/tokenizer/token_analysis/generic.py
+++ b/nominatim/tokenizer/token_analysis/generic.py
@@ -18,7 +18,19 @@ ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
 def configure(rules, normalization_rules):
     """ Extract and preprocess the configuration for this module.
     """
-    rules = rules.get('variants')
+    config = {}
+
+    config['replacements'], config['chars'] = _get_variant_config(rules.get('variants'),
+                                                                  normalization_rules)
+    config['variant_only'] = rules.get('mode', '') == 'variant-only'
+
+    return config
+
+
+def _get_variant_config(rules, normalization_rules):
+    """ Convert the variant definition from the configuration into
+        replacement sets.
+    """
     immediate = defaultdict(list)
     chars = set()
 
@@ -41,8 +53,7 @@ def configure(rules, normalization_rules):
             immediate[variant.source].append(replstr)
             chars.update(variant.source)
 
-    return {'replacements': list(immediate.items()),
-            'chars': ''.join(chars)}
+    return list(immediate.items()), ''.join(chars)
 
 
 class _VariantMaker:
@@ -144,11 +155,15 @@ class GenericTokenAnalysis:
 
     def __init__(self, to_ascii, config):
         self.to_ascii = to_ascii
+        self.variant_only = config['variant_only']
 
         # Set up datrie
-        self.replacements = datrie.Trie(config['chars'])
-        for src, repllist in config['replacements']:
-            self.replacements[src] = repllist
+        if config['replacements']:
+            self.replacements = datrie.Trie(config['chars'])
+            for src, repllist in config['replacements']:
+                self.replacements[src] = repllist
+        else:
+            self.replacements = None
 
 
     def get_variants_ascii(self, norm_name):
@@ -159,45 +174,51 @@ class GenericTokenAnalysis:
 
         partials = ['']
         startpos = 0
-        pos = 0
-        force_space = False
-        while pos < len(baseform):
-            full, repl = self.replacements.longest_prefix_item(baseform[pos:],
-                                                               (None, None))
-            if full is not None:
-                done = baseform[startpos:pos]
-                partials = [v + done + r
-                            for v, r in itertools.product(partials, repl)
-                            if not force_space or r.startswith(' ')]
-                if len(partials) > 128:
-                    # If too many variants are produced, they are unlikely
-                    # to be helpful. Only use the original term.
-                    startpos = 0
-                    break
-                startpos = pos + len(full)
-                if full[-1] == ' ':
-                    startpos -= 1
-                    force_space = True
-                pos = startpos
-            else:
-                pos += 1
-                force_space = False
+        if self.replacements is not None:
+            pos = 0
+            force_space = False
+            while pos < len(baseform):
+                full, repl = self.replacements.longest_prefix_item(baseform[pos:],
+                                                                   (None, None))
+                if full is not None:
+                    done = baseform[startpos:pos]
+                    partials = [v + done + r
+                                for v, r in itertools.product(partials, repl)
+                                if not force_space or r.startswith(' ')]
+                    if len(partials) > 128:
+                        # If too many variants are produced, they are unlikely
+                        # to be helpful. Only use the original term.
+                        startpos = 0
+                        break
+                    startpos = pos + len(full)
+                    if full[-1] == ' ':
+                        startpos -= 1
+                        force_space = True
+                    pos = startpos
+                else:
+                    pos += 1
+                    force_space = False
 
         # No variants detected? Fast return.
         if startpos == 0:
+            if self.variant_only:
+                return []
+
             trans_name = self.to_ascii.transliterate(norm_name).strip()
             return [trans_name] if trans_name else []
 
-        return self._compute_result_set(partials, baseform[startpos:])
+        return self._compute_result_set(partials, baseform[startpos:],
+                                        norm_name if self.variant_only else '')
 
 
-    def _compute_result_set(self, partials, prefix):
+    def _compute_result_set(self, partials, prefix, exclude):
         results = set()
 
         for variant in partials:
-            vname = variant + prefix
-            trans_name = self.to_ascii.transliterate(vname[1:-1]).strip()
-            if trans_name:
-                results.add(trans_name)
+            vname = (variant + prefix)[1:-1].strip()
+            if vname != exclude:
+                trans_name = self.to_ascii.transliterate(vname).strip()
+                if trans_name:
+                    results.add(trans_name)
 
         return list(results)
diff --git a/nominatim/tools/country_info.py b/nominatim/tools/country_info.py
index e04a8693..635d1584 100644
--- a/nominatim/tools/country_info.py
+++ b/nominatim/tools/country_info.py
@@ -13,12 +13,21 @@ class _CountryInfo:
     def __init__(self):
         self._info = {}
 
+
     def load(self, config):
         """ Load the country properties from the configuration files,
            if they are not loaded yet.
        """
        if not self._info:
            self._info = config.load_sub_configuration('country_settings.yaml')
+            # Convert languages into a list for simpler handling.
+            for prop in self._info.values():
+                if 'languages' not in prop:
+                    prop['languages'] = []
+                elif not isinstance(prop['languages'], list):
+                    prop['languages'] = [x.strip()
+                                         for x in prop['languages'].split(',')]
+
 
     def items(self):
         """ Return tuples of (country_code, property dict) as iterable.
@@ -36,6 +45,12 @@ def setup_country_config(config):
     _COUNTRY_INFO.load(config)
 
 
+def iterate():
+    """ Iterate over country codes and their properties.
+    """
+    return _COUNTRY_INFO.items()
+
+
 def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
     """ Create and populate the tables with basic static data that
         provides the background for geocoding. Data is assumed to not yet exist.
@@ -50,10 +65,7 @@ def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
             partition = 0
         else:
             partition = props.get('partition')
-        if ',' in (props.get('languages', ',') or ','):
-            lang = None
-        else:
-            lang = props['languages']
+        lang = props['languages'][0] if len(props['languages']) == 1 else None
         params.append((ccode, partition, lang))
 
     with connect(dsn) as conn:
diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml
index 77b137a1..dcbb1847 100644
--- a/settings/country_settings.yaml
+++ b/settings/country_settings.yaml
@@ -171,7 +171,7 @@ bt:
 # (Bouvet Island)
 bv:
     partition: 185
-    languages: no
+    languages: "no"
 
 # Botswana (Botswana)
 bw:
@@ -1006,7 +1006,7 @@ si:
 # (Svalbard and Jan Mayen)
 sj:
     partition: 197
-    languages: no
+    languages: "no"
 
 # Slovakia (Slovensko)
 sk:
diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml
index d070adcb..41760c49 100644
--- a/settings/icu_tokenizer.yaml
+++ b/settings/icu_tokenizer.yaml
@@ -27,36 +27,160 @@ transliteration:
 sanitizers:
     - step: split-name-list
    - step: strip-brace-terms
+    - step: tag-analyzer-by-language
+      filter-kind: [".*name.*"]
+      whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,"no",pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
+      use-defaults: all
+      mode: append
 
 token-analysis:
     - analyzer: generic
+    - id: bg
+      analyzer: generic
+      mode: variant-only
       variants:
           - !include icu-rules/variants-bg.yaml
+    - id: ca
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-ca.yaml
+    - id: cs
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-cs.yaml
+    - id: da
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-da.yaml
+    - id: de
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-de.yaml
+    - id: el
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-el.yaml
+    - id: en
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-en.yaml
+    - id: es
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-es.yaml
+    - id: et
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-et.yaml
+    - id: eu
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-eu.yaml
+    - id: fi
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-fi.yaml
+    - id: fr
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-fr.yaml
+    - id: gl
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-gl.yaml
+    - id: hu
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-hu.yaml
+    - id: it
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-it.yaml
+    - id: ja
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-ja.yaml
+    - id: mg
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-mg.yaml
+    - id: ms
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-ms.yaml
+    - id: nl
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-nl.yaml
+    - id: "no"
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-no.yaml
+    - id: pl
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-pl.yaml
+    - id: pt
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-pt.yaml
+    - id: ro
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-ro.yaml
+    - id: ru
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-ru.yaml
+    - id: sk
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-sk.yaml
+    - id: sl
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-sl.yaml
+    - id: sv
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-sv.yaml
+    - id: tr
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-tr.yaml
+    - id: uk
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-uk.yaml
+    - id: vi
+      analyzer: generic
+      mode: variant-only
+      variants:
           - !include icu-rules/variants-vi.yaml
diff --git a/test/bdd/db/query/normalization.feature b/test/bdd/db/query/normalization.feature
index b8a760f9..deaa635e 100644
--- a/test/bdd/db/query/normalization.feature
+++ b/test/bdd/db/query/normalization.feature
@@ -52,7 +52,7 @@ Feature: Import and search of names
 
     Scenario: Special characters in name
         Given the places
-            | osm | class | type     | name |
+            | osm | class | type     | name+name:de |
             | N1  | place | locality | Jim-Knopf-Straße |
             | N2  | place | locality | Smith/Weston |
             | N3  | place | locality | space mountain |
diff --git a/test/python/tokenizer/token_analysis/test_generic.py b/test/python/tokenizer/token_analysis/test_generic.py
index f0ce4208..a9b09ea4 100644
--- a/test/python/tokenizer/token_analysis/test_generic.py
+++ b/test/python/tokenizer/token_analysis/test_generic.py
@@ -40,7 +40,7 @@ def cfgfile(def_config, tmp_path):
 
 
 def get_normalized_variants(proc, name):
-    return proc.get_variants_ascii(proc.get_normalized(name))
+    return proc.analysis[None].get_variants_ascii(proc.normalizer.transliterate(name).strip())
 
 
 def test_variants_empty(cfgfile):
@@ -99,6 +99,6 @@ def test_search_normalized(cfgfile):
     config = cfgfile('~street => s,st', 'master => mstr')
 
     proc = ICURuleLoader(config).make_token_analysis()
-    assert proc.get_search_normalized('Master Street') == 'master street'
-    assert proc.get_search_normalized('Earnes St') == 'earnes st'
-    assert proc.get_search_normalized('Nostreet') == 'nostreet'
+    assert proc.search.transliterate('Master Street').strip() == 'master street'
+    assert proc.search.transliterate('Earnes St').strip() == 'earnes st'
+    assert proc.search.transliterate('Nostreet').strip() == 'nostreet'
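
To see what the new sanitizer does to a place's name list, the standalone sketch below re-implements just the decision logic of tag_analyzer_by_language.py from this patch. The Name class and the hard-coded WHITELIST and DEFLANGS data are invented stand-ins for nominatim's sanitizer datatypes, icu_tokenizer.yaml and country_settings.yaml; only the suffix/whitelist/default-language decisions mirror the code above.

# Sketch only: Name, WHITELIST and DEFLANGS are hypothetical stand-ins,
# not nominatim APIs. The tagging decisions follow the sanitizer in this diff.

class Name:
    """Minimal stand-in for nominatim's sanitizer name object."""
    def __init__(self, name, kind='name', suffix=None):
        self.name, self.kind, self.suffix = name, kind, suffix
        self.attrs = {}

    def clone(self, attr):
        copy = Name(self.name, self.kind, self.suffix)
        copy.attrs = {**self.attrs, **attr}
        return copy

WHITELIST = {'de', 'en', 'fr', 'it'}     # subset of the yaml whitelist
DEFLANGS = {'ch': ['de', 'fr', 'it']}    # sample country data, invented

def tag_by_language(names, country_code, mode='append', use_defaults='all'):
    """Assign an 'analyzer' attribute the way the sanitizer does."""
    more_names = []
    for name in (n for n in names if 'analyzer' not in n.attrs):
        if name.suffix:                  # e.g. 'name:de' -> suffix 'de'
            langs = [name.suffix] if name.suffix in WHITELIST else None
        else:                            # no suffix: fall back to country defaults
            langs = DEFLANGS.get(country_code) if use_defaults else None
        if langs:
            if mode == 'replace':
                name.attrs['analyzer'] = langs[0]
            else:                        # 'append' keeps the untagged original
                more_names.append(name.clone({'analyzer': langs[0]}))
            more_names.extend(name.clone({'analyzer': l}) for l in langs[1:])
    names.extend(more_names)

names = [Name('Hauptstrasse', suffix='de'), Name('Zürich')]
tag_by_language(names, 'ch')
print([(n.name, n.attrs) for n in names])
# [('Hauptstrasse', {}), ('Zürich', {}),
#  ('Hauptstrasse', {'analyzer': 'de'}), ('Zürich', {'analyzer': 'de'}),
#  ('Zürich', {'analyzer': 'fr'}), ('Zürich', {'analyzer': 'it'})]

With the shipped configuration (use-defaults: all, mode: append), this is why the BDD test above can now search a German-tagged name: each name gains per-language clones whose analyzer attribute selects the matching variant-only token-analysis module, while the original untagged name is still handled by the default generic analyzer.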