apply variants by languages

Adds a tagger for names by language so that the analyzer of that
language is used. Thus variants are now only applied to names
in the specific language and only tag name tags, no longer to
reference-like tags.
This commit is contained in:
Sarah Hoffmann
2021-10-05 17:18:10 +02:00
parent d35400a7d7
commit 97a10ec218
8 changed files with 307 additions and 46 deletions

View File

@@ -11,6 +11,7 @@ from nominatim.db.properties import set_property, get_property
from nominatim.errors import UsageError
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis
import nominatim.tools.country_info
LOG = logging.getLogger()
@@ -38,6 +39,9 @@ class ICURuleLoader:
rules = config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG')
# Make sure country information is available to analyzers and sanatizers.
nominatim.tools.country_info.setup_country_config(config)
self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
self.analysis_rules = _get_section(rules, 'token-analysis')