use the analyser provided in the 'analyzer' property

Implements per-name choice of analyzer. If a non-default
analyzer is chosen, then the 'word' identifier is extended
with the name of the analyzer, so that we still have unique
items.
Sarah Hoffmann
2021-10-05 14:10:32 +02:00
parent 92f6ec2328
commit d35400a7d7
6 changed files with 50 additions and 51 deletions
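
The identifier scheme is easiest to see in isolation. Below is a minimal
sketch of the rule described above, mirroring the token_id construction in
the LegacyICUNameAnalyzer hunk further down; the helper name make_token_id
is made up for illustration:

    def make_token_id(norm_name, analyzer_id=None):
        """ Build the unique 'word' identifier: the plain normalized name
            for the default analyzer, name@analyzer for a named one.
        """
        if analyzer_id is None:
            return norm_name
        return f'{norm_name}@{analyzer_id}'

    assert make_token_id('hauptstrasse') == 'hauptstrasse'
    assert make_token_id('hauptstrasse', 'de') == 'hauptstrasse@de'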

View File

@@ -10,6 +10,7 @@ from nominatim.config import flatten_config_list
 from nominatim.db.properties import set_property, get_property
 from nominatim.errors import UsageError
 from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.tokenizer.icu_token_analysis import ICUTokenAnalysis

 LOG = logging.getLogger()
@@ -74,8 +75,8 @@ class ICURuleLoader:
     def make_token_analysis(self):
         """ Create a token analyser from the previously loaded rules.
         """
-        return self.analysis[None].create(self.normalization_rules,
-                                          self.transliteration_rules)
+        return ICUTokenAnalysis(self.normalization_rules,
+                                self.transliteration_rules, self.analysis)

     def get_search_rules(self):
@@ -149,15 +150,7 @@ class TokenAnalyzerRule:
         module_name = 'nominatim.tokenizer.token_analysis.' \
                       + _get_section(rules, 'analyzer').replace('-', '_')
         analysis_mod = importlib.import_module(module_name)
-        self._mod_create = analysis_mod.create
+        self.create = analysis_mod.create

         # Load the configuration.
         self.config = analysis_mod.configure(rules, normalization_rules)

-    def create(self, normalization_rules, transliteration_rules):
-        """ Create an analyzer from the given rules.
-        """
-        return self._mod_create(normalization_rules,
-                                transliteration_rules,
-                                self.config)
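
Design note: TokenAnalyzerRule now merely loads the analysis module and keeps
its create function and configuration; actual instantiation moves to the new
ICUTokenAnalysis container, which builds all configured analyzers at once and
shares a single set of transliterators between them.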

View File

@@ -0,0 +1,23 @@
+"""
+Container class collecting all components required to transform an OSM name
+into a Nominatim token.
+"""
+from icu import Transliterator
+
+
+class ICUTokenAnalysis:
+    """ Container class collecting the transliterators and token analysis
+        modules for a single NameAnalyser instance.
+    """
+
+    def __init__(self, norm_rules, trans_rules, analysis_rules):
+        self.normalizer = Transliterator.createFromRules("icu_normalization",
+                                                         norm_rules)
+        trans_rules += ";[:Space:]+ > ' '"
+        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
+                                                       trans_rules)
+        self.search = Transliterator.createFromRules("icu_search",
+                                                     norm_rules + trans_rules)
+
+        self.analysis = {name: arules.create(self.to_ascii, arules.config)
+                         for name, arules in analysis_rules.items()}
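
For reference, a runnable sketch of how the three transliterators divide the
work, using toy ICU rules in place of the real ones from the tokenizer
configuration:

    from icu import Transliterator

    norm_rules = '::Lower;'         # stand-in for the real normalization rules
    trans_rules = '::Latin-ASCII;'  # stand-in for the real transliteration rules

    normalizer = Transliterator.createFromRules("icu_normalization", norm_rules)
    to_ascii = Transliterator.createFromRules("icu_to_ascii",
                                              trans_rules + ";[:Space:]+ > ' '")
    search = Transliterator.createFromRules("icu_search",
                                            norm_rules + trans_rules)

    print(normalizer.transliterate('Bahnhofstraße'))  # lower-cased, still Unicode
    print(to_ascii.transliterate('bahnhofstraße'))    # 'bahnhofstrasse' (ASCII)
    print(search.transliterate('Bahnhofstraße'))      # both steps combined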

View File

@@ -164,7 +164,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """ Count the partial terms from the names in the place table.
         """
         words = Counter()
-        name_proc = self.loader.make_token_analysis()
+        analysis = self.loader.make_token_analysis()

         with conn.cursor(name="words") as cur:
             cur.execute(""" SELECT v, count(*) FROM
@@ -172,12 +172,10 @@ class LegacyICUTokenizer(AbstractTokenizer):
                             WHERE length(v) < 75 GROUP BY v""")

             for name, cnt in cur:
-                terms = set()
-                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                    if ' ' in word:
-                        terms.update(word.split())
-                for term in terms:
-                    words[term] += cnt
+                word = analysis.search.transliterate(name)
+                if word and ' ' in word:
+                    for term in set(word.split()):
+                        words[term] += cnt

         return words
@@ -209,14 +207,14 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def _search_normalized(self, name):
         """ Return the search token transliteration of the given name.
         """
-        return self.token_analysis.get_search_normalized(name)
+        return self.token_analysis.search.transliterate(name).strip()

     def _normalized(self, name):
         """ Return the normalized version of the given name with all
             non-relevant information removed.
         """
-        return self.token_analysis.get_normalized(name)
+        return self.token_analysis.normalizer.transliterate(name).strip()

     def get_word_token_info(self, words):
@@ -456,6 +454,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         if addr_terms:
             token_info.add_address_terms(addr_terms)

+
     def _compute_partial_tokens(self, name):
         """ Normalize the given term, split it into partial words and return
             the token list for them.
@@ -492,19 +491,25 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         partial_tokens = set()
         for name in names:
+            analyzer_id = name.get_attr('analyzer')
             norm_name = self._normalized(name.name)
-            full, part = self._cache.names.get(norm_name, (None, None))
+            if analyzer_id is None:
+                token_id = norm_name
+            else:
+                token_id = f'{norm_name}@{analyzer_id}'
+
+            full, part = self._cache.names.get(token_id, (None, None))
             if full is None:
-                variants = self.token_analysis.get_variants_ascii(norm_name)
+                variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
                 if not variants:
                     continue

                 with self.conn.cursor() as cur:
                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
-                                (norm_name, variants))
+                                (token_id, variants))
                     full, part = cur.fetchone()

-                self._cache.names[norm_name] = (full, part)
+                self._cache.names[token_id] = (full, part)

             full_tokens.add(full)
             partial_tokens.update(part)
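
The reworked partial-word counting from the first hunks of this file can be
tried standalone. The toy rules and sample rows below are made up; the loop
body is taken directly from the diff:

    from collections import Counter
    from icu import Transliterator

    search = Transliterator.createFromRules("icu_search",
                                            '::Lower; ::Latin-ASCII;')

    words = Counter()
    for name, cnt in [('Bahnhof Straße', 2), ('Berlin', 5)]:
        word = search.transliterate(name)
        if word and ' ' in word:            # only multi-word names yield partials
            for term in set(word.split()):
                words[term] += cnt

    # words == Counter({'bahnhof': 2, 'strasse': 2}); 'Berlin' adds nothing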

View File

@@ -24,7 +24,6 @@ def create(func):
             new_names = []
             for name in obj.names:
                 split_names = regexp.split(name.name)
-                print(split_names)
                 if len(split_names) == 1:
                     new_names.append(name)
                 else:

View File

@@ -131,10 +131,10 @@ def _create_variants(src, preflag, postflag, repl, decompose):
 ### Analysis section

-def create(norm_rules, trans_rules, config):
+def create(trans_rules, config):
     """ Create a new token analysis instance for this module.
     """
-    return GenericTokenAnalysis(norm_rules, trans_rules, config)
+    return GenericTokenAnalysis(trans_rules, config)


 class GenericTokenAnalysis:
@@ -142,14 +142,8 @@ class GenericTokenAnalysis:
         and provides the functions to apply the transformations.
     """

-    def __init__(self, norm_rules, trans_rules, config):
-        self.normalizer = Transliterator.createFromRules("icu_normalization",
-                                                         norm_rules)
-        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
-                                                       trans_rules +
-                                                       ";[:Space:]+ > ' '")
-        self.search = Transliterator.createFromRules("icu_search",
-                                                     norm_rules + trans_rules)
+    def __init__(self, to_ascii, config):
+        self.to_ascii = to_ascii

         # Set up datrie
         self.replacements = datrie.Trie(config['chars'])
@@ -157,12 +151,6 @@ class GenericTokenAnalysis:
                 self.replacements[src] = repllist

-    def get_normalized(self, name):
-        """ Normalize the given name, i.e. remove all elements not relevant
-            for search.
-        """
-        return self.normalizer.transliterate(name).strip()
-
     def get_variants_ascii(self, norm_name):
         """ Compute the spelling variants for the given normalized name
             and transliterate the result.
@@ -213,10 +201,3 @@ class GenericTokenAnalysis:
                 results.add(trans_name)

         return list(results)
-
-    def get_search_normalized(self, name):
-        """ Return the normalized version of the name (including transliteration)
-            to be applied at search time.
-        """
-        return self.search.transliterate(' ' + name + ' ').strip()
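
A hypothetical minimal token-analysis module under the new interface. Note
that the caller now passes the shared to_ascii transliterator where the old
code received raw rules (the diff keeps the old parameter name trans_rules);
all names below are illustrative only:

    def configure(rules, normalization_rules):
        """ Extract whatever configuration the analyzer needs. """
        return {}

    def create(to_ascii, config):
        """ Instantiate the analyzer with the shared transliterator. """
        return PassThroughAnalysis(to_ascii, config)

    class PassThroughAnalysis:
        """ Simplest conceivable analyzer: one variant, the name itself. """

        def __init__(self, to_ascii, config):
            self.to_ascii = to_ascii

        def get_variants_ascii(self, norm_name):
            # Single spelling variant: the transliterated name.
            return [self.to_ascii.transliterate(norm_name).strip()]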

View File

@@ -169,9 +169,7 @@ def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
     tok.init_new_db(test_config)

-    assert word_table.get_partial_words() == {('test', 1),
-                                              ('no', 1), ('area', 2),
-                                              ('holz', 1), ('strasse', 1),
-                                              ('str', 1)}
+    assert word_table.get_partial_words() == {('test', 1),
+                                              ('no', 1), ('area', 2)}


 def test_init_from_project(monkeypatch, test_config, tokenizer_factory):