mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
introduce sanitizer step before token analysis
Sanitizer functions allow transforming name and address tags before they are handed to the tokenizer. These transformations are visible only to the tokenizer and thus only influence the search terms and address match terms for a place. Currently two sanitizers are implemented, which are responsible for splitting names with multiple values and removing bracket additions. Both were previously hard-coded in the tokenizer.
This commit is contained in:
@@ -12,6 +12,7 @@ from icu import Transliterator
|
|||||||
from nominatim.db.properties import set_property, get_property
|
from nominatim.db.properties import set_property, get_property
|
||||||
from nominatim.errors import UsageError
|
from nominatim.errors import UsageError
|
||||||
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
|
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
|
||||||
|
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
|
||||||
import nominatim.tokenizer.icu_variants as variants
|
import nominatim.tokenizer.icu_variants as variants
|
||||||
|
|
||||||
LOG = logging.getLogger()
|
LOG = logging.getLogger()
|
||||||
@@ -65,6 +66,9 @@ class ICURuleLoader:
|
|||||||
self.analysis_rules = self._get_section(rules, 'variants')
|
self.analysis_rules = self._get_section(rules, 'variants')
|
||||||
self._parse_variant_list()
|
self._parse_variant_list()
|
||||||
|
|
||||||
|
# Load optional sanitizer rule set.
|
||||||
|
self.sanitizer_rules = rules.get('sanitizers', [])
|
||||||
|
|
||||||
|
|
||||||
def load_config_from_db(self, conn):
|
def load_config_from_db(self, conn):
|
||||||
""" Get previously saved parts of the configuration from the
|
""" Get previously saved parts of the configuration from the
|
||||||
@@ -85,6 +89,12 @@ class ICURuleLoader:
|
|||||||
set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
|
set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
|
||||||
|
|
||||||
|
|
||||||
|
def make_sanitizer(self):
|
||||||
|
""" Create a place sanitizer from the configured rules.
|
||||||
|
"""
|
||||||
|
return PlaceSanitizer(self.sanitizer_rules)
|
||||||
|
|
||||||
|
|
||||||
def make_token_analysis(self):
|
def make_token_analysis(self):
|
||||||
""" Create a token analyser from the reviouly loaded rules.
|
""" Create a token analyser from the reviouly loaded rules.
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ from nominatim.db.connection import connect
|
|||||||
from nominatim.db.properties import set_property, get_property
|
from nominatim.db.properties import set_property, get_property
|
||||||
from nominatim.db.utils import CopyBuffer
|
from nominatim.db.utils import CopyBuffer
|
||||||
from nominatim.db.sql_preprocessor import SQLPreprocessor
|
from nominatim.db.sql_preprocessor import SQLPreprocessor
|
||||||
|
from nominatim.indexer.place_info import PlaceInfo
|
||||||
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
|
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
|
||||||
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
|
from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
|
||||||
|
|
||||||
@@ -107,7 +108,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
|
|||||||
|
|
||||||
Analyzers are not thread-safe. You need to instantiate one per thread.
|
Analyzers are not thread-safe. You need to instantiate one per thread.
|
||||||
"""
|
"""
|
||||||
return LegacyICUNameAnalyzer(self.dsn, self.loader.make_token_analysis())
|
return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
|
||||||
|
self.loader.make_token_analysis())
|
||||||
|
|
||||||
|
|
||||||
def _install_php(self, phpdir):
|
def _install_php(self, phpdir):
|
||||||
@@ -187,10 +189,11 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
normalization.
|
normalization.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, dsn, name_proc):
|
def __init__(self, dsn, sanitizer, token_analysis):
|
||||||
self.conn = connect(dsn).connection
|
self.conn = connect(dsn).connection
|
||||||
self.conn.autocommit = True
|
self.conn.autocommit = True
|
||||||
self.name_processor = name_proc
|
self.sanitizer = sanitizer
|
||||||
|
self.token_analysis = token_analysis
|
||||||
|
|
||||||
self._cache = _TokenCache()
|
self._cache = _TokenCache()
|
||||||
|
|
||||||
@@ -203,6 +206,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
self.conn = None
|
self.conn = None
|
||||||
|
|
||||||
|
|
||||||
|
def _search_normalized(self, name):
|
||||||
|
""" Return the search token transliteration of the given name.
|
||||||
|
"""
|
||||||
|
return self.token_analysis.get_search_normalized(name)
|
||||||
|
|
||||||
|
|
||||||
|
def _normalized(self, name):
|
||||||
|
""" Return the normalized version of the given name with all
|
||||||
|
non-relevant information removed.
|
||||||
|
"""
|
||||||
|
return self.token_analysis.get_normalized(name)
|
||||||
|
|
||||||
|
|
||||||
def get_word_token_info(self, words):
|
def get_word_token_info(self, words):
|
||||||
""" Return token information for the given list of words.
|
""" Return token information for the given list of words.
|
||||||
If a word starts with # it is assumed to be a full name
|
If a word starts with # it is assumed to be a full name
|
||||||
@@ -218,9 +234,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
partial_tokens = {}
|
partial_tokens = {}
|
||||||
for word in words:
|
for word in words:
|
||||||
if word.startswith('#'):
|
if word.startswith('#'):
|
||||||
full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
|
full_tokens[word] = self._search_normalized(word[1:])
|
||||||
else:
|
else:
|
||||||
partial_tokens[word] = self.name_processor.get_search_normalized(word)
|
partial_tokens[word] = self._search_normalized(word)
|
||||||
|
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
cur.execute("""SELECT word_token, word_id
|
cur.execute("""SELECT word_token, word_id
|
||||||
@@ -251,7 +267,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
|
|
||||||
This function takes minor shortcuts on transliteration.
|
This function takes minor shortcuts on transliteration.
|
||||||
"""
|
"""
|
||||||
return self.name_processor.get_search_normalized(hnr)
|
return self._search_normalized(hnr)
|
||||||
|
|
||||||
def update_postcodes_from_db(self):
|
def update_postcodes_from_db(self):
|
||||||
""" Update postcode tokens in the word table from the location_postcode
|
""" Update postcode tokens in the word table from the location_postcode
|
||||||
@@ -274,7 +290,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
if postcode is None:
|
if postcode is None:
|
||||||
to_delete.append(word)
|
to_delete.append(word)
|
||||||
else:
|
else:
|
||||||
copystr.add(self.name_processor.get_search_normalized(postcode),
|
copystr.add(self._search_normalized(postcode),
|
||||||
'P', postcode)
|
'P', postcode)
|
||||||
|
|
||||||
if to_delete:
|
if to_delete:
|
||||||
@@ -292,7 +308,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
completely replaced. Otherwise the phrases are added to the
|
completely replaced. Otherwise the phrases are added to the
|
||||||
already existing ones.
|
already existing ones.
|
||||||
"""
|
"""
|
||||||
norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
|
norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
|
||||||
for p in phrases))
|
for p in phrases))
|
||||||
|
|
||||||
with self.conn.cursor() as cur:
|
with self.conn.cursor() as cur:
|
||||||
@@ -322,7 +338,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
added = 0
|
added = 0
|
||||||
with CopyBuffer() as copystr:
|
with CopyBuffer() as copystr:
|
||||||
for word, cls, typ, oper in to_add:
|
for word, cls, typ, oper in to_add:
|
||||||
term = self.name_processor.get_search_normalized(word)
|
term = self._search_normalized(word)
|
||||||
if term:
|
if term:
|
||||||
copystr.add(term, 'S', word,
|
copystr.add(term, 'S', word,
|
||||||
json.dumps({'class': cls, 'type': typ,
|
json.dumps({'class': cls, 'type': typ,
|
||||||
@@ -356,9 +372,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
def add_country_names(self, country_code, names):
|
def add_country_names(self, country_code, names):
|
||||||
""" Add names for the given country to the search index.
|
""" Add names for the given country to the search index.
|
||||||
"""
|
"""
|
||||||
|
# Make sure any name preprocessing for country names applies.
|
||||||
|
info = PlaceInfo({'name': names, 'country_code': country_code,
|
||||||
|
'rank_address': 4, 'class': 'boundary',
|
||||||
|
'type': 'administrative'})
|
||||||
|
self._add_country_full_names(country_code,
|
||||||
|
self.sanitizer.process_names(info)[0])
|
||||||
|
|
||||||
|
|
||||||
|
def _add_country_full_names(self, country_code, names):
|
||||||
|
""" Add names for the given country from an already sanitized
|
||||||
|
name list.
|
||||||
|
"""
|
||||||
word_tokens = set()
|
word_tokens = set()
|
||||||
for name in self._compute_full_names(names):
|
for name in names:
|
||||||
norm_name = self.name_processor.get_search_normalized(name)
|
norm_name = self._search_normalized(name.name)
|
||||||
if norm_name:
|
if norm_name:
|
||||||
word_tokens.add(norm_name)
|
word_tokens.add(norm_name)
|
||||||
|
|
||||||
@@ -384,12 +412,12 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
def process_place(self, place):
|
def process_place(self, place):
|
||||||
""" Determine tokenizer information about the given place.
|
""" Determine tokenizer information about the given place.
|
||||||
|
|
||||||
Returns a JSON-serialisable structure that will be handed into
|
Returns a JSON-serializable structure that will be handed into
|
||||||
the database via the token_info field.
|
the database via the token_info field.
|
||||||
"""
|
"""
|
||||||
token_info = _TokenInfo(self._cache)
|
token_info = _TokenInfo(self._cache)
|
||||||
|
|
||||||
names = place.name
|
names, address = self.sanitizer.process_names(place)
|
||||||
|
|
||||||
if names:
|
if names:
|
||||||
fulls, partials = self._compute_name_tokens(names)
|
fulls, partials = self._compute_name_tokens(names)
|
||||||
@@ -397,9 +425,8 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
token_info.add_names(fulls, partials)
|
token_info.add_names(fulls, partials)
|
||||||
|
|
||||||
if place.is_country():
|
if place.is_country():
|
||||||
self.add_country_names(place.country_code, names)
|
self._add_country_full_names(place.country_code, names)
|
||||||
|
|
||||||
address = place.address
|
|
||||||
if address:
|
if address:
|
||||||
self._process_place_address(token_info, address)
|
self._process_place_address(token_info, address)
|
||||||
|
|
||||||
@@ -409,18 +436,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
def _process_place_address(self, token_info, address):
|
def _process_place_address(self, token_info, address):
|
||||||
hnrs = []
|
hnrs = []
|
||||||
addr_terms = []
|
addr_terms = []
|
||||||
for key, value in address.items():
|
for item in address:
|
||||||
if key == 'postcode':
|
if item.kind == 'postcode':
|
||||||
self._add_postcode(value)
|
self._add_postcode(item.name)
|
||||||
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
|
elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
|
||||||
hnrs.append(value)
|
hnrs.append(item.name)
|
||||||
elif key == 'street':
|
elif item.kind == 'street':
|
||||||
token_info.add_street(self._compute_partial_tokens(value))
|
token_info.add_street(self._compute_partial_tokens(item.name))
|
||||||
elif key == 'place':
|
elif item.kind == 'place':
|
||||||
token_info.add_place(self._compute_partial_tokens(value))
|
token_info.add_place(self._compute_partial_tokens(item.name))
|
||||||
elif not key.startswith('_') and \
|
elif not item.kind.startswith('_') and \
|
||||||
key not in ('country', 'full'):
|
item.kind not in ('country', 'full'):
|
||||||
addr_terms.append((key, self._compute_partial_tokens(value)))
|
addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
|
||||||
|
|
||||||
if hnrs:
|
if hnrs:
|
||||||
hnrs = self._split_housenumbers(hnrs)
|
hnrs = self._split_housenumbers(hnrs)
|
||||||
@@ -433,7 +460,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
""" Normalize the given term, split it into partial words and return
|
""" Normalize the given term, split it into partial words and return
|
||||||
then token list for them.
|
then token list for them.
|
||||||
"""
|
"""
|
||||||
norm_name = self.name_processor.get_search_normalized(name)
|
norm_name = self._search_normalized(name)
|
||||||
|
|
||||||
tokens = []
|
tokens = []
|
||||||
need_lookup = []
|
need_lookup = []
|
||||||
@@ -456,19 +483,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
|
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
|
|
||||||
def _compute_name_tokens(self, names):
|
def _compute_name_tokens(self, names):
|
||||||
""" Computes the full name and partial name tokens for the given
|
""" Computes the full name and partial name tokens for the given
|
||||||
dictionary of names.
|
dictionary of names.
|
||||||
"""
|
"""
|
||||||
full_names = self._compute_full_names(names)
|
|
||||||
full_tokens = set()
|
full_tokens = set()
|
||||||
partial_tokens = set()
|
partial_tokens = set()
|
||||||
|
|
||||||
for name in full_names:
|
for name in names:
|
||||||
norm_name = self.name_processor.get_normalized(name)
|
norm_name = self._normalized(name.name)
|
||||||
full, part = self._cache.names.get(norm_name, (None, None))
|
full, part = self._cache.names.get(norm_name, (None, None))
|
||||||
if full is None:
|
if full is None:
|
||||||
variants = self.name_processor.get_variants_ascii(norm_name)
|
variants = self.token_analysis.get_variants_ascii(norm_name)
|
||||||
if not variants:
|
if not variants:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -485,23 +512,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
return full_tokens, partial_tokens
|
return full_tokens, partial_tokens
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _compute_full_names(names):
|
|
||||||
""" Return the set of all full name word ids to be used with the
|
|
||||||
given dictionary of names.
|
|
||||||
"""
|
|
||||||
full_names = set()
|
|
||||||
for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
|
|
||||||
if name:
|
|
||||||
full_names.add(name)
|
|
||||||
|
|
||||||
brace_idx = name.find('(')
|
|
||||||
if brace_idx >= 0:
|
|
||||||
full_names.add(name[:brace_idx].strip())
|
|
||||||
|
|
||||||
return full_names
|
|
||||||
|
|
||||||
|
|
||||||
def _add_postcode(self, postcode):
|
def _add_postcode(self, postcode):
|
||||||
""" Make sure the normalized postcode is present in the word table.
|
""" Make sure the normalized postcode is present in the word table.
|
||||||
"""
|
"""
|
||||||
@@ -509,7 +519,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
postcode = self.normalize_postcode(postcode)
|
postcode = self.normalize_postcode(postcode)
|
||||||
|
|
||||||
if postcode not in self._cache.postcodes:
|
if postcode not in self._cache.postcodes:
|
||||||
term = self.name_processor.get_search_normalized(postcode)
|
term = self._search_normalized(postcode)
|
||||||
if not term:
|
if not term:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|||||||
127
nominatim/tokenizer/place_sanitizer.py
Normal file
127
nominatim/tokenizer/place_sanitizer.py
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
"""
|
||||||
|
Handler for cleaning name and address tags in place information before it
|
||||||
|
is handed to the token analysis.
|
||||||
|
"""
|
||||||
|
import importlib
|
||||||
|
|
||||||
|
from nominatim.errors import UsageError
|
||||||
|
|
||||||
|
class PlaceName:
    """ Searchable name of a place together with its properties.

        Each instance stores the name string itself and two fixed properties:
        * 'kind' is the OSM key the name was taken from, without any suffix
          (i.e. with the part after the colon removed)
        * 'suffix' is the part of the OSM key after the first colon, or
          None when the key has no suffix.

        Further free-form attributes can be kept in the 'attr' dictionary.
        Which attributes are evaluated depends on the token analyser.
    """

    def __init__(self, name, kind, suffix):
        self.name, self.kind, self.suffix = name, kind, suffix
        self.attr = {}

    def __repr__(self):
        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"

    def clone(self, name=None, kind=None, suffix=None, attr=None):
        """ Return a copy of this place name. Any of 'name', 'kind' and
            'suffix' that is given with a non-empty value replaces the value
            of the original; empty or omitted values keep the original one.

            The copy gets its own attribute dictionary holding the
            attributes of the original, updated with the entries from
            'attr'. Attributes can only be added or overwritten this way,
            never removed.
        """
        duplicate = PlaceName(name or self.name,
                              kind or self.kind,
                              suffix or self.suffix)

        # Start from the attributes of the original, then apply overrides.
        duplicate.attr = dict(self.attr)
        if attr:
            duplicate.attr.update(attr)

        return duplicate

    def set_attr(self, key, value):
        """ Set the attribute 'key' to 'value', overwriting any value
            stored previously under that key.
        """
        self.attr[key] = value

    def get_attr(self, key, default=None):
        """ Return the value of attribute 'key', or 'default' when the
            attribute has not been set.
        """
        return self.attr.get(key, default)

    def has_attr(self, key):
        """ Return True when the attribute 'key' has been set.
        """
        return key in self.attr
|
||||||
|
|
||||||
|
|
||||||
|
class _ProcessInfo:
    """ Container for the information that is handed to the handler
        functions.

        The 'names' and 'address' members are mutable. A handler changes
        them either by modifying the list in place or by replacing the
        old content with a new list.
    """

    def __init__(self, place):
        self.place = place
        self.names = self._convert_name_dict(place.name)
        self.address = self._convert_name_dict(place.address)

    @staticmethod
    def _convert_name_dict(names):
        """ Turn a dictionary of names into a list of PlaceNames.

            Each dictionary key is divided at the first colon into the
            primary key (the 'kind') and the suffix. Keys without a colon
            get a suffix of None. Value, kind and suffix are stripped of
            surrounding whitespace. An empty or missing dictionary yields
            an empty list.
        """
        if not names:
            return []

        converted = []
        for key, value in names.items():
            kind, colon, suffix = key.partition(':')
            converted.append(PlaceName(value.strip(),
                                       kind.strip(),
                                       suffix.strip() if colon else None))

        return converted
|
||||||
|
|
||||||
|
|
||||||
|
class PlaceSanitizer:
    """ Controller that runs the configured sanitizer functions over the
        names and address of a place before the token analysers see them.
    """

    def __init__(self, rules):
        """ Build the handler chain from 'rules', a list of rule
            dictionaries. Each rule must have a 'step' entry naming the
            sanitizer module (dashes are mapped to underscores); the
            rule itself is passed to the module's create() function.
            A missing 'step' raises a UsageError.
        """
        self.handlers = []

        # An empty or missing rule set simply results in no handlers.
        for rule in rules or ():
            if 'step' not in rule:
                raise UsageError("Sanitizer rule is missing the 'step' attribute.")

            step_module = importlib.import_module(
                'nominatim.tokenizer.sanitizers.' + rule['step'].replace('-', '_'))
            self.handlers.append(step_module.create(rule))

    def process_names(self, place):
        """ Run all handlers over the names and address parts of the given
            place and return the result as a tuple
            (list of names, list of address names).
        """
        info = _ProcessInfo(place)

        for handler in self.handlers:
            handler(info)

        return info.names, info.address
|
||||||
0
nominatim/tokenizer/sanitizers/__init__.py
Normal file
0
nominatim/tokenizer/sanitizers/__init__.py
Normal file
28
nominatim/tokenizer/sanitizers/split_name_list.py
Normal file
28
nominatim/tokenizer/sanitizers/split_name_list.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
"""
|
||||||
|
Name processor that splits name values with multiple values into their components.
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
|
||||||
|
def create(func):
    """ Create a name processing function that splits name values with
        multiple values into their components. The optional parameter
        'delimiters' can be used to define the characters that should be used
        for splitting. The default is ',;'.

        Every character of 'delimiters' is matched literally. The resulting
        parts are stripped of surrounding whitespace and empty parts are
        dropped. Names that contain no delimiter are passed on unchanged.
    """
    delimiters = func.get('delimiters', ',;')
    # Escape each character: ']', '^', '-' and '\' have a special meaning
    # inside a character class and would otherwise change or break the
    # pattern.
    regexp = re.compile('[{}]'.format(''.join(re.escape(d) for d in delimiters)))

    def _process(obj):
        """ Replace every multi-value name in obj.names by one name
            per component.
        """
        if not obj.names:
            return

        new_names = []
        for name in obj.names:
            split_names = regexp.split(name.name)
            if len(split_names) == 1:
                new_names.append(name)
                continue

            for part in split_names:
                part = part.strip()
                # Empty parts (e.g. from 'a;;b') must be skipped: cloning
                # with an empty name would fall back to the unsplit
                # original name.
                if part:
                    new_names.append(name.clone(name=part))

        obj.names = new_names

    return _process
|
||||||
22
nominatim/tokenizer/sanitizers/strip_brace_terms.py
Normal file
22
nominatim/tokenizer/sanitizers/strip_brace_terms.py
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
"""
|
||||||
|
Sanitizer handling names with addendums in braces.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def create(_):
    """ Create a name processing function that adds an extra name variant
        for every name with an addendum in brackets (e.g. "Halle (Saale)").
        The extra variant consists of the main name only, i.e. the text
        before the first opening bracket.
    """
    def _process(obj):
        """ Append bracket-free variants for all names with a bracket
            extension. The original names are kept.
        """
        if not obj.names:
            return

        # Pair each bracketed name with the text in front of the bracket.
        stems = ((entry, entry.name.split('(')[0].strip())
                 for entry in obj.names if '(' in entry.name)

        # The list is built completely before it is appended, so the
        # iteration above never sees the newly added variants. Names that
        # start with a bracket produce an empty stem and are skipped.
        obj.names.extend([entry.clone(name=stem) for entry, stem in stems if stem])

    return _process
|
||||||
@@ -24,6 +24,9 @@ transliteration:
|
|||||||
- "[^[:Ascii:]] >"
|
- "[^[:Ascii:]] >"
|
||||||
- ":: lower ()"
|
- ":: lower ()"
|
||||||
- ":: NFC ()"
|
- ":: NFC ()"
|
||||||
|
sanitizers:
|
||||||
|
- step: split-name-list
|
||||||
|
- step: strip-brace-terms
|
||||||
variants:
|
variants:
|
||||||
- !include icu-rules/variants-bg.yaml
|
- !include icu-rules/variants-bg.yaml
|
||||||
- !include icu-rules/variants-ca.yaml
|
- !include icu-rules/variants-ca.yaml
|
||||||
|
|||||||
@@ -67,10 +67,12 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
|
|||||||
monkeypatch.undo()
|
monkeypatch.undo()
|
||||||
|
|
||||||
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
|
def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
|
||||||
variants=('~gasse -> gasse', 'street => st', )):
|
variants=('~gasse -> gasse', 'street => st', ),
|
||||||
|
sanitizers=[]):
|
||||||
cfgstr = {'normalization' : list(norm),
|
cfgstr = {'normalization' : list(norm),
|
||||||
'transliteration' : list(trans),
|
'sanitizers' : sanitizers,
|
||||||
'variants' : [ {'words': list(variants)}]}
|
'transliteration' : list(trans),
|
||||||
|
'variants' : [ {'words': list(variants)}]}
|
||||||
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
|
(test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
|
||||||
tok.loader = ICURuleLoader(test_config)
|
tok.loader = ICURuleLoader(test_config)
|
||||||
|
|
||||||
@@ -309,14 +311,15 @@ class TestPlaceNames:
|
|||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
def setup(self, analyzer, sql_functions):
|
def setup(self, analyzer, sql_functions):
|
||||||
with analyzer() as anl:
|
sanitizers = [{'step': 'split-name-list'},
|
||||||
|
{'step': 'strip-brace-terms'}]
|
||||||
|
with analyzer(sanitizers=sanitizers) as anl:
|
||||||
self.analyzer = anl
|
self.analyzer = anl
|
||||||
yield anl
|
yield anl
|
||||||
|
|
||||||
|
|
||||||
def expect_name_terms(self, info, *expected_terms):
|
def expect_name_terms(self, info, *expected_terms):
|
||||||
tokens = self.analyzer.get_word_token_info(expected_terms)
|
tokens = self.analyzer.get_word_token_info(expected_terms)
|
||||||
print (tokens)
|
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
assert token[2] is not None, "No token for {0}".format(token)
|
assert token[2] is not None, "No token for {0}".format(token)
|
||||||
|
|
||||||
@@ -324,9 +327,7 @@ class TestPlaceNames:
|
|||||||
|
|
||||||
|
|
||||||
def process_named_place(self, names):
|
def process_named_place(self, names):
|
||||||
place = {'name': names}
|
return self.analyzer.process_place(PlaceInfo({'name': names}))
|
||||||
|
|
||||||
return self.analyzer.process_place(PlaceInfo(place))
|
|
||||||
|
|
||||||
|
|
||||||
def test_simple_names(self):
|
def test_simple_names(self):
|
||||||
|
|||||||
Reference in New Issue
Block a user