forked from hans/Nominatim
introduce sanitizer step before token analysis
Sanitizer functions allow transforming name and address tags before they are handed to the tokenizer. These transformations are visible only to the tokenizer and thus only influence the search terms and address match terms for a place. Currently two sanitizers are implemented, which are responsible for splitting names with multiple values and removing bracket additions. Both were previously hard-coded in the tokenizer.
This commit is contained in:
0
nominatim/tokenizer/sanitizers/__init__.py
Normal file
0
nominatim/tokenizer/sanitizers/__init__.py
Normal file
28
nominatim/tokenizer/sanitizers/split_name_list.py
Normal file
28
nominatim/tokenizer/sanitizers/split_name_list.py
Normal file
@@ -0,0 +1,28 @@
|
||||
"""
|
||||
Name processor that splits name values with multiple values into their components.
|
||||
"""
|
||||
import re
|
||||
|
||||
def create(func):
    """ Create a name processing function that splits name values with
        multiple values into their components.

        `func` is the sanitizer configuration; it only needs to provide a
        dict-like `get()` method. The optional parameter 'delimiters' can be
        used to define the characters that should be used for splitting.
        The default is ',;'. Every character is taken literally, so
        characters with a special meaning in regular expressions (such as
        '^', ']' or '-') are safe to use.

        Raises a ValueError when 'delimiters' is empty.

        Returns a function that takes an object with a `names` attribute
        and replaces that attribute with the list of split names.
    """
    delimiters = func.get('delimiters', ',;')
    if not delimiters:
        raise ValueError("Parameter 'delimiters' must contain "
                         "at least one character.")

    # Escape each delimiter: inside a character class an unescaped '^', ']',
    # '-' or '\' would change the meaning of the pattern or break it.
    regexp = re.compile('[{}]'.format(''.join(re.escape(d) for d in delimiters)))

    def _process(obj):
        """ Replace `obj.names` with a list where each multi-value name
            has been expanded into one entry per component.
        """
        if not obj.names:
            return

        new_names = []
        for name in obj.names:
            split_names = regexp.split(name.name)
            if len(split_names) == 1:
                # No delimiter present: keep the original name object as is.
                new_names.append(name)
            else:
                # Trim whitespace around the components and skip components
                # that are empty (e.g. from a trailing delimiter).
                parts = (part.strip() for part in split_names)
                new_names.extend(name.clone(name=part) for part in parts if part)

        obj.names = new_names

    return _process
|
||||
22
nominatim/tokenizer/sanitizers/strip_brace_terms.py
Normal file
22
nominatim/tokenizer/sanitizers/strip_brace_terms.py
Normal file
@@ -0,0 +1,22 @@
|
||||
"""
|
||||
Sanitizer handling names with addendums in braces.
|
||||
"""
|
||||
|
||||
def create(_):
    """ Create a name processing function that creates additional name variants
        when a name has an addendum in brackets (e.g. "Halle (Saale)"). The
        additional variant only contains the main name without the bracket part.

        The configuration argument is ignored.
    """
    def _process(obj):
        """ Add variants for names that have a bracket extension.

            The original names are kept; the shortened variants are appended
            to `obj.names`. Nothing happens when `obj.names` is empty or None.
        """
        # Guard first, so that extend() is never called on a missing list.
        if not obj.names:
            return

        new_names = []
        for name in obj.names:
            if '(' not in name.name:
                continue
            # Keep only the text in front of the first opening bracket.
            new_name = name.name.partition('(')[0].strip()
            # A name that starts with a bracket leaves nothing to add.
            if new_name:
                new_names.append(name.clone(name=new_name))

        obj.names.extend(new_names)

    return _process
|
||||
Reference in New Issue
Block a user