mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-14 10:27:57 +00:00
Merge pull request #2993 from biswajit-k/delete-tags
Adds sanitizer for preventing certain tags to enter search index based on parameters
This commit is contained in:
@@ -102,7 +102,7 @@ Here is an example configuration file:
|
||||
``` yaml
|
||||
normalization:
|
||||
- ":: lower ()"
|
||||
- "ß > 'ss'" # German szet is unimbigiously equal to double ss
|
||||
- "ß > 'ss'" # German szet is unambiguously equal to double ss
|
||||
transliteration:
|
||||
- !include /etc/nominatim/icu-rules/extended-unicode-to-asccii.yaml
|
||||
- ":: Ascii ()"
|
||||
@@ -128,7 +128,7 @@ The configuration file contains four sections:
|
||||
The normalization and transliteration sections each define a set of
|
||||
ICU rules that are applied to the names.
|
||||
|
||||
The **normalisation** rules are applied after sanitation. They should remove
|
||||
The **normalization** rules are applied after sanitation. They should remove
|
||||
any information that is not relevant for search at all. Usual rules to be
|
||||
applied here are: lower-casing, removing of special characters, cleanup of
|
||||
spaces.
|
||||
@@ -221,7 +221,13 @@ The following is a list of sanitizers that are shipped with Nominatim.
|
||||
rendering:
|
||||
heading_level: 6
|
||||
|
||||
#### delete-tags
|
||||
|
||||
::: nominatim.tokenizer.sanitizers.delete_tags
|
||||
selection:
|
||||
members: False
|
||||
rendering:
|
||||
heading_level: 6
|
||||
|
||||
#### Token Analysis
|
||||
|
||||
|
||||
144
nominatim/tokenizer/sanitizers/delete_tags.py
Normal file
144
nominatim/tokenizer/sanitizers/delete_tags.py
Normal file
@@ -0,0 +1,144 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2023 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Sanitizer which prevents certain tags from getting into the search index.
|
||||
It removes tags which match all properties given below.
|
||||
|
||||
|
||||
Arguments:
|
||||
type: Define which type of tags should be considered for removal.
|
||||
There are two types of tags 'name' and 'address' tags.
|
||||
Takes a string 'name' or 'address'. (default: 'name')
|
||||
|
||||
filter-kind: Define which 'kind' of tags should be removed.
|
||||
Takes a string or list of strings where each
|
||||
string is a regular expression. A tag is considered
|
||||
to be a candidate for removal if its 'kind' property
|
||||
fully matches any of the given regular expressions.
|
||||
Note that by default all 'kind' of tags are considered.
|
||||
|
||||
suffix: Define the 'suffix' property of the tags which should be
|
||||
removed. Takes a string or list of strings where each
|
||||
string is a regular expression. A tag is considered to be a
|
||||
candidate for removal if its 'suffix' property fully
|
||||
matches any of the given regular expressions. Note that by
|
||||
default tags with any suffix value are considered including
|
||||
those which don't have a suffix at all.
|
||||
|
||||
name: Define the 'name' property corresponding to the 'kind' property
|
||||
of the tag. Takes a string or list of strings where each string
|
||||
is a regular expression. A tag is considered to be a candidate
|
||||
for removal if its name fully matches any of the given regular
|
||||
expressions. Note that by default tags with any 'name' are
|
||||
considered.
|
||||
|
||||
country_code: Define the country code of places whose tags should be
|
||||
considered for removal. Takes a string or list of strings
|
||||
where each string is a two-letter lower-case country code.
|
||||
Note that by default tags of places with any country code
|
||||
are considered including those which don't have a country
|
||||
code at all.
|
||||
|
||||
rank_address: Define the address rank of places whose tags should be
|
||||
considered for removal. Takes a string or list of strings
|
||||
where each string is a number or a range of numbers of the
|
||||
form <from>-<to>.
|
||||
Note that default is '0-30', which means that tags of all
|
||||
places are considered.
|
||||
See https://nominatim.org/release-docs/latest/customize/Ranking/#address-rank
|
||||
to learn more about address rank.
|
||||
|
||||
|
||||
"""
|
||||
from typing import Callable, List, Optional, Pattern, Tuple, Sequence
|
||||
import re
|
||||
|
||||
from nominatim.tokenizer.sanitizers.base import ProcessInfo
|
||||
from nominatim.data.place_name import PlaceName
|
||||
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
|
||||
|
||||
class _TagSanitizer:
    """ Callable sanitizer step that removes name or address tags from
        a place when the place and the tag match every configured
        criterion (country code, address rank, kind, suffix and name).
    """

    def __init__(self, config: SanitizerConfig) -> None:
        """ Read all filter criteria from the sanitizer configuration.

            Raises a ValueError when 'rank_address' contains an entry
            that is not a rank or rank range within 0-30.
        """
        # Any value other than 'name' makes the sanitizer work on address tags.
        self.type = config.get('type', 'name')
        self.filter_kind = config.get_filter_kind()
        self.country_codes = config.get_string_list('country_code', [])
        self.allowed_ranks = self._set_allowed_ranks(
            config.get_string_list('rank_address', ['0-30']))

        # An explicitly configured (even empty) country list restricts the
        # sanitizer; only a missing option means "any country".
        self.has_country_code = config.get('country_code', None) is not None

        # The default pattern matches every string, including the empty one.
        suffixregexps = config.get_string_list('suffix', [r'[\s\S]*'])
        self.suffix_regexp = [re.compile(r) for r in suffixregexps]

        nameregexps = config.get_string_list('name', [r'[\s\S]*'])
        self.name_regexp = [re.compile(r) for r in nameregexps]


    def __call__(self, obj: ProcessInfo) -> None:
        """ Drop all matching tags from the names or the address of `obj`,
            depending on the configured type. The tag list is left
            untouched when it is empty or when the place is outside the
            configured countries or address ranks.
        """
        tags = obj.names if self.type == 'name' else obj.address

        if not tags:
            return

        if self.has_country_code and obj.place.country_code not in self.country_codes:
            return

        if not self.allowed_ranks[obj.place.rank_address]:
            return

        # A tag is only removed when kind, suffix and name all match.
        filtered_tags: List[PlaceName] = [
            tag for tag in tags
            if not (self.filter_kind(tag.kind)
                    and self._matches(tag.suffix, self.suffix_regexp)
                    and self._matches(tag.name, self.name_regexp))]

        if self.type == 'name':
            obj.names = filtered_tags
        else:
            obj.address = filtered_tags


    def _set_allowed_ranks(self, ranks: Sequence[str]) -> Tuple[bool, ...]:
        """ Returns a tuple of 31 boolean values corresponding to the
            address ranks 0-30. Value at index 'i' is True if rank 'i'
            is present in the ranks or lies in the range of any of the
            ranks provided in the sanitizer configuration, otherwise
            the value is False.

            Each entry must be a single rank ('<rank>') or an ascending
            range ('<from>-<to>') within 0-30. Anything else raises a
            ValueError naming the offending entry.
        """
        allowed_ranks = [False] * 31

        for rank in ranks:
            bounds = rank.split('-')
            if len(bounds) > 2:
                raise ValueError(f"Invalid address rank '{rank}': "
                                 "expected '<rank>' or '<from>-<to>'.")

            try:
                # For a single rank, first and last element are the same.
                start, end = int(bounds[0]), int(bounds[-1])
            except ValueError as err:
                raise ValueError(f"Invalid address rank '{rank}': "
                                 "ranks must be integers.") from err

            # Reject out-of-range and reversed ranges instead of failing
            # with an IndexError or silently selecting nothing.
            if not 0 <= start <= end <= 30:
                raise ValueError(f"Invalid address rank '{rank}': ranks must "
                                 "lie within 0-30 and ranges must be ascending.")

            allowed_ranks[start:end + 1] = [True] * (end - start + 1)

        return tuple(allowed_ranks)


    def _matches(self, value: Optional[str], patterns: List[Pattern[str]]) -> bool:
        """ Returns True if the given value fully matches any of the regular
            expression pattern in the list. Otherwise, returns False.

            Note that if the value is None, it is taken as an empty string.
        """
        target = '' if value is None else value
        return any(r.fullmatch(target) is not None for r in patterns)
|
||||
|
||||
|
||||
|
||||
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Build the tag-removing sanitizer for the given configuration
        and hand it back as the callable for this sanitizer step.
    """
    sanitizer = _TagSanitizer(config)

    return sanitizer
|
||||
327
test/python/tokenizer/sanitizers/test_delete_tags.py
Normal file
327
test/python/tokenizer/sanitizers/test_delete_tags.py
Normal file
@@ -0,0 +1,327 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2023 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Tests for the sanitizer that deletes tags.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
|
||||
from nominatim.data.place_info import PlaceInfo
|
||||
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
|
||||
|
||||
|
||||
class TestWithDefault:
    """ Checks for the 'delete-tags' step when no optional parameter
        is configured.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config


    def run_sanitizer_on(self, type, **kwargs):
        """ Run a bare 'delete-tags' step over a place whose tags are
            stored under the key given in `type`. Returns the sorted
            (name, kind, suffix) triples of the resulting names and address.
        """
        # Keyword names carry '_' where the tag key has ':'.
        tags = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({type: tags, 'country_code': 'de', 'rank_address': 30})

        step = {'step': 'delete-tags'}
        sanitizer = PlaceSanitizer([step], self.config)
        name, address = sanitizer.process_names(place)

        name_triples = sorted((p.name, p.kind, p.suffix or '') for p in name)
        address_triples = sorted((p.name, p.kind, p.suffix or '') for p in address)

        return {'name': name_triples, 'address': address_triples}


    def test_on_name(self):
        outcome = self.run_sanitizer_on('name', name='foo', ref='bar', ref_abc='baz')

        # All name tags are expected to be removed by default.
        assert outcome.get('name') == []


    def test_on_address(self):
        outcome = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz')

        # Address tags are expected to survive a default configuration.
        expected = [('bar', 'ref', ''), ('baz', 'ref', 'abc'), ('foo', 'name', '')]
        assert outcome.get('address') == expected
|
||||
|
||||
|
||||
class TestTypeField:
    """ Checks for the 'type' parameter. The place under test only ever
        carries name tags.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config


    def run_sanitizer_on(self, type, **kwargs):
        """ Run 'delete-tags' with the given `type` over a place with the
            given name tags and return the sorted (name, kind, suffix)
            triples of the remaining names.
        """
        tags = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': tags, 'country_code': 'de', 'rank_address': 30})

        step = {'step': 'delete-tags', 'type': type}
        sanitizer = PlaceSanitizer([step], self.config)
        name, _ = sanitizer.process_names(place)

        return sorted((p.name, p.kind, p.suffix or '') for p in name)


    def test_name_type(self):
        remaining = self.run_sanitizer_on('name', name='foo', ref='bar', ref_abc='baz')

        assert remaining == []


    def test_address_type(self):
        remaining = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz')

        # With type 'address' the name tags are expected to stay untouched.
        expected = [('bar', 'ref', ''), ('baz', 'ref', 'abc'), ('foo', 'name', '')]
        assert remaining == expected
|
||||
|
||||
class TestFilterKind:
    """ Checks for the 'filter-kind' parameter.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config


    def run_sanitizer_on(self, filt, **kwargs):
        """ Run 'delete-tags' with `filt` as 'filter-kind' over a place
            with the given name tags and return the sorted
            (name, kind, suffix) triples of the remaining names.
        """
        tags = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': tags, 'country_code': 'de', 'rank_address': 30})

        step = {'step': 'delete-tags', 'filter-kind': filt}
        sanitizer = PlaceSanitizer([step], self.config)
        name, _ = sanitizer.process_names(place)

        return sorted((p.name, p.kind, p.suffix or '') for p in name)


    def test_single_exact_name(self):
        remaining = self.run_sanitizer_on(['name'], ref='foo', name='foo',
                                          name_abc='bar', ref_abc='bar')

        assert remaining == [('bar', 'ref', 'abc'), ('foo', 'ref', '')]


    def test_single_pattern(self):
        remaining = self.run_sanitizer_on(['.*name'],
                                          name_fr='foo', ref_fr='foo', namexx_fr='bar',
                                          shortname_fr='bar', name='bar')

        # 'namexx' is expected to stay: the pattern must match the whole kind.
        assert remaining == [('bar', 'namexx', 'fr'), ('foo', 'ref', 'fr')]


    def test_multiple_patterns(self):
        remaining = self.run_sanitizer_on(['.*name', 'ref'],
                                          name_fr='foo', ref_fr='foo', oldref_fr='foo',
                                          namexx_fr='bar', shortname_fr='baz', name='baz')

        assert remaining == [('bar', 'namexx', 'fr'), ('foo', 'oldref', 'fr')]
|
||||
|
||||
|
||||
class TestRankAddress:
    """ Checks for the 'rank_address' parameter. The place under test
        always has address rank 30.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config


    def run_sanitizer_on(self, rank_addr, **kwargs):
        """ Run 'delete-tags' with `rank_addr` as 'rank_address' over a
            place with the given name tags and return the sorted
            (name, kind, suffix) triples of the remaining names.
        """
        tags = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': tags, 'country_code': 'de', 'rank_address': 30})

        step = {'step': 'delete-tags', 'rank_address': rank_addr}
        sanitizer = PlaceSanitizer([step], self.config)
        name, _ = sanitizer.process_names(place)

        return sorted((p.name, p.kind, p.suffix or '') for p in name)


    def test_single_rank(self):
        remaining = self.run_sanitizer_on('30', name='foo', ref='bar')

        assert remaining == []


    def test_single_rank_fail(self):
        remaining = self.run_sanitizer_on('28', name='foo', ref='bar')

        assert remaining == [('bar', 'ref', ''), ('foo', 'name', '')]


    def test_ranged_rank_pass(self):
        remaining = self.run_sanitizer_on('26-30', name='foo', ref='bar')

        assert remaining == []


    def test_ranged_rank_fail(self):
        remaining = self.run_sanitizer_on('26-29', name='foo', ref='bar')

        assert remaining == [('bar', 'ref', ''), ('foo', 'name', '')]


    def test_mixed_rank_pass(self):
        rank_spec = ['4', '20-28', '30', '10-12']
        remaining = self.run_sanitizer_on(rank_spec, name='foo', ref='bar')

        assert remaining == []


    def test_mixed_rank_fail(self):
        rank_spec = ['4-8', '10', '26-29', '18']
        remaining = self.run_sanitizer_on(rank_spec, name='foo', ref='bar')

        assert remaining == [('bar', 'ref', ''), ('foo', 'name', '')]
|
||||
|
||||
|
||||
class TestSuffix:
    """ Checks for the 'suffix' parameter.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config


    def run_sanitizer_on(self, suffix, **kwargs):
        """ Run 'delete-tags' with the given `suffix` setting over a place
            with the given name tags and return the sorted
            (name, kind, suffix) triples of the remaining names.
        """
        tags = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': tags, 'country_code': 'de', 'rank_address': 30})

        step = {'step': 'delete-tags', 'suffix': suffix}
        sanitizer = PlaceSanitizer([step], self.config)
        name, _ = sanitizer.process_names(place)

        return sorted((p.name, p.kind, p.suffix or '') for p in name)


    def test_single_suffix(self):
        remaining = self.run_sanitizer_on('abc', name='foo', name_abc='foo',
                                          name_pqr='bar', ref='bar', ref_abc='baz')

        expected = [('bar', 'name', 'pqr'), ('bar', 'ref', ''), ('foo', 'name', '')]
        assert remaining == expected


    def test_multiple_suffix(self):
        remaining = self.run_sanitizer_on(['abc.*', 'pqr'], name='foo', name_abcxx='foo',
                                          ref_pqr='bar', name_pqrxx='baz')

        # 'pqrxx' is expected to stay: 'pqr' must match the whole suffix.
        assert remaining == [('baz', 'name', 'pqrxx'), ('foo', 'name', '')]
|
||||
|
||||
|
||||
|
||||
class TestCountryCodes:
    """ Checks for the 'country_code' parameter. The place under test
        always has the country code 'de'.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config


    def run_sanitizer_on(self, country_code, **kwargs):
        """ Run 'delete-tags' with the given `country_code` setting over a
            place with the given name tags and return the sorted
            (name, kind) pairs of the remaining names.
        """
        tags = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': tags, 'country_code': 'de', 'rank_address': 30})

        step = {'step': 'delete-tags', 'country_code': country_code}
        sanitizer = PlaceSanitizer([step], self.config)
        name, _ = sanitizer.process_names(place)

        return sorted((p.name, p.kind) for p in name)


    def test_single_country_code_pass(self):
        remaining = self.run_sanitizer_on('de', name='foo', ref='bar')

        assert remaining == []


    def test_single_country_code_fail(self):
        remaining = self.run_sanitizer_on('in', name='foo', ref='bar')

        assert remaining == [('bar', 'ref'), ('foo', 'name')]


    def test_empty_country_code_list(self):
        remaining = self.run_sanitizer_on([], name='foo', ref='bar')

        # An empty list is expected to match no country at all.
        assert remaining == [('bar', 'ref'), ('foo', 'name')]


    def test_multiple_country_code_pass(self):
        remaining = self.run_sanitizer_on(['in', 'de', 'fr'], name='foo', ref='bar')

        assert remaining == []


    def test_multiple_country_code_fail(self):
        remaining = self.run_sanitizer_on(['in', 'au', 'fr'], name='foo', ref='bar')

        assert remaining == [('bar', 'ref'), ('foo', 'name')]
|
||||
|
||||
class TestAllParameters:
    """ Checks for the sanitizer with every parameter configured at once.
        Type, 'filter-kind' and 'name' are fixed; country code, address
        rank and suffix vary per test.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config


    def run_sanitizer_on(self, country_code, rank_addr, suffix, **kwargs):
        """ Run a fully configured 'delete-tags' step over a place with the
            given name tags and return the sorted (name, kind, suffix)
            triples of the remaining names.
        """
        tags = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': tags, 'country_code': 'de', 'rank_address': 30})

        step = {
            'step': 'delete-tags',
            'type': 'name',
            'filter-kind': ['name', 'ref'],
            'country_code': country_code,
            'rank_address': rank_addr,
            'suffix': suffix,
            'name': r'[\s\S]*',
        }
        sanitizer = PlaceSanitizer([step], self.config)
        name, _ = sanitizer.process_names(place)

        return sorted((p.name, p.kind, p.suffix or '') for p in name)


    def test_string_arguments_pass(self):
        remaining = self.run_sanitizer_on('de', '25-30', r'[\s\S]*',
                                          name='foo', ref='foo', name_abc='bar', ref_abc='baz')

        assert remaining == []


    def test_string_arguments_fail(self):
        remaining = self.run_sanitizer_on('in', '25-30', r'[\s\S]*',
                                          name='foo', ref='foo', name_abc='bar', ref_abc='baz')

        expected = [('bar', 'name', 'abc'), ('baz', 'ref', 'abc'),
                    ('foo', 'name', ''), ('foo', 'ref', '')]
        assert remaining == expected


    def test_list_arguments_pass(self):
        remaining = self.run_sanitizer_on(['de', 'in'], ['20-28', '30'], [r'abc.*', r'[\s\S]*'],
                                          name='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')

        assert remaining == []


    def test_list_arguments_fail(self):
        remaining = self.run_sanitizer_on(['de', 'in'], ['14', '20-29'], [r'abc.*', r'pqr'],
                                          name='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')

        expected = [('bar', 'name', 'abcxx'), ('baz', 'ref', 'pqr'),
                    ('foo', 'name', ''), ('foo', 'ref', 'abc')]
        assert remaining == expected


    def test_mix_arguments_pass(self):
        remaining = self.run_sanitizer_on('de', ['10', '20-28', '30'], r'[\s\S]*',
                                          name='foo', ref_abc='foo', name_abcxx='bar', ref_pqr='baz')

        assert remaining == []


    def test_mix_arguments_fail(self):
        remaining = self.run_sanitizer_on(['de', 'in'], ['10', '20-28', '30'], r'abc.*',
                                          name='foo', ref='foo', name_pqr='bar', ref_pqr='baz')

        expected = [('bar', 'name', 'pqr'), ('baz', 'ref', 'pqr'),
                    ('foo', 'name', ''), ('foo', 'ref', '')]
        assert remaining == expected
|
||||
Reference in New Issue
Block a user