Merge pull request #2993 from biswajit-k/delete-tags

Adds a sanitizer that prevents certain tags from entering the search index, based on configurable parameters
This commit is contained in:
Sarah Hoffmann
2023-03-09 14:31:45 +01:00
committed by GitHub
3 changed files with 479 additions and 2 deletions

View File

@@ -102,7 +102,7 @@ Here is an example configuration file:
``` yaml
normalization:
- ":: lower ()"
- "ß > 'ss'" # German szet is unimbigiously equal to double ss
- "ß > 'ss'" # German szet is unambiguously equal to double ss
transliteration:
- !include /etc/nominatim/icu-rules/extended-unicode-to-asccii.yaml
- ":: Ascii ()"
@@ -128,7 +128,7 @@ The configuration file contains four sections:
The normalization and transliteration sections each define a set of
ICU rules that are applied to the names.
The **normalisation** rules are applied after sanitation. They should remove
The **normalization** rules are applied after sanitation. They should remove
any information that is not relevant for search at all. Usual rules to be
applied here are: lower-casing, removing of special characters, cleanup of
spaces.
@@ -221,7 +221,13 @@ The following is a list of sanitizers that are shipped with Nominatim.
rendering:
heading_level: 6
#### delete-tags
::: nominatim.tokenizer.sanitizers.delete_tags
selection:
members: False
rendering:
heading_level: 6
#### Token Analysis

View File

@@ -0,0 +1,144 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2023 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Sanitizer which prevents certain tags from getting into the search index.
It removes tags which match all of the properties given below.
Arguments:
type: Define which type of tags should be considered for removal.
There are two types of tags 'name' and 'address' tags.
Takes a string 'name' or 'address'. (default: 'name')
filter-kind: Define which 'kind' of tags should be removed.
Takes a string or list of strings where each
string is a regular expression. A tag is considered
to be a candidate for removal if its 'kind' property
fully matches any of the given regular expressions.
Note that by default all 'kind' of tags are considered.
suffix: Define the 'suffix' property of the tags which should be
removed. Takes a string or list of strings where each
string is a regular expression. A tag is considered to be a
candidate for removal if its 'suffix' property fully
matches any of the given regular expressions. Note that by
default tags with any suffix value are considered including
those which don't have a suffix at all.
name: Define the 'name' property corresponding to the 'kind' property
of the tag. Takes a string or list of strings where each string
is a regular expression. A tag is considered to be a candidate
for removal if its name fully matches any of the given regular
expressions. Note that by default tags with any 'name' are
considered.
country_code: Define the country code of places whose tags should be
considered for removal. Takes a string or list of strings
where each string is a two-letter lower-case country code.
Note that by default tags of places with any country code
are considered including those which don't have a country
code at all.
rank_address: Define the address rank of places whose tags should be
considered for removal. Takes a string or list of strings
where each string is a number or a range of numbers of the
form <from>-<to>.
Note that default is '0-30', which means that tags of all
places are considered.
See https://nominatim.org/release-docs/latest/customize/Ranking/#address-rank
to learn more about address rank.
"""
from typing import Callable, List, Optional, Pattern, Tuple, Sequence
import re
from nominatim.tokenizer.sanitizers.base import ProcessInfo
from nominatim.data.place_name import PlaceName
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
class _TagSanitizer:
    """ Sanitizer step that drops every tag which satisfies all of the
        configured criteria (kind, suffix, name, country code and
        address rank).
    """

    def __init__(self, config: SanitizerConfig) -> None:
        # Any value other than 'name' makes __call__ work on the address tags.
        self.type = config.get('type', 'name')
        self.filter_kind = config.get_filter_kind()
        self.country_codes = config.get_string_list('country_code', [])

        rank_specs = config.get_string_list('rank_address', ['0-30'])
        self.allowed_ranks = self._set_allowed_ranks(rank_specs)

        # The country restriction is only applied when the option has
        # been set explicitly in the configuration.
        self.has_country_code = config.get('country_code', None) is not None

        # Without explicit configuration every suffix and every name matches.
        self.suffix_regexp = [re.compile(expr) for expr
                              in config.get_string_list('suffix', [r'[\s\S]*'])]
        self.name_regexp = [re.compile(expr) for expr
                            in config.get_string_list('name', [r'[\s\S]*'])]

    def __call__(self, obj: ProcessInfo) -> None:
        """ Remove all matching tags from either the names or the address
            of the given place. The tag list is left untouched when it is
            empty or when the place is outside the configured countries
            or address ranks.
        """
        on_names = self.type == 'name'
        tags = obj.names if on_names else obj.address

        if not tags:
            return
        if self.has_country_code and obj.place.country_code not in self.country_codes:
            return
        if not self.allowed_ranks[obj.place.rank_address]:
            return

        # A tag is only dropped when kind, suffix and name all match.
        remaining: List[PlaceName] = [
            tag for tag in tags
            if not (self.filter_kind(tag.kind)
                    and self._matches(tag.suffix, self.suffix_regexp)
                    and self._matches(tag.name, self.name_regexp))
        ]

        if on_names:
            obj.names = remaining
        else:
            obj.address = remaining

    def _set_allowed_ranks(self, ranks: Sequence[str]) -> Tuple[bool, ...]:
        """ Convert the rank configuration into a tuple of 31 booleans,
            one for each address rank from 0 to 30. The entry at index 'i'
            is True when rank 'i' was listed directly or is covered by one
            of the '<from>-<to>' ranges, and False otherwise.
        """
        flags = [False] * 31

        for spec in ranks:
            bounds = [int(part) for part in spec.split('-')]
            first = bounds[0]
            # A single number is a range of exactly one rank.
            last = first if len(bounds) == 1 else bounds[1]

            for idx in range(first, last + 1):
                flags[idx] = True

        return tuple(flags)

    def _matches(self, value: Optional[str], patterns: List[Pattern[str]]) -> bool:
        """ Check if the value fully matches at least one of the given
            regular expression patterns. A value of None is treated like
            the empty string.
        """
        text = value if value is not None else ''

        for pattern in patterns:
            if pattern.fullmatch(text) is not None:
                return True

        return False
def create(config: SanitizerConfig) -> Callable[[ProcessInfo], None]:
    """ Build the handler that deletes tags according to the given
        sanitizer configuration.
    """
    handler = _TagSanitizer(config)
    return handler

View File

@@ -0,0 +1,327 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2023 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for the sanitizer that deletes tags matching the configured parameters.
"""
import pytest
from nominatim.data.place_info import PlaceInfo
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
class TestWithDefault:
    """ Tests for the sanitizer when no parameters besides the step
        name are configured.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, type, **kwargs):
        """ Send the keyword arguments as tags of the given type ('name'
            or 'address') through the sanitizer. Underscores in the
            keyword names are replaced with colons. Returns the sorted
            (name, kind, suffix) tuples of the resulting names and
            address parts.
        """
        tags = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({type: tags, 'country_code': 'de', 'rank_address': 30})

        sanitizer = PlaceSanitizer([{'step': 'delete-tags'}], self.config)
        name, address = sanitizer.process_names(place)

        return {
            'name': sorted((p.name, p.kind, p.suffix or '') for p in name),
            'address': sorted((p.name, p.kind, p.suffix or '') for p in address)
        }

    def test_on_name(self):
        result = self.run_sanitizer_on('name', name='foo', ref='bar', ref_abc='baz')

        assert result.get('name') == []

    def test_on_address(self):
        result = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz')

        expected = [('bar', 'ref', ''), ('baz', 'ref', 'abc'), ('foo', 'name', '')]
        assert result.get('address') == expected
class TestTypeField:
    """ Tests for the 'type' parameter, which selects whether the name
        tags or the address tags of a place are removed.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, type, **kwargs):
        """ Run the sanitizer with the given 'type' setting on a place
            that carries the keyword arguments both as name tags and as
            address tags. Underscores in the keyword names are replaced
            with colons.

            Returns a dictionary with the sorted (name, kind, suffix)
            tuples left in 'name' and in 'address', so that tests can
            check the selected list was emptied and the other one was
            left alone.
        """
        tags = {k.replace('_', ':'): v for k, v in kwargs.items()}
        # Both lists need content: with an empty tag list the sanitizer
        # returns early and the test would pass without checking anything.
        place = PlaceInfo({'name': dict(tags), 'address': dict(tags),
                           'country_code': 'de', 'rank_address': 30})

        sanitizer_args = {
            'step': 'delete-tags',
            'type': type,
        }

        name, address = PlaceSanitizer([sanitizer_args],
                                       self.config).process_names(place)

        return {
            'name': sorted([(p.name, p.kind, p.suffix or '') for p in name]),
            'address': sorted([(p.name, p.kind, p.suffix or '') for p in address])
        }

    def test_name_type(self):
        res = self.run_sanitizer_on('name', name='foo', ref='bar', ref_abc='baz')

        assert res['name'] == []
        assert res['address'] == [('bar', 'ref', ''), ('baz', 'ref', 'abc'),
                                  ('foo', 'name', '')]

    def test_address_type(self):
        res = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz')

        assert res['name'] == [('bar', 'ref', ''), ('baz', 'ref', 'abc'),
                               ('foo', 'name', '')]
        assert res['address'] == []
class TestFilterKind:
    """ Tests for restricting the removal with the 'filter-kind' parameter.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, filt, **kwargs):
        """ Run the sanitizer with the given 'filter-kind' setting over
            the keyword arguments as name tags (underscores become
            colons) and return the sorted (name, kind, suffix) tuples of
            the names that are left.
        """
        names = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': names, 'country_code': 'de', 'rank_address': 30})

        steps = [{'step': 'delete-tags', 'filter-kind': filt}]
        remaining, _ = PlaceSanitizer(steps, self.config).process_names(place)

        return sorted((p.name, p.kind, p.suffix or '') for p in remaining)

    def test_single_exact_name(self):
        result = self.run_sanitizer_on(['name'], ref='foo', name='foo',
                                       name_abc='bar', ref_abc='bar')

        assert result == [('bar', 'ref', 'abc'), ('foo', 'ref', '')]

    def test_single_pattern(self):
        result = self.run_sanitizer_on(['.*name'],
                                       name_fr='foo', ref_fr='foo', namexx_fr='bar',
                                       shortname_fr='bar', name='bar')

        assert result == [('bar', 'namexx', 'fr'), ('foo', 'ref', 'fr')]

    def test_multiple_patterns(self):
        result = self.run_sanitizer_on(['.*name', 'ref'],
                                       name_fr='foo', ref_fr='foo', oldref_fr='foo',
                                       namexx_fr='bar', shortname_fr='baz', name='baz')

        assert result == [('bar', 'namexx', 'fr'), ('foo', 'oldref', 'fr')]
class TestRankAddress:
    """ Tests for restricting the removal with the 'rank_address'
        parameter. The test place always has address rank 30.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, rank_addr, **kwargs):
        """ Run the sanitizer with the given 'rank_address' setting over
            the keyword arguments as name tags (underscores become
            colons) and return the sorted (name, kind, suffix) tuples of
            the names that are left.
        """
        names = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': names, 'country_code': 'de', 'rank_address': 30})

        steps = [{'step': 'delete-tags', 'rank_address': rank_addr}]
        remaining, _ = PlaceSanitizer(steps, self.config).process_names(place)

        return sorted((p.name, p.kind, p.suffix or '') for p in remaining)

    def test_single_rank(self):
        result = self.run_sanitizer_on('30', name='foo', ref='bar')

        assert result == []

    def test_single_rank_fail(self):
        result = self.run_sanitizer_on('28', name='foo', ref='bar')

        assert result == [('bar', 'ref', ''), ('foo', 'name', '')]

    def test_ranged_rank_pass(self):
        result = self.run_sanitizer_on('26-30', name='foo', ref='bar')

        assert result == []

    def test_ranged_rank_fail(self):
        result = self.run_sanitizer_on('26-29', name='foo', ref='bar')

        assert result == [('bar', 'ref', ''), ('foo', 'name', '')]

    def test_mixed_rank_pass(self):
        result = self.run_sanitizer_on(['4', '20-28', '30', '10-12'],
                                       name='foo', ref='bar')

        assert result == []

    def test_mixed_rank_fail(self):
        result = self.run_sanitizer_on(['4-8', '10', '26-29', '18'],
                                       name='foo', ref='bar')

        assert result == [('bar', 'ref', ''), ('foo', 'name', '')]
class TestSuffix:
    """ Tests for restricting the removal with the 'suffix' parameter.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, suffix, **kwargs):
        """ Run the sanitizer with the given 'suffix' setting over the
            keyword arguments as name tags (underscores become colons)
            and return the sorted (name, kind, suffix) tuples of the
            names that are left.
        """
        names = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': names, 'country_code': 'de', 'rank_address': 30})

        steps = [{'step': 'delete-tags', 'suffix': suffix}]
        remaining, _ = PlaceSanitizer(steps, self.config).process_names(place)

        return sorted((p.name, p.kind, p.suffix or '') for p in remaining)

    def test_single_suffix(self):
        result = self.run_sanitizer_on('abc', name='foo', name_abc='foo',
                                       name_pqr='bar', ref='bar', ref_abc='baz')

        assert result == [('bar', 'name', 'pqr'), ('bar', 'ref', ''), ('foo', 'name', '')]

    def test_multiple_suffix(self):
        result = self.run_sanitizer_on(['abc.*', 'pqr'], name='foo', name_abcxx='foo',
                                       ref_pqr='bar', name_pqrxx='baz')

        assert result == [('baz', 'name', 'pqrxx'), ('foo', 'name', '')]
class TestCountryCodes:
    """ Tests for restricting the removal with the 'country_code'
        parameter. The test place always has the country code 'de'.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, country_code, **kwargs):
        """ Run the sanitizer with the given 'country_code' setting over
            the keyword arguments as name tags (underscores become
            colons) and return the sorted (name, kind) tuples of the
            names that are left.
        """
        names = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': names, 'country_code': 'de', 'rank_address': 30})

        steps = [{'step': 'delete-tags', 'country_code': country_code}]
        remaining, _ = PlaceSanitizer(steps, self.config).process_names(place)

        return sorted((p.name, p.kind) for p in remaining)

    def test_single_country_code_pass(self):
        result = self.run_sanitizer_on('de', name='foo', ref='bar')

        assert result == []

    def test_single_country_code_fail(self):
        result = self.run_sanitizer_on('in', name='foo', ref='bar')

        assert result == [('bar', 'ref'), ('foo', 'name')]

    def test_empty_country_code_list(self):
        result = self.run_sanitizer_on([], name='foo', ref='bar')

        assert result == [('bar', 'ref'), ('foo', 'name')]

    def test_multiple_country_code_pass(self):
        result = self.run_sanitizer_on(['in', 'de', 'fr'], name='foo', ref='bar')

        assert result == []

    def test_multiple_country_code_fail(self):
        result = self.run_sanitizer_on(['in', 'au', 'fr'], name='foo', ref='bar')

        assert result == [('bar', 'ref'), ('foo', 'name')]
class TestAllParameters:
    """ Tests with all parameters of the sanitizer set at once. The
        country code, rank and suffix settings vary per test.
    """

    @pytest.fixture(autouse=True)
    def setup_country(self, def_config):
        self.config = def_config

    def run_sanitizer_on(self, country_code, rank_addr, suffix, **kwargs):
        """ Run the fully configured sanitizer over the keyword arguments
            as name tags (underscores become colons) and return the
            sorted (name, kind, suffix) tuples of the names that are left.
        """
        names = {key.replace('_', ':'): value for key, value in kwargs.items()}
        place = PlaceInfo({'name': names, 'country_code': 'de', 'rank_address': 30})

        step = {
            'step': 'delete-tags',
            'type': 'name',
            'filter-kind': ['name', 'ref'],
            'country_code': country_code,
            'rank_address': rank_addr,
            'suffix': suffix,
            'name': r'[\s\S]*',
        }
        remaining, _ = PlaceSanitizer([step], self.config).process_names(place)

        return sorted((p.name, p.kind, p.suffix or '') for p in remaining)

    def test_string_arguments_pass(self):
        result = self.run_sanitizer_on('de', '25-30', r'[\s\S]*',
                                       name='foo', ref='foo',
                                       name_abc='bar', ref_abc='baz')

        assert result == []

    def test_string_arguments_fail(self):
        result = self.run_sanitizer_on('in', '25-30', r'[\s\S]*',
                                       name='foo', ref='foo',
                                       name_abc='bar', ref_abc='baz')

        assert result == [('bar', 'name', 'abc'), ('baz', 'ref', 'abc'),
                          ('foo', 'name', ''), ('foo', 'ref', '')]

    def test_list_arguments_pass(self):
        result = self.run_sanitizer_on(['de', 'in'], ['20-28', '30'],
                                       [r'abc.*', r'[\s\S]*'],
                                       name='foo', ref_abc='foo',
                                       name_abcxx='bar', ref_pqr='baz')

        assert result == []

    def test_list_arguments_fail(self):
        result = self.run_sanitizer_on(['de', 'in'], ['14', '20-29'],
                                       [r'abc.*', r'pqr'],
                                       name='foo', ref_abc='foo',
                                       name_abcxx='bar', ref_pqr='baz')

        assert result == [('bar', 'name', 'abcxx'), ('baz', 'ref', 'pqr'),
                          ('foo', 'name', ''), ('foo', 'ref', 'abc')]

    def test_mix_arguments_pass(self):
        result = self.run_sanitizer_on('de', ['10', '20-28', '30'], r'[\s\S]*',
                                       name='foo', ref_abc='foo',
                                       name_abcxx='bar', ref_pqr='baz')

        assert result == []

    def test_mix_arguments_fail(self):
        result = self.run_sanitizer_on(['de', 'in'], ['10', '20-28', '30'], r'abc.*',
                                       name='foo', ref='foo',
                                       name_pqr='bar', ref_pqr='baz')

        assert result == [('bar', 'name', 'pqr'), ('baz', 'ref', 'pqr'),
                          ('foo', 'name', ''), ('foo', 'ref', '')]