mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-15 19:07:58 +00:00
restrict postcode parsing in typed phrases
Postcodes can only appear in postcode-type phrases and must then cover the full phrase
This commit is contained in:
@@ -55,32 +55,49 @@ class PostcodeParser:
|
||||
[start node id, end node id, postcode token]
|
||||
"""
|
||||
nodes = query.nodes
|
||||
outcodes = set()
|
||||
outcodes: Set[Tuple[int, int, str]] = set()
|
||||
|
||||
for i in range(query.num_token_slots()):
|
||||
if nodes[i].btype in '<,: ' and nodes[i + 1].btype != '`':
|
||||
word = nodes[i + 1].term_normalized + nodes[i + 1].btype
|
||||
if word[-1] in ' -' and nodes[i + 2].btype != '`':
|
||||
word += nodes[i + 2].term_normalized + nodes[i + 2].btype
|
||||
if word[-1] in ' -' and nodes[i + 3].btype != '`':
|
||||
word += nodes[i + 3].term_normalized + nodes[i + 3].btype
|
||||
if nodes[i].btype in '<,: ' and nodes[i + 1].btype != '`' \
|
||||
and (i == 0 or nodes[i - 1].ptype != qmod.PHRASE_POSTCODE):
|
||||
if nodes[i].ptype == qmod.PHRASE_ANY:
|
||||
word = nodes[i + 1].term_normalized + nodes[i + 1].btype
|
||||
if word[-1] in ' -' and nodes[i + 2].btype != '`' \
|
||||
and nodes[i + 1].ptype == qmod.PHRASE_ANY:
|
||||
word += nodes[i + 2].term_normalized + nodes[i + 2].btype
|
||||
if word[-1] in ' -' and nodes[i + 3].btype != '`' \
|
||||
and nodes[i + 2].ptype == qmod.PHRASE_ANY:
|
||||
word += nodes[i + 3].term_normalized + nodes[i + 3].btype
|
||||
|
||||
self._match_word(word, i, False, outcodes)
|
||||
elif nodes[i].ptype == qmod.PHRASE_POSTCODE:
|
||||
word = nodes[i + 1].term_normalized + nodes[i + 1].btype
|
||||
for j in range(i + 1, query.num_token_slots()):
|
||||
if nodes[j].ptype != qmod.PHRASE_POSTCODE:
|
||||
break
|
||||
word += nodes[j + 1].term_normalized + nodes[j + 1].btype
|
||||
|
||||
self._match_word(word, i, True, outcodes)
|
||||
|
||||
# Use global pattern to check for presence of any postcode.
|
||||
m = self.global_pattern.fullmatch(word)
|
||||
if m:
|
||||
# If there was a match, check against each pattern separately
|
||||
# because multiple patterns might be matching at the end.
|
||||
cc = m.group('cc')
|
||||
pc_word = m.group('pc')
|
||||
cc_spaces = len(m.group('space') or '')
|
||||
for pattern, info in self.local_patterns:
|
||||
lm = pattern.match(pc_word)
|
||||
if lm:
|
||||
trange = (i, i + cc_spaces + sum(c in ' ,-:>' for c in lm.group(0)))
|
||||
for out, out_ccs in info:
|
||||
if cc is None or cc in out_ccs:
|
||||
if out:
|
||||
outcodes.add((*trange, lm.expand(out).upper()))
|
||||
else:
|
||||
outcodes.add((*trange, lm.group(0)[:-1].upper()))
|
||||
return outcodes
|
||||
|
||||
def _match_word(self, word: str, pos: int, fullmatch: bool,
                outcodes: Set[Tuple[int, int, str]]) -> None:
    """ Look for postcodes in `word` and record each hit in `outcodes`.

        `word` must be matched in full by `self.global_pattern`. The 'pc'
        group of that match is then tried against every pattern in
        `self.local_patterns`: it has to match completely when `fullmatch`
        is set, otherwise a match at the beginning is sufficient.

        Every hit is added to `outcodes` as a tuple of
        (start position, end position, upper-cased postcode). The start is
        always `pos`. The end is `pos` plus the length of the 'space' group
        of the global match plus the number of break characters (' ,-:>')
        contained in the locally matched text.

        When the global match has a 'cc' group (presumably a country-code
        prefix), only outputs listing that value are used.
    """
    # Use global pattern to check for presence of any postcode.
    overall = self.global_pattern.fullmatch(word)
    if not overall:
        return

    prefix = overall.group('cc')
    candidate = overall.group('pc')
    prefix_breaks = len(overall.group('space') or '')

    # Check against each pattern separately because multiple patterns
    # might be matching at the end.
    for pattern, info in self.local_patterns:
        matcher = pattern.fullmatch if fullmatch else pattern.match
        hit = matcher(candidate)
        if not hit:
            continue

        matched_text = hit.group(0)
        end = pos + prefix_breaks + sum(1 for char in matched_text if char in ' ,-:>')

        for out, out_ccs in info:
            if prefix is not None and prefix not in out_ccs:
                continue
            # Without an output template the matched text is used as is,
            # minus its final character.
            postcode = hit.expand(out) if out else matched_text[:-1]
            outcodes.add((pos, end, postcode.upper()))
|
||||
|
||||
@@ -14,7 +14,7 @@ from itertools import zip_longest
|
||||
import pytest
|
||||
|
||||
from nominatim_api.search.postcode_parser import PostcodeParser
|
||||
from nominatim_api.search.query import QueryStruct, PHRASE_ANY
|
||||
from nominatim_api.search.query import QueryStruct, PHRASE_ANY, PHRASE_POSTCODE, PHRASE_STREET
|
||||
|
||||
@pytest.fixture
|
||||
def pc_config(project_env):
|
||||
@@ -131,3 +131,24 @@ def test_postcode_with_non_matching_country_prefix(pc_config):
|
||||
|
||||
assert not parser.parse(mk_query('ky12233'))
|
||||
|
||||
def test_postcode_inside_postcode_phrase(pc_config):
    """ Build a query with the terms '12345', 'xz' and '4444' over nodes
        typed as street and postcode phrases and expect the parser to
        report only '4444', spanning nodes 2 to 3.
    """
    query = QueryStruct([])
    query.nodes[-1].ptype = PHRASE_STREET
    for btype, ptype, term in ((',', PHRASE_STREET, '12345'),
                               (',', PHRASE_POSTCODE, 'xz'),
                               ('>', PHRASE_POSTCODE, '4444')):
        query.add_node(btype, ptype, 0.1, term, term)

    found = PostcodeParser(pc_config).parse(query)

    assert found == {(2, 3, '4444')}
|
||||
|
||||
|
||||
def test_partial_postcode_in_postcode_phrase(pc_config):
    """ Build a query whose nodes are all typed as postcode phrase and
        which holds the two terms '2224' and '12345'; the parser is
        expected to return no postcode at all.
    """
    query = QueryStruct([])
    query.nodes[-1].ptype = PHRASE_POSTCODE
    for btype, term in ((' ', '2224'), ('>', '12345')):
        query.add_node(btype, PHRASE_POSTCODE, 0.1, term, term)

    found = PostcodeParser(pc_config).parse(query)

    assert not found
|
||||
|
||||
Reference in New Issue
Block a user