From eff60ba6becafc243b011d91761f4757462d9b30 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 25 Feb 2025 20:29:06 +0100 Subject: [PATCH 01/12] enable parsing of US ZIP+ codes The four-digit part of these postcodes will simply be ignored. --- settings/country_settings.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/settings/country_settings.yaml b/settings/country_settings.yaml index 667684c6..a2ca7412 100644 --- a/settings/country_settings.yaml +++ b/settings/country_settings.yaml @@ -1809,7 +1809,8 @@ us: languages: en names: !include country-names/us.yaml postcode: - pattern: "ddddd" + pattern: "(ddddd)(?:-dddd)?" + output: \1 # Uruguay (Uruguay) From e362a965e167dadd828a4a4b7fc58c6076e6586a Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 26 Feb 2025 14:37:08 +0100 Subject: [PATCH 02/12] search: merge QueryPart array with QueryNodes The basic information on terms is pretty much always used together with the node inforamtion. Merging them together saves some allocation while making lookup easier at the same time. --- src/nominatim_api/search/icu_tokenizer.py | 110 ++++++++---------- src/nominatim_api/search/query.py | 39 ++++++- .../api/search/test_api_search_query.py | 34 +++--- 3 files changed, 100 insertions(+), 83 deletions(-) diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index 1a449276..60e712d5 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -47,40 +47,27 @@ PENALTY_IN_TOKEN_BREAK = { } -@dataclasses.dataclass -class QueryPart: - """ Normalized and transliterated form of a single term in the query. - - When the term came out of a split during the transliteration, - the normalized string is the full word before transliteration. - Check the subsequent break type to figure out if the word is - continued. - - Penalty is the break penalty for the break following the token. - """ - token: str - normalized: str - penalty: float - - -QueryParts = List[QueryPart] WordDict = Dict[str, List[qmod.TokenRange]] -def extract_words(terms: List[QueryPart], start: int, words: WordDict) -> None: - """ Add all combinations of words in the terms list after the - given position to the word list. +def extract_words(query: qmod.QueryStruct, start: int, words: WordDict) -> None: + """ Add all combinations of words in the terms list starting with + the term leading into node 'start'. + + The words found will be added into the 'words' dictionary with + their start and end position. 
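+
+        An illustrative example (not part of the original patch): with
+        'start' set to 1 and nodes 1 and 2 carrying the terms 'hansa'
+        and 'str', the dictionary receives 'hansa' -> [(0, 1)],
+        'hansa str' -> [(0, 2)] and 'str' -> [(1, 2)] as token ranges
+        (penalties omitted).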
""" - total = len(terms) + nodes = query.nodes + total = len(nodes) base_penalty = PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD] for first in range(start, total): - word = terms[first].token + word = nodes[first].term_lookup penalty = base_penalty - words[word].append(qmod.TokenRange(first, first + 1, penalty=penalty)) + words[word].append(qmod.TokenRange(first - 1, first, penalty=penalty)) for last in range(first + 1, min(first + 20, total)): - word = ' '.join((word, terms[last].token)) - penalty += terms[last - 1].penalty - words[word].append(qmod.TokenRange(first, last + 1, penalty=penalty)) + word = ' '.join((word, nodes[last].term_lookup)) + penalty += nodes[last - 1].penalty + words[word].append(qmod.TokenRange(first - 1, last, penalty=penalty)) @dataclasses.dataclass @@ -216,8 +203,8 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if not query.source: return query - parts, words = self.split_query(query) - log().var_dump('Transliterated query', lambda: _dump_transliterated(query, parts)) + words = self.split_query(query) + log().var_dump('Transliterated query', lambda: query.get_transliterated_query()) for row in await self.lookup_in_db(list(words.keys())): for trange in words[row.word_token]: @@ -234,8 +221,8 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): else: query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token) - self.add_extra_tokens(query, parts) - self.rerank_tokens(query, parts) + self.add_extra_tokens(query) + self.rerank_tokens(query) log().table_dump('Word tokens', _dump_word_tokens(query)) @@ -248,15 +235,13 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): """ return cast(str, self.normalizer.transliterate(text)).strip('-: ') - def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: + def split_query(self, query: qmod.QueryStruct) -> WordDict: """ Transliterate the phrases and split them into tokens. - Returns the list of transliterated tokens together with their - normalized form and a dictionary of words for lookup together + Returns a dictionary of words for lookup together with their position. 
""" - parts: QueryParts = [] - phrase_start = 0 + phrase_start = 1 words: WordDict = defaultdict(list) for phrase in query.source: query.nodes[-1].ptype = phrase.ptype @@ -272,18 +257,18 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if trans: for term in trans.split(' '): if term: - parts.append(QueryPart(term, word, - PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN])) - query.add_node(qmod.BREAK_TOKEN, phrase.ptype) - query.nodes[-1].btype = breakchar - parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[breakchar] + query.add_node(qmod.BREAK_TOKEN, phrase.ptype, + PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN], + term, word) + query.nodes[-1].adjust_break(breakchar, + PENALTY_IN_TOKEN_BREAK[breakchar]) - extract_words(parts, phrase_start, words) + extract_words(query, phrase_start, words) - phrase_start = len(parts) - query.nodes[-1].btype = qmod.BREAK_END + phrase_start = len(query.nodes) + query.nodes[-1].adjust_break(qmod.BREAK_END, PENALTY_IN_TOKEN_BREAK[qmod.BREAK_END]) - return parts, words + return words async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]': """ Return the token information from the database for the @@ -292,18 +277,23 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): t = self.conn.t.meta.tables['word'] return await self.conn.execute(t.select().where(t.c.word_token.in_(words))) - def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None: + def add_extra_tokens(self, query: qmod.QueryStruct) -> None: """ Add tokens to query that are not saved in the database. """ - for part, node, i in zip(parts, query.nodes, range(1000)): - if len(part.token) <= 4 and part.token.isdigit()\ - and not node.has_tokens(i+1, qmod.TOKEN_HOUSENUMBER): - query.add_token(qmod.TokenRange(i, i+1), qmod.TOKEN_HOUSENUMBER, + need_hnr = False + for i, node in enumerate(query.nodes): + is_full_token = node.btype not in (qmod.BREAK_TOKEN, qmod.BREAK_PART) + if need_hnr and is_full_token \ + and len(node.term_normalized) <= 4 and node.term_normalized.isdigit(): + query.add_token(qmod.TokenRange(i-1, i), qmod.TOKEN_HOUSENUMBER, ICUToken(penalty=0.5, token=0, - count=1, addr_count=1, lookup_word=part.token, - word_token=part.token, info=None)) + count=1, addr_count=1, + lookup_word=node.term_lookup, + word_token=node.term_lookup, info=None)) - def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None: + need_hnr = is_full_token and not node.has_tokens(i+1, qmod.TOKEN_HOUSENUMBER) + + def rerank_tokens(self, query: qmod.QueryStruct) -> None: """ Add penalties to tokens that depend on presence of other token. 
""" for i, node, tlist in query.iter_token_lists(): @@ -320,21 +310,15 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if repl.end == tlist.end and repl.ttype != qmod.TOKEN_HOUSENUMBER: repl.add_penalty(0.5 - tlist.tokens[0].penalty) elif tlist.ttype not in (qmod.TOKEN_COUNTRY, qmod.TOKEN_PARTIAL): - norm = parts[i].normalized - for j in range(i + 1, tlist.end): - if node.btype != qmod.BREAK_TOKEN: - norm += ' ' + parts[j].normalized + norm = ' '.join(n.term_normalized for n in query.nodes[i + 1:tlist.end + 1] + if n.btype != qmod.BREAK_TOKEN) + if not norm: + # Can happen when the token only covers a partial term + norm = query.nodes[i + 1].term_normalized for token in tlist.tokens: cast(ICUToken, token).rematch(norm) -def _dump_transliterated(query: qmod.QueryStruct, parts: QueryParts) -> str: - out = query.nodes[0].btype - for node, part in zip(query.nodes[1:], parts): - out += part.token + node.btype - return out - - def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]: yield ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info'] for node in query.nodes: diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py index 8530c4f2..fcd6763b 100644 --- a/src/nominatim_api/search/query.py +++ b/src/nominatim_api/search/query.py @@ -171,11 +171,33 @@ class TokenList: @dataclasses.dataclass class QueryNode: """ A node of the query representing a break between terms. + + The node also contains information on the source term + ending at the node. The tokens are created from this information. """ btype: BreakType ptype: PhraseType + + penalty: float + """ Penalty for the break at this node. + """ + term_lookup: str + """ Transliterated term following this node. + """ + term_normalized: str + """ Normalised form of term following this node. + When the token resulted from a split during transliteration, + then this string contains the complete source term. + """ + starting: List[TokenList] = dataclasses.field(default_factory=list) + def adjust_break(self, btype: BreakType, penalty: float) -> None: + """ Change the break type and penalty for this node. + """ + self.btype = btype + self.penalty = penalty + def has_tokens(self, end: int, *ttypes: TokenType) -> bool: """ Check if there are tokens of the given types ending at the given node. @@ -218,19 +240,22 @@ class QueryStruct: def __init__(self, source: List[Phrase]) -> None: self.source = source self.nodes: List[QueryNode] = \ - [QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY)] + [QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY, + 0.0, '', '')] def num_token_slots(self) -> int: """ Return the length of the query in vertice steps. """ return len(self.nodes) - 1 - def add_node(self, btype: BreakType, ptype: PhraseType) -> None: + def add_node(self, btype: BreakType, ptype: PhraseType, + break_penalty: float = 0.0, + term_lookup: str = '', term_normalized: str = '') -> None: """ Append a new break node with the given break type. The phrase type denotes the type for any tokens starting at the node. """ - self.nodes.append(QueryNode(btype, ptype)) + self.nodes.append(QueryNode(btype, ptype, break_penalty, term_lookup, term_normalized)) def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None: """ Add a token to the query. 
'start' and 'end' are the indexes of the @@ -287,3 +312,11 @@ class QueryStruct: if t.token == token: return f"[{tlist.ttype}]{t.lookup_word}" return 'None' + + def get_transliterated_query(self) -> str: + """ Return a string representation of the transliterated query + with the character representation of the different break types. + + For debugging purposes only. + """ + return ''.join(''.join((n.term_lookup, n.btype)) for n in self.nodes) diff --git a/test/python/api/search/test_api_search_query.py b/test/python/api/search/test_api_search_query.py index 412a5bf2..08a1f7aa 100644 --- a/test/python/api/search/test_api_search_query.py +++ b/test/python/api/search/test_api_search_query.py @@ -21,6 +21,9 @@ def mktoken(tid: int): return MyToken(penalty=3.0, token=tid, count=1, addr_count=1, lookup_word='foo') +@pytest.fixture +def qnode(): + return query.QueryNode(query.BREAK_PHRASE, query.PHRASE_ANY, 0.0 ,'', '') @pytest.mark.parametrize('ptype,ttype', [(query.PHRASE_ANY, 'W'), (query.PHRASE_AMENITY, 'Q'), @@ -37,27 +40,24 @@ def test_phrase_incompatible(ptype): assert not query._phrase_compatible_with(ptype, query.TOKEN_PARTIAL, True) -def test_query_node_empty(): - qn = query.QueryNode(query.BREAK_PHRASE, query.PHRASE_ANY) - - assert not qn.has_tokens(3, query.TOKEN_PARTIAL) - assert qn.get_tokens(3, query.TOKEN_WORD) is None +def test_query_node_empty(qnode): + assert not qnode.has_tokens(3, query.TOKEN_PARTIAL) + assert qnode.get_tokens(3, query.TOKEN_WORD) is None -def test_query_node_with_content(): - qn = query.QueryNode(query.BREAK_PHRASE, query.PHRASE_ANY) - qn.starting.append(query.TokenList(2, query.TOKEN_PARTIAL, [mktoken(100), mktoken(101)])) - qn.starting.append(query.TokenList(2, query.TOKEN_WORD, [mktoken(1000)])) +def test_query_node_with_content(qnode): + qnode.starting.append(query.TokenList(2, query.TOKEN_PARTIAL, [mktoken(100), mktoken(101)])) + qnode.starting.append(query.TokenList(2, query.TOKEN_WORD, [mktoken(1000)])) - assert not qn.has_tokens(3, query.TOKEN_PARTIAL) - assert not qn.has_tokens(2, query.TOKEN_COUNTRY) - assert qn.has_tokens(2, query.TOKEN_PARTIAL) - assert qn.has_tokens(2, query.TOKEN_WORD) + assert not qnode.has_tokens(3, query.TOKEN_PARTIAL) + assert not qnode.has_tokens(2, query.TOKEN_COUNTRY) + assert qnode.has_tokens(2, query.TOKEN_PARTIAL) + assert qnode.has_tokens(2, query.TOKEN_WORD) - assert qn.get_tokens(3, query.TOKEN_PARTIAL) is None - assert qn.get_tokens(2, query.TOKEN_COUNTRY) is None - assert len(qn.get_tokens(2, query.TOKEN_PARTIAL)) == 2 - assert len(qn.get_tokens(2, query.TOKEN_WORD)) == 1 + assert qnode.get_tokens(3, query.TOKEN_PARTIAL) is None + assert qnode.get_tokens(2, query.TOKEN_COUNTRY) is None + assert len(qnode.get_tokens(2, query.TOKEN_PARTIAL)) == 2 + assert len(qnode.get_tokens(2, query.TOKEN_WORD)) == 1 def test_query_struct_empty(): From 6759edfb5d4cf68856d04b0208d60b48448068b1 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 26 Feb 2025 17:22:14 +0100 Subject: [PATCH 03/12] make word generation from query a class method --- src/nominatim_api/search/icu_tokenizer.py | 39 ++--------------------- src/nominatim_api/search/query.py | 34 +++++++++++++++++++- test/python/api/search/test_query.py | 17 ++++++++++ 3 files changed, 53 insertions(+), 37 deletions(-) diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index 60e712d5..e6bba95c 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -8,7 +8,6 @@ Implementation 
of query analysis for the ICU tokenizer. """ from typing import Tuple, Dict, List, Optional, Iterator, Any, cast -from collections import defaultdict import dataclasses import difflib import re @@ -47,29 +46,6 @@ PENALTY_IN_TOKEN_BREAK = { } -WordDict = Dict[str, List[qmod.TokenRange]] - - -def extract_words(query: qmod.QueryStruct, start: int, words: WordDict) -> None: - """ Add all combinations of words in the terms list starting with - the term leading into node 'start'. - - The words found will be added into the 'words' dictionary with - their start and end position. - """ - nodes = query.nodes - total = len(nodes) - base_penalty = PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD] - for first in range(start, total): - word = nodes[first].term_lookup - penalty = base_penalty - words[word].append(qmod.TokenRange(first - 1, first, penalty=penalty)) - for last in range(first + 1, min(first + 20, total)): - word = ' '.join((word, nodes[last].term_lookup)) - penalty += nodes[last - 1].penalty - words[word].append(qmod.TokenRange(first - 1, last, penalty=penalty)) - - @dataclasses.dataclass class ICUToken(qmod.Token): """ Specialised token for ICU tokenizer. @@ -203,8 +179,9 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): if not query.source: return query - words = self.split_query(query) + self.split_query(query) log().var_dump('Transliterated query', lambda: query.get_transliterated_query()) + words = query.extract_words(base_penalty=PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD]) for row in await self.lookup_in_db(list(words.keys())): for trange in words[row.word_token]: @@ -235,14 +212,9 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): """ return cast(str, self.normalizer.transliterate(text)).strip('-: ') - def split_query(self, query: qmod.QueryStruct) -> WordDict: + def split_query(self, query: qmod.QueryStruct) -> None: """ Transliterate the phrases and split them into tokens. - - Returns a dictionary of words for lookup together - with their position. """ - phrase_start = 1 - words: WordDict = defaultdict(list) for phrase in query.source: query.nodes[-1].ptype = phrase.ptype phrase_split = re.split('([ :-])', phrase.text) @@ -263,13 +235,8 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): query.nodes[-1].adjust_break(breakchar, PENALTY_IN_TOKEN_BREAK[breakchar]) - extract_words(query, phrase_start, words) - - phrase_start = len(query.nodes) query.nodes[-1].adjust_break(qmod.BREAK_END, PENALTY_IN_TOKEN_BREAK[qmod.BREAK_END]) - return words - async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]': """ Return the token information from the database for the given word tokens. diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py index fcd6763b..07bb685b 100644 --- a/src/nominatim_api/search/query.py +++ b/src/nominatim_api/search/query.py @@ -7,8 +7,9 @@ """ Datastructures for a tokenized query. """ -from typing import List, Tuple, Optional, Iterator +from typing import Dict, List, Tuple, Optional, Iterator from abc import ABC, abstractmethod +from collections import defaultdict import dataclasses @@ -320,3 +321,34 @@ class QueryStruct: For debugging purposes only. """ return ''.join(''.join((n.term_lookup, n.btype)) for n in self.nodes) + + def extract_words(self, base_penalty: float = 0.0, + start: int = 0, + endpos: Optional[int] = None) -> Dict[str, List[TokenRange]]: + """ Add all combinations of words that can be formed from the terms + between the given start and endnode. The terms are joined with + spaces for each break. 
Words can never go across a BREAK_PHRASE. + + The functions returns a dictionary of possible words with their + position within the query and a penalty. The penalty is computed + from the base_penalty plus the penalty for each node the word + crosses. + """ + if endpos is None: + endpos = len(self.nodes) + + words: Dict[str, List[TokenRange]] = defaultdict(list) + + for first in range(start, endpos - 1): + word = self.nodes[first + 1].term_lookup + penalty = base_penalty + words[word].append(TokenRange(first, first + 1, penalty=penalty)) + if self.nodes[first + 1].btype != BREAK_PHRASE: + for last in range(first + 2, min(first + 20, endpos)): + word = ' '.join((word, self.nodes[last].term_lookup)) + penalty += self.nodes[last - 1].penalty + words[word].append(TokenRange(first, last, penalty=penalty)) + if self.nodes[last].btype == BREAK_PHRASE: + break + + return words diff --git a/test/python/api/search/test_query.py b/test/python/api/search/test_query.py index c39094f0..bfed38df 100644 --- a/test/python/api/search/test_query.py +++ b/test/python/api/search/test_query.py @@ -46,3 +46,20 @@ def test_token_range_unimplemented_ops(): nq.TokenRange(1, 3) <= nq.TokenRange(10, 12) with pytest.raises(TypeError): nq.TokenRange(1, 3) >= nq.TokenRange(10, 12) + + +def test_query_extract_words(): + q = nq.QueryStruct([]) + q.add_node(nq.BREAK_WORD, nq.PHRASE_ANY, 0.1, '12', '') + q.add_node(nq.BREAK_TOKEN, nq.PHRASE_ANY, 0.0, 'ab', '') + q.add_node(nq.BREAK_PHRASE, nq.PHRASE_ANY, 0.0, '12', '') + q.add_node(nq.BREAK_END, nq.PHRASE_ANY, 0.5, 'hallo', '') + + words = q.extract_words(base_penalty=1.0) + + assert set(words.keys()) \ + == {'12', 'ab', 'hallo', '12 ab', 'ab 12', '12 ab 12'} + assert sorted(words['12']) == [nq.TokenRange(0, 1, 1.0), nq.TokenRange(2, 3, 1.0)] + assert words['12 ab'] == [nq.TokenRange(0, 2, 1.1)] + assert words['hallo'] == [nq.TokenRange(3, 4, 1.0)] + From fc1c6261ed24b66a61c038f1def268f5aa07fecc Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 25 Feb 2025 20:56:07 +0100 Subject: [PATCH 04/12] add postcode parser --- src/nominatim_api/search/icu_tokenizer.py | 13 +++- src/nominatim_api/search/postcode_parser.py | 81 +++++++++++++++++++++ 2 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 src/nominatim_api/search/postcode_parser.py diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index e6bba95c..01513103 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -25,6 +25,7 @@ from ..logging import log from . import query as qmod from ..query_preprocessing.config import QueryConfig from .query_analyzer_factory import AbstractQueryAnalyzer +from .postcode_parser import PostcodeParser DB_TO_TOKEN_TYPE = { @@ -117,6 +118,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): """ def __init__(self, conn: SearchConnection) -> None: self.conn = conn + self.postcode_parser = PostcodeParser(conn.config) async def setup(self) -> None: """ Set up static data structures needed for the analysis. 
@@ -199,6 +201,11 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token) self.add_extra_tokens(query) + for start, end, pc in self.postcode_parser.parse(query): + query.add_token(qmod.TokenRange(start, end), + qmod.TOKEN_POSTCODE, + ICUToken(penalty=0.1, token=0, count=1, addr_count=1, + lookup_word=pc, word_token=pc, info=None)) self.rerank_tokens(query) log().table_dump('Word tokens', _dump_word_tokens(query)) @@ -240,9 +247,13 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]': """ Return the token information from the database for the given word tokens. + + This function excludes postcode tokens """ t = self.conn.t.meta.tables['word'] - return await self.conn.execute(t.select().where(t.c.word_token.in_(words))) + return await self.conn.execute(t.select() + .where(t.c.word_token.in_(words)) + .where(t.c.type != 'P')) def add_extra_tokens(self, query: qmod.QueryStruct) -> None: """ Add tokens to query that are not saved in the database. diff --git a/src/nominatim_api/search/postcode_parser.py b/src/nominatim_api/search/postcode_parser.py new file mode 100644 index 00000000..93ed87c4 --- /dev/null +++ b/src/nominatim_api/search/postcode_parser.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2025 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Handling of arbitrary postcode tokens in tokenized query string. +""" +from typing import Tuple, Set +import re +from collections import defaultdict + +import yaml + +from ..config import Configuration +from . import query as qmod + + +class PostcodeParser: + """ Pattern-based parser for postcodes in tokenized queries. + + The postcode patterns are read from the country configuration. + The parser does currently not return country restrictions. + """ + + def __init__(self, config: Configuration) -> None: + # skip over includes here to avoid loading the complete country name data + yaml.add_constructor('!include', lambda loader, node: [], + Loader=yaml.SafeLoader) + cdata = yaml.safe_load(config.find_config_file('country_settings.yaml') + .read_text(encoding='utf-8')) + + unique_patterns = defaultdict(set) + for cc, data in cdata.items(): + if data.get('postcode'): + pat = data['postcode']['pattern'] + out = data['postcode'].get('output') + unique_patterns[pat.replace('d', '[0-9]').replace('l', '[a-z]')].add(out) + + self.global_pattern = re.compile( + '(?:' + + '|'.join(f"(?:{k})" for k in unique_patterns) + + ')[:, >]') + + self.local_patterns = [(re.compile(f"(?:{k})[:, >]"), v) + for k, v in unique_patterns.items()] + + def parse(self, query: qmod.QueryStruct) -> Set[Tuple[int, int, str]]: + """ Parse postcodes in the given list of query tokens taking into + account the list of breaks from the nodes. + + The result is a sequence of tuples with + [start node id, end node id, postcode token] + """ + nodes = query.nodes + outcodes = set() + + for i in range(query.num_token_slots()): + if nodes[i].btype in '<,: ' and nodes[i + 1].btype != '`': + word = nodes[i + 1].term_normalized + nodes[i + 1].btype + if word[-1] in ' -' and nodes[i + 2].btype != '`': + word += nodes[i + 2].term_normalized + nodes[i + 2].btype + if word[-1] in ' -' and nodes[i + 3].btype != '`': + word += nodes[i + 3].term_normalized + nodes[i + 3].btype + + # Use global pattern to check for presence of any postocde. 
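+                # Illustrative example (not part of the original patch): with
+                # the US pattern "(ddddd)(?:-dddd)?" and output \1 from
+                # country_settings.yaml, a query '36067-1111, us' assembles
+                # the word '36067-1111,' here; the match below then yields
+                # the postcode '36067' spanning two query nodes.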
+ m = self.global_pattern.match(word) + if m: + # If there was a match, check against each pattern separately + # because multiple patterns might be machting at the end. + for pattern, info in self.local_patterns: + lm = pattern.match(word) + if lm: + trange = (i, i + sum(c in ' ,-:>' for c in lm.group(0))) + for out in info: + if out: + outcodes.add((*trange, lm.expand(out).upper())) + else: + outcodes.add((*trange, lm.group(0)[:-1].upper())) + return outcodes From e67ae701ac76129d24578cc70b8e25987a162a81 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 28 Feb 2025 17:19:37 +0100 Subject: [PATCH 05/12] show token begin and end in debug output --- src/nominatim_api/search/icu_tokenizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index 01513103..09827826 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -298,12 +298,12 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer): def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]: - yield ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info'] - for node in query.nodes: + yield ['type', 'from', 'to', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info'] + for i, node in enumerate(query.nodes): for tlist in node.starting: for token in tlist.tokens: t = cast(ICUToken, token) - yield [tlist.ttype, t.token, t.word_token or '', + yield [tlist.ttype, str(i), str(tlist.end), t.token, t.word_token or '', t.lookup_word or '', t.penalty, t.count, t.info] From b2af358f663b7dd34b67ae42e26c6b38df1851bb Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Fri, 28 Feb 2025 17:19:56 +0100 Subject: [PATCH 06/12] reenable ZIP+ test --- test/bdd/api/search/postcode.feature | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/bdd/api/search/postcode.feature b/test/bdd/api/search/postcode.feature index bb1b755b..fb722862 100644 --- a/test/bdd/api/search/postcode.feature +++ b/test/bdd/api/search/postcode.feature @@ -3,9 +3,8 @@ Feature: Searches with postcodes Various searches involving postcodes - @Fail Scenario: US 5+4 ZIP codes are shortened to 5 ZIP codes if not found - When sending json search query "36067 1111, us" with address + When sending json search query "36067-1111, us" with address Then result addresses contain | postcode | | 36067 | From a574b98e4a59ae2460cce406e083c5c25fa15caa Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sat, 1 Mar 2025 10:20:33 +0100 Subject: [PATCH 07/12] remove postcode computation for word table during import --- src/nominatim_db/tokenizer/icu_tokenizer.py | 99 ++----------------- .../api/search/test_icu_query_analyzer.py | 9 +- test/python/tokenizer/test_icu.py | 36 +------ 3 files changed, 16 insertions(+), 128 deletions(-) diff --git a/src/nominatim_db/tokenizer/icu_tokenizer.py b/src/nominatim_db/tokenizer/icu_tokenizer.py index 5595fcb2..3da1171f 100644 --- a/src/nominatim_db/tokenizer/icu_tokenizer.py +++ b/src/nominatim_db/tokenizer/icu_tokenizer.py @@ -381,76 +381,15 @@ class ICUNameAnalyzer(AbstractAnalyzer): return postcode.strip().upper() def update_postcodes_from_db(self) -> None: - """ Update postcode tokens in the word table from the location_postcode - table. + """ Postcode update. + + Removes all postcodes from the word table because they are not + needed. Postcodes are recognised by pattern. 
""" assert self.conn is not None - analyzer = self.token_analysis.analysis.get('@postcode') with self.conn.cursor() as cur: - # First get all postcode names currently in the word table. - cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'") - word_entries = set((entry[0] for entry in cur)) - - # Then compute the required postcode names from the postcode table. - needed_entries = set() - cur.execute("SELECT country_code, postcode FROM location_postcode") - for cc, postcode in cur: - info = PlaceInfo({'country_code': cc, - 'class': 'place', 'type': 'postcode', - 'address': {'postcode': postcode}}) - address = self.sanitizer.process_names(info)[1] - for place in address: - if place.kind == 'postcode': - if analyzer is None: - postcode_name = place.name.strip().upper() - variant_base = None - else: - postcode_name = analyzer.get_canonical_id(place) - variant_base = place.get_attr("variant") - - if variant_base: - needed_entries.add(f'{postcode_name}@{variant_base}') - else: - needed_entries.add(postcode_name) - break - - # Now update the word table. - self._delete_unused_postcode_words(word_entries - needed_entries) - self._add_missing_postcode_words(needed_entries - word_entries) - - def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None: - assert self.conn is not None - if tokens: - with self.conn.cursor() as cur: - cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)", - (list(tokens), )) - - def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None: - assert self.conn is not None - if not tokens: - return - - analyzer = self.token_analysis.analysis.get('@postcode') - terms = [] - - for postcode_name in tokens: - if '@' in postcode_name: - term, variant = postcode_name.split('@', 2) - term = self._search_normalized(term) - if analyzer is None: - variants = [term] - else: - variants = analyzer.compute_variants(variant) - if term not in variants: - variants.append(term) - else: - variants = [self._search_normalized(postcode_name)] - terms.append((postcode_name, variants)) - - if terms: - with self.conn.cursor() as cur: - cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms) + cur.execute("DELETE FROM word WHERE type = 'P'") def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]], should_replace: bool) -> None: @@ -718,32 +657,9 @@ class ICUNameAnalyzer(AbstractAnalyzer): analyzer = self.token_analysis.analysis.get('@postcode') if analyzer is None: - postcode_name = item.name.strip().upper() - variant_base = None + return item.name.strip().upper() else: - postcode_name = analyzer.get_canonical_id(item) - variant_base = item.get_attr("variant") - - if variant_base: - postcode = f'{postcode_name}@{variant_base}' - else: - postcode = postcode_name - - if postcode not in self._cache.postcodes: - term = self._search_normalized(postcode_name) - if not term: - return None - - variants = {term} - if analyzer is not None and variant_base: - variants.update(analyzer.compute_variants(variant_base)) - - with self.conn.cursor() as cur: - cur.execute("SELECT create_postcode_word(%s, %s)", - (postcode, list(variants))) - self._cache.postcodes.add(postcode) - - return postcode_name + return analyzer.get_canonical_id(item) class _TokenInfo: @@ -836,5 +752,4 @@ class _TokenCache: self.names: Dict[str, Tuple[int, List[int]]] = {} self.partials: Dict[str, int] = {} self.fulls: Dict[str, List[int]] = {} - self.postcodes: Set[str] = set() self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {} diff --git 
a/test/python/api/search/test_icu_query_analyzer.py b/test/python/api/search/test_icu_query_analyzer.py index eb453fda..fc200bca 100644 --- a/test/python/api/search/test_icu_query_analyzer.py +++ b/test/python/api/search/test_icu_query_analyzer.py @@ -102,12 +102,11 @@ async def test_splitting_in_transliteration(conn): @pytest.mark.asyncio @pytest.mark.parametrize('term,order', [('23456', ['P', 'H', 'W', 'w']), - ('3', ['H', 'P', 'W', 'w']) + ('3', ['H', 'W', 'w']) ]) async def test_penalty_postcodes_and_housenumbers(conn, term, order): ana = await tok.create_query_analyzer(conn) - await add_word(conn, 1, term, 'P', None) await add_word(conn, 2, term, 'H', term) await add_word(conn, 3, term, 'w', term) await add_word(conn, 4, term, 'W', term) @@ -179,8 +178,10 @@ async def test_add_unknown_housenumbers(conn): assert query.nodes[1].starting[0].ttype == qmod.TOKEN_HOUSENUMBER assert len(query.nodes[1].starting[0].tokens) == 1 assert query.nodes[1].starting[0].tokens[0].token == 1 - assert not query.nodes[2].starting - assert not query.nodes[3].starting + assert query.nodes[2].has_tokens(3, qmod.TOKEN_POSTCODE) + assert not query.nodes[2].has_tokens(3, qmod.TOKEN_HOUSENUMBER) + assert not query.nodes[2].has_tokens(4, qmod.TOKEN_HOUSENUMBER) + assert not query.nodes[3].has_tokens(4, qmod.TOKEN_HOUSENUMBER) @pytest.mark.asyncio diff --git a/test/python/tokenizer/test_icu.py b/test/python/tokenizer/test_icu.py index a2bf6766..06a3cd6c 100644 --- a/test/python/tokenizer/test_icu.py +++ b/test/python/tokenizer/test_icu.py @@ -265,37 +265,13 @@ class TestPostcodes: 'address': {'postcode': postcode}})) - def test_update_postcodes_from_db_empty(self, table_factory, word_table): - table_factory('location_postcode', 'country_code TEXT, postcode TEXT', - content=(('de', '12345'), ('se', '132 34'), - ('bm', 'AB23'), ('fr', '12345'))) - - self.analyzer.update_postcodes_from_db() - - assert word_table.count() == 5 - assert word_table.get_postcodes() == {'12345', '132 34@132 34', 'AB 23@AB 23'} - - - def test_update_postcodes_from_db_ambigious(self, table_factory, word_table): - table_factory('location_postcode', 'country_code TEXT, postcode TEXT', - content=(('in', '123456'), ('sg', '123456'))) - - self.analyzer.update_postcodes_from_db() - - assert word_table.count() == 3 - assert word_table.get_postcodes() == {'123456', '123456@123 456'} - - - def test_update_postcodes_from_db_add_and_remove(self, table_factory, word_table): - table_factory('location_postcode', 'country_code TEXT, postcode TEXT', - content=(('ch', '1234'), ('bm', 'BC 45'), ('bm', 'XX45'))) + def test_update_postcodes_deleted(self, word_table): word_table.add_postcode(' 1234', '1234') word_table.add_postcode(' 5678', '5678') self.analyzer.update_postcodes_from_db() - assert word_table.count() == 5 - assert word_table.get_postcodes() == {'1234', 'BC 45@BC 45', 'XX 45@XX 45'} + assert word_table.count() == 0 def test_process_place_postcode_simple(self, word_table): @@ -303,16 +279,12 @@ class TestPostcodes: assert info['postcode'] == '12345' - assert word_table.get_postcodes() == {'12345', } - def test_process_place_postcode_with_space(self, word_table): info = self.process_postcode('in', '123 567') assert info['postcode'] == '123567' - assert word_table.get_postcodes() == {'123567@123 567', } - def test_update_special_phrase_empty_table(analyzer, word_table): @@ -477,9 +449,9 @@ class TestPlaceAddress: @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345']) def test_process_place_postcode(self, word_table, pcode): - 
self.process_address(postcode=pcode) + info = self.process_address(postcode=pcode) - assert word_table.get_postcodes() == {pcode, } + assert info['postcode'] == pcode @pytest.mark.parametrize('hnr', ['123a', '1', '101']) From 921db8bb2fff339cda93deaa99d06ac85fa39694 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Sun, 2 Mar 2025 17:31:04 +0100 Subject: [PATCH 08/12] cache all info of ICUQueryAnalyser in a single object --- src/nominatim_api/search/icu_tokenizer.py | 95 ++++++++++++----------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index 09827826..b3e14f6a 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -24,6 +24,7 @@ from ..connection import SearchConnection from ..logging import log from . import query as qmod from ..query_preprocessing.config import QueryConfig +from ..query_preprocessing.base import QueryProcessingFunc from .query_analyzer_factory import AbstractQueryAnalyzer from .postcode_parser import PostcodeParser @@ -112,61 +113,51 @@ class ICUToken(qmod.Token): addr_count=max(1, addr_count)) -class ICUQueryAnalyzer(AbstractQueryAnalyzer): - """ Converter for query strings into a tokenized query - using the tokens created by a ICU tokenizer. - """ - def __init__(self, conn: SearchConnection) -> None: - self.conn = conn - self.postcode_parser = PostcodeParser(conn.config) +@dataclasses.dataclass +class ICUAnalyzerConfig: + postcode_parser: PostcodeParser + normalizer: Transliterator + transliterator: Transliterator + preprocessors: List[QueryProcessingFunc] - async def setup(self) -> None: - """ Set up static data structures needed for the analysis. - """ - async def _make_normalizer() -> Any: - rules = await self.conn.get_property('tokenizer_import_normalisation') - return Transliterator.createFromRules("normalization", rules) + @staticmethod + async def create(conn: SearchConnection) -> 'ICUAnalyzerConfig': + rules = await conn.get_property('tokenizer_import_normalisation') + normalizer = Transliterator.createFromRules("normalization", rules) - self.normalizer = await self.conn.get_cached_value('ICUTOK', 'normalizer', - _make_normalizer) + rules = await conn.get_property('tokenizer_import_transliteration') + transliterator = Transliterator.createFromRules("transliteration", rules) - async def _make_transliterator() -> Any: - rules = await self.conn.get_property('tokenizer_import_transliteration') - return Transliterator.createFromRules("transliteration", rules) - - self.transliterator = await self.conn.get_cached_value('ICUTOK', 'transliterator', - _make_transliterator) - - await self._setup_preprocessing() - - if 'word' not in self.conn.t.meta.tables: - sa.Table('word', self.conn.t.meta, - sa.Column('word_id', sa.Integer), - sa.Column('word_token', sa.Text, nullable=False), - sa.Column('type', sa.Text, nullable=False), - sa.Column('word', sa.Text), - sa.Column('info', Json)) - - async def _setup_preprocessing(self) -> None: - """ Load the rules for preprocessing and set up the handlers. 
- """ - - rules = self.conn.config.load_sub_configuration('icu_tokenizer.yaml', - config='TOKENIZER_CONFIG') - preprocessing_rules = rules.get('query-preprocessing', []) - - self.preprocessors = [] + preprocessing_rules = conn.config.load_sub_configuration('icu_tokenizer.yaml', + config='TOKENIZER_CONFIG')\ + .get('query-preprocessing', []) + preprocessors: List[QueryProcessingFunc] = [] for func in preprocessing_rules: if 'step' not in func: raise UsageError("Preprocessing rule is missing the 'step' attribute.") if not isinstance(func['step'], str): raise UsageError("'step' attribute must be a simple string.") - module = self.conn.config.load_plugin_module( + module = conn.config.load_plugin_module( func['step'], 'nominatim_api.query_preprocessing') - self.preprocessors.append( - module.create(QueryConfig(func).set_normalizer(self.normalizer))) + preprocessors.append( + module.create(QueryConfig(func).set_normalizer(normalizer))) + + return ICUAnalyzerConfig(PostcodeParser(conn.config), + normalizer, transliterator, preprocessors) + + +class ICUQueryAnalyzer(AbstractQueryAnalyzer): + """ Converter for query strings into a tokenized query + using the tokens created by a ICU tokenizer. + """ + def __init__(self, conn: SearchConnection, config: ICUAnalyzerConfig) -> None: + self.conn = conn + self.postcode_parser = config.postcode_parser + self.normalizer = config.normalizer + self.transliterator = config.transliterator + self.preprocessors = config.preprocessors async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct: """ Analyze the given list of phrases and return the @@ -311,7 +302,17 @@ async def create_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer """ Create and set up a new query analyzer for a database based on the ICU tokenizer. """ - out = ICUQueryAnalyzer(conn) - await out.setup() + async def _get_config() -> ICUAnalyzerConfig: + if 'word' not in conn.t.meta.tables: + sa.Table('word', conn.t.meta, + sa.Column('word_id', sa.Integer), + sa.Column('word_token', sa.Text, nullable=False), + sa.Column('type', sa.Text, nullable=False), + sa.Column('word', sa.Text), + sa.Column('info', Json)) - return out + return await ICUAnalyzerConfig.create(conn) + + config = await conn.get_cached_value('ICUTOK', 'config', _get_config) + + return ICUQueryAnalyzer(conn, config) From 434fbbfd182c2f556d46b3bac65a4d92ab59b9d7 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 4 Mar 2025 15:18:27 +0100 Subject: [PATCH 09/12] add support for country prefixes in postcodes --- src/nominatim_api/search/postcode_parser.py | 41 ++++++++++++--------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/src/nominatim_api/search/postcode_parser.py b/src/nominatim_api/search/postcode_parser.py index 93ed87c4..1148d3c3 100644 --- a/src/nominatim_api/search/postcode_parser.py +++ b/src/nominatim_api/search/postcode_parser.py @@ -7,7 +7,7 @@ """ Handling of arbitrary postcode tokens in tokenized query string. 
""" -from typing import Tuple, Set +from typing import Tuple, Set, Dict, List import re from collections import defaultdict @@ -31,20 +31,21 @@ class PostcodeParser: cdata = yaml.safe_load(config.find_config_file('country_settings.yaml') .read_text(encoding='utf-8')) - unique_patterns = defaultdict(set) + unique_patterns: Dict[str, Dict[str, List[str]]] = {} for cc, data in cdata.items(): if data.get('postcode'): - pat = data['postcode']['pattern'] + pat = data['postcode']['pattern'].replace('d', '[0-9]').replace('l', '[a-z]') out = data['postcode'].get('output') - unique_patterns[pat.replace('d', '[0-9]').replace('l', '[a-z]')].add(out) + if pat not in unique_patterns: + unique_patterns[pat] = defaultdict(list) + unique_patterns[pat][out].append(cc) self.global_pattern = re.compile( - '(?:' + - '|'.join(f"(?:{k})" for k in unique_patterns) - + ')[:, >]') + '(?:(?P[a-z][a-z])(?P[ -]?))?(?P(?:(?:' + + ')|(?:'.join(unique_patterns) + '))[:, >].*)') - self.local_patterns = [(re.compile(f"(?:{k})[:, >]"), v) - for k, v in unique_patterns.items()] + self.local_patterns = [(re.compile(f"{pat}[:, >]"), list(info.items())) + for pat, info in unique_patterns.items()] def parse(self, query: qmod.QueryStruct) -> Set[Tuple[int, int, str]]: """ Parse postcodes in the given list of query tokens taking into @@ -64,18 +65,22 @@ class PostcodeParser: if word[-1] in ' -' and nodes[i + 3].btype != '`': word += nodes[i + 3].term_normalized + nodes[i + 3].btype - # Use global pattern to check for presence of any postocde. - m = self.global_pattern.match(word) + # Use global pattern to check for presence of any postcode. + m = self.global_pattern.fullmatch(word) if m: # If there was a match, check against each pattern separately # because multiple patterns might be machting at the end. 
+ cc = m.group('cc') + pc_word = m.group('pc') + cc_spaces = len(m.group('space') or '') for pattern, info in self.local_patterns: - lm = pattern.match(word) + lm = pattern.match(pc_word) if lm: - trange = (i, i + sum(c in ' ,-:>' for c in lm.group(0))) - for out in info: - if out: - outcodes.add((*trange, lm.expand(out).upper())) - else: - outcodes.add((*trange, lm.group(0)[:-1].upper())) + trange = (i, i + cc_spaces + sum(c in ' ,-:>' for c in lm.group(0))) + for out, out_ccs in info: + if cc is None or cc in out_ccs: + if out: + outcodes.add((*trange, lm.expand(out).upper())) + else: + outcodes.add((*trange, lm.group(0)[:-1].upper())) return outcodes From 6712627d5eb960e88975bf154403ac8631d8d081 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 4 Mar 2025 15:18:46 +0100 Subject: [PATCH 10/12] adapt BDD tests to new postcode handling --- test/bdd/db/import/postcodes.feature | 6 ++---- test/bdd/db/update/postcode.feature | 13 +++---------- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/test/bdd/db/import/postcodes.feature b/test/bdd/db/import/postcodes.feature index 3f4976f1..a9b07bfe 100644 --- a/test/bdd/db/import/postcodes.feature +++ b/test/bdd/db/import/postcodes.feature @@ -170,7 +170,7 @@ Feature: Import of postcodes | object | postcode | | W93 | 11200 | - Scenario: Postcodes are added to the postcode and word table + Scenario: Postcodes are added to the postcode Given the places | osm | class | type | addr+postcode | addr+housenumber | geometry | | N34 | place | house | 01982 | 111 |country:de | @@ -178,7 +178,6 @@ Feature: Import of postcodes Then location_postcode contains exactly | country | postcode | geometry | | de | 01982 | country:de | - And there are word tokens for postcodes 01982 @Fail @@ -195,7 +194,7 @@ Feature: Import of postcodes | E45 2 | gb | 23 | 5 | | Y45 | gb | 21 | 5 | - Scenario: Postcodes outside all countries are not added to the postcode and word table + Scenario: Postcodes outside all countries are not added to the postcode table Given the places | osm | class | type | addr+postcode | addr+housenumber | addr+place | geometry | | N34 | place | house | 01982 | 111 | Null Island | 0 0.00001 | @@ -205,7 +204,6 @@ Feature: Import of postcodes When importing Then location_postcode contains exactly | country | postcode | geometry | - And there are no word tokens for postcodes 01982 When sending search query "111, 01982 Null Island" Then results contain | osm | display_name | diff --git a/test/bdd/db/update/postcode.feature b/test/bdd/db/update/postcode.feature index 39318101..61b52f3d 100644 --- a/test/bdd/db/update/postcode.feature +++ b/test/bdd/db/update/postcode.feature @@ -2,7 +2,7 @@ Feature: Update of postcode Tests for updating of data related to postcodes - Scenario: A new postcode appears in the postcode and word table + Scenario: A new postcode appears in the postcode table Given the places | osm | class | type | addr+postcode | addr+housenumber | geometry | | N34 | place | house | 01982 | 111 |country:de | @@ -18,9 +18,8 @@ Feature: Update of postcode | country | postcode | geometry | | de | 01982 | country:de | | ch | 4567 | country:ch | - And there are word tokens for postcodes 01982,4567 - Scenario: When the last postcode is deleted, it is deleted from postcode and word + Scenario: When the last postcode is deleted, it is deleted from postcode Given the places | osm | class | type | addr+postcode | addr+housenumber | geometry | | N34 | place | house | 01982 | 111 |country:de | @@ -31,10 +30,8 @@ Feature: Update of postcode 
Then location_postcode contains exactly | country | postcode | geometry | | ch | 4567 | country:ch | - And there are word tokens for postcodes 4567 - And there are no word tokens for postcodes 01982 - Scenario: A postcode is not deleted from postcode and word when it exist in another country + Scenario: A postcode is not deleted from postcode when it exist in another country Given the places | osm | class | type | addr+postcode | addr+housenumber | geometry | | N34 | place | house | 01982 | 111 |country:de | @@ -45,7 +42,6 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | fr | 01982 | country:fr | - And there are word tokens for postcodes 01982 Scenario: Updating a postcode is reflected in postcode table Given the places @@ -59,7 +55,6 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | de | 20453 | country:de | - And there are word tokens for postcodes 20453 Scenario: When changing from a postcode type, the entry appears in placex When importing @@ -80,7 +75,6 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | de | 20453 | country:de | - And there are word tokens for postcodes 20453 Scenario: When changing to a postcode type, the entry disappears from placex When importing @@ -101,7 +95,6 @@ Feature: Update of postcode Then location_postcode contains exactly | country | postcode | geometry | | de | 01982 | country:de | - And there are word tokens for postcodes 01982 Scenario: When a parent is deleted, the postcode gets a new parent Given the grid with origin DE From afb89f9c7abab5aad234889a4e4426ac57a5b56c Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Tue, 4 Mar 2025 16:25:00 +0100 Subject: [PATCH 11/12] add unit tests for postcode parser --- .../python/api/search/test_postcode_parser.py | 133 ++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 test/python/api/search/test_postcode_parser.py diff --git a/test/python/api/search/test_postcode_parser.py b/test/python/api/search/test_postcode_parser.py new file mode 100644 index 00000000..f691a58c --- /dev/null +++ b/test/python/api/search/test_postcode_parser.py @@ -0,0 +1,133 @@ + +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2025 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Test for parsing of postcodes in queries. +""" +import re +from itertools import zip_longest + +import pytest + +from nominatim_api.search.postcode_parser import PostcodeParser +from nominatim_api.search.query import QueryStruct, PHRASE_ANY + +@pytest.fixture +def pc_config(project_env): + country_file = project_env.project_dir / 'country_settings.yaml' + country_file.write_text(r""" +ab: + postcode: + pattern: "ddddd ll" +ba: + postcode: + pattern: "ddddd" +de: + postcode: + pattern: "ddddd" +gr: + postcode: + pattern: "(ddd) ?(dd)" + output: \1 \2 +in: + postcode: + pattern: "(ddd) ?(ddd)" + output: \1\2 +mc: + postcode: + pattern: "980dd" +mz: + postcode: + pattern: "(dddd)(?:-dd)?" 
+bn: + postcode: + pattern: "(ll) ?(dddd)" + output: \1\2 +ky: + postcode: + pattern: "(d)-(dddd)" + output: KY\1-\2 + """) + + return project_env + +def mk_query(inp): + query = QueryStruct([]) + phrase_split = re.split(r"([ ,:'-])", inp) + + for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue='>'): + query.add_node(breakchar, PHRASE_ANY, 0.1, word, word) + + return query + + +@pytest.mark.parametrize('query,pos', [('45325 Berlin', 0), + ('45325:Berlin', 0), + ('45325,Berlin', 0), + ('Berlin 45325', 1), + ('Berlin,45325', 1), + ('Berlin:45325', 1), + ('Hansastr,45325 Berlin', 1), + ('Hansastr 45325 Berlin', 1)]) +def test_simple_postcode(pc_config, query, pos): + parser = PostcodeParser(pc_config) + + result = parser.parse(mk_query(query)) + + assert result == {(pos, pos + 1, '45325'), (pos, pos + 1, '453 25')} + +def test_contained_postcode(pc_config): + parser = PostcodeParser(pc_config) + + assert parser.parse(mk_query('12345 dx')) == {(0, 1, '12345'), (0, 1, '123 45'), + (0, 2, '12345 DX')} + + + +@pytest.mark.parametrize('query,frm,to', [('345987', 0, 1), ('345 987', 0, 2), + ('Aina 345 987', 1, 3), + ('Aina 23 345 987 ff', 2, 4)]) +def test_postcode_with_space(pc_config, query, frm, to): + parser = PostcodeParser(pc_config) + + result = parser.parse(mk_query(query)) + + assert result == {(frm, to, '345987')} + +def test_overlapping_postcode(pc_config): + parser = PostcodeParser(pc_config) + + assert parser.parse(mk_query('123 456 78')) == {(0, 2, '123456'), (1, 3, '456 78')} + + +@pytest.mark.parametrize('query', ['45325-Berlin', "45325'Berlin", + 'Berlin-45325', "Berlin'45325", '45325Berlin' + '345-987', "345'987", '345,987', '345:987']) +def test_not_a_postcode(pc_config, query): + parser = PostcodeParser(pc_config) + + assert not parser.parse(mk_query(query)) + + +@pytest.mark.parametrize('query', ['ba 12233', 'ba-12233']) +def test_postcode_with_country_prefix(pc_config, query): + parser = PostcodeParser(pc_config) + + assert (0, 2, '12233') in parser.parse(mk_query(query)) + + +def test_postcode_with_joined_country_prefix(pc_config): + parser = PostcodeParser(pc_config) + + assert parser.parse(mk_query('ba12233')) == {(0, 1, '12233')} + + +def test_postcode_with_non_matching_country_prefix(pc_config): + parser = PostcodeParser(pc_config) + + assert not parser.parse(mk_query('ky12233')) + From 6b0d58d9fdbd2e488f3dc408c1b7d8837710f6e6 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 5 Mar 2025 10:08:07 +0100 Subject: [PATCH 12/12] restrict postcode parsing in typed phrases Postcodes can only appear in postcode-type phrases and must then cover the full phrase --- src/nominatim_api/search/postcode_parser.py | 67 ++++++++++++------- .../python/api/search/test_postcode_parser.py | 23 ++++++- 2 files changed, 64 insertions(+), 26 deletions(-) diff --git a/src/nominatim_api/search/postcode_parser.py b/src/nominatim_api/search/postcode_parser.py index 1148d3c3..bb3ef1a4 100644 --- a/src/nominatim_api/search/postcode_parser.py +++ b/src/nominatim_api/search/postcode_parser.py @@ -55,32 +55,49 @@ class PostcodeParser: [start node id, end node id, postcode token] """ nodes = query.nodes - outcodes = set() + outcodes: Set[Tuple[int, int, str]] = set() for i in range(query.num_token_slots()): - if nodes[i].btype in '<,: ' and nodes[i + 1].btype != '`': - word = nodes[i + 1].term_normalized + nodes[i + 1].btype - if word[-1] in ' -' and nodes[i + 2].btype != '`': - word += nodes[i + 2].term_normalized + nodes[i + 2].btype - if word[-1] in ' -' and nodes[i + 3].btype 
!= '`': - word += nodes[i + 3].term_normalized + nodes[i + 3].btype + if nodes[i].btype in '<,: ' and nodes[i + 1].btype != '`' \ + and (i == 0 or nodes[i - 1].ptype != qmod.PHRASE_POSTCODE): + if nodes[i].ptype == qmod.PHRASE_ANY: + word = nodes[i + 1].term_normalized + nodes[i + 1].btype + if word[-1] in ' -' and nodes[i + 2].btype != '`' \ + and nodes[i + 1].ptype == qmod.PHRASE_ANY: + word += nodes[i + 2].term_normalized + nodes[i + 2].btype + if word[-1] in ' -' and nodes[i + 3].btype != '`' \ + and nodes[i + 2].ptype == qmod.PHRASE_ANY: + word += nodes[i + 3].term_normalized + nodes[i + 3].btype + + self._match_word(word, i, False, outcodes) + elif nodes[i].ptype == qmod.PHRASE_POSTCODE: + word = nodes[i + 1].term_normalized + nodes[i + 1].btype + for j in range(i + 1, query.num_token_slots()): + if nodes[j].ptype != qmod.PHRASE_POSTCODE: + break + word += nodes[j + 1].term_normalized + nodes[j + 1].btype + + self._match_word(word, i, True, outcodes) - # Use global pattern to check for presence of any postcode. - m = self.global_pattern.fullmatch(word) - if m: - # If there was a match, check against each pattern separately - # because multiple patterns might be machting at the end. - cc = m.group('cc') - pc_word = m.group('pc') - cc_spaces = len(m.group('space') or '') - for pattern, info in self.local_patterns: - lm = pattern.match(pc_word) - if lm: - trange = (i, i + cc_spaces + sum(c in ' ,-:>' for c in lm.group(0))) - for out, out_ccs in info: - if cc is None or cc in out_ccs: - if out: - outcodes.add((*trange, lm.expand(out).upper())) - else: - outcodes.add((*trange, lm.group(0)[:-1].upper())) return outcodes + + def _match_word(self, word: str, pos: int, fullmatch: bool, + outcodes: Set[Tuple[int, int, str]]) -> None: + # Use global pattern to check for presence of any postcode. + m = self.global_pattern.fullmatch(word) + if m: + # If there was a match, check against each pattern separately + # because multiple patterns might be machting at the end. 
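+        # Illustrative note (not part of the original patch): when 'fullmatch'
+        # is set, the word comes from a postcode-typed phrase and a pattern
+        # must cover it completely, so '2224 12345>' yields no postcode
+        # while a plain '12345>' still does.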
+ cc = m.group('cc') + pc_word = m.group('pc') + cc_spaces = len(m.group('space') or '') + for pattern, info in self.local_patterns: + lm = pattern.fullmatch(pc_word) if fullmatch else pattern.match(pc_word) + if lm: + trange = (pos, pos + cc_spaces + sum(c in ' ,-:>' for c in lm.group(0))) + for out, out_ccs in info: + if cc is None or cc in out_ccs: + if out: + outcodes.add((*trange, lm.expand(out).upper())) + else: + outcodes.add((*trange, lm.group(0)[:-1].upper())) diff --git a/test/python/api/search/test_postcode_parser.py b/test/python/api/search/test_postcode_parser.py index f691a58c..284aba5b 100644 --- a/test/python/api/search/test_postcode_parser.py +++ b/test/python/api/search/test_postcode_parser.py @@ -14,7 +14,7 @@ from itertools import zip_longest import pytest from nominatim_api.search.postcode_parser import PostcodeParser -from nominatim_api.search.query import QueryStruct, PHRASE_ANY +from nominatim_api.search.query import QueryStruct, PHRASE_ANY, PHRASE_POSTCODE, PHRASE_STREET @pytest.fixture def pc_config(project_env): @@ -131,3 +131,24 @@ def test_postcode_with_non_matching_country_prefix(pc_config): assert not parser.parse(mk_query('ky12233')) +def test_postcode_inside_postcode_phrase(pc_config): + parser = PostcodeParser(pc_config) + + query = QueryStruct([]) + query.nodes[-1].ptype = PHRASE_STREET + query.add_node(',', PHRASE_STREET, 0.1, '12345', '12345') + query.add_node(',', PHRASE_POSTCODE, 0.1, 'xz', 'xz') + query.add_node('>', PHRASE_POSTCODE, 0.1, '4444', '4444') + + assert parser.parse(query) == {(2, 3, '4444')} + + +def test_partial_postcode_in_postcode_phrase(pc_config): + parser = PostcodeParser(pc_config) + + query = QueryStruct([]) + query.nodes[-1].ptype = PHRASE_POSTCODE + query.add_node(' ', PHRASE_POSTCODE, 0.1, '2224', '2224') + query.add_node('>', PHRASE_POSTCODE, 0.1, '12345', '12345') + + assert not parser.parse(query)