Merge pull request #3665 from lonvia/pattern-matching-postcodes

Add full parsing of postcodes in query
This commit is contained in:
Sarah Hoffmann
2025-03-05 16:02:03 +01:00
committed by GitHub
13 changed files with 476 additions and 295 deletions

View File

@@ -1809,7 +1809,8 @@ us:
     languages: en
     names: !include country-names/us.yaml
     postcode:
-      pattern: "ddddd"
+      pattern: "(ddddd)(?:-dddd)?"
+      output: \1

 # Uruguay (Uruguay)
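For reference, the simplified pattern syntax is expanded by the new PostcodeParser further down in this PR ('d' becomes a digit, 'l' a letter). A quick sketch of what the changed US rule does to a ZIP+4 code, using a plain `re.fullmatch` instead of the parser's node-based matching:

```python
import re

# Expand the simplified syntax the same way PostcodeParser does below:
# 'd' -> [0-9], 'l' -> [a-z].
pattern = "(ddddd)(?:-dddd)?".replace('d', '[0-9]').replace('l', '[a-z]')

m = re.fullmatch(pattern, "36067-1111")
assert m is not None
# The 'output: \1' rule keeps only the first group, shortening ZIP+4 codes.
print(m.expand(r"\1"))  # -> 36067
```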

View File

@@ -8,7 +8,6 @@
 Implementation of query analysis for the ICU tokenizer.
 """
 from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
-from collections import defaultdict
 import dataclasses
 import difflib
 import re
@@ -25,7 +24,9 @@ from ..connection import SearchConnection
 from ..logging import log
 from . import query as qmod
 from ..query_preprocessing.config import QueryConfig
+from ..query_preprocessing.base import QueryProcessingFunc
 from .query_analyzer_factory import AbstractQueryAnalyzer
+from .postcode_parser import PostcodeParser

 DB_TO_TOKEN_TYPE = {
@@ -47,42 +48,6 @@ PENALTY_IN_TOKEN_BREAK = {
 }


-@dataclasses.dataclass
-class QueryPart:
-    """ Normalized and transliterated form of a single term in the query.
-
-        When the term came out of a split during the transliteration,
-        the normalized string is the full word before transliteration.
-        Check the subsequent break type to figure out if the word is
-        continued.
-
-        Penalty is the break penalty for the break following the token.
-    """
-    token: str
-    normalized: str
-    penalty: float
-
-
-QueryParts = List[QueryPart]
-WordDict = Dict[str, List[qmod.TokenRange]]
-
-
-def extract_words(terms: List[QueryPart], start: int, words: WordDict) -> None:
-    """ Add all combinations of words in the terms list after the
-        given position to the word list.
-    """
-    total = len(terms)
-    base_penalty = PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD]
-    for first in range(start, total):
-        word = terms[first].token
-        penalty = base_penalty
-        words[word].append(qmod.TokenRange(first, first + 1, penalty=penalty))
-        for last in range(first + 1, min(first + 20, total)):
-            word = ' '.join((word, terms[last].token))
-            penalty += terms[last - 1].penalty
-            words[word].append(qmod.TokenRange(first, last + 1, penalty=penalty))


 @dataclasses.dataclass
 class ICUToken(qmod.Token):
     """ Specialised token for ICU tokenizer.
@@ -148,60 +113,51 @@ class ICUToken(qmod.Token):
                         addr_count=max(1, addr_count))


-class ICUQueryAnalyzer(AbstractQueryAnalyzer):
-    """ Converter for query strings into a tokenized query
-        using the tokens created by a ICU tokenizer.
-    """
-    def __init__(self, conn: SearchConnection) -> None:
-        self.conn = conn
-
-    async def setup(self) -> None:
-        """ Set up static data structures needed for the analysis.
-        """
-        async def _make_normalizer() -> Any:
-            rules = await self.conn.get_property('tokenizer_import_normalisation')
-            return Transliterator.createFromRules("normalization", rules)
-
-        self.normalizer = await self.conn.get_cached_value('ICUTOK', 'normalizer',
-                                                           _make_normalizer)
-
-        async def _make_transliterator() -> Any:
-            rules = await self.conn.get_property('tokenizer_import_transliteration')
-            return Transliterator.createFromRules("transliteration", rules)
-
-        self.transliterator = await self.conn.get_cached_value('ICUTOK', 'transliterator',
-                                                               _make_transliterator)
-
-        await self._setup_preprocessing()
-
-        if 'word' not in self.conn.t.meta.tables:
-            sa.Table('word', self.conn.t.meta,
-                     sa.Column('word_id', sa.Integer),
-                     sa.Column('word_token', sa.Text, nullable=False),
-                     sa.Column('type', sa.Text, nullable=False),
-                     sa.Column('word', sa.Text),
-                     sa.Column('info', Json))
-
-    async def _setup_preprocessing(self) -> None:
-        """ Load the rules for preprocessing and set up the handlers.
-        """
-        rules = self.conn.config.load_sub_configuration('icu_tokenizer.yaml',
-                                                        config='TOKENIZER_CONFIG')
-        preprocessing_rules = rules.get('query-preprocessing', [])
-
-        self.preprocessors = []
+@dataclasses.dataclass
+class ICUAnalyzerConfig:
+    postcode_parser: PostcodeParser
+    normalizer: Transliterator
+    transliterator: Transliterator
+    preprocessors: List[QueryProcessingFunc]
+
+    @staticmethod
+    async def create(conn: SearchConnection) -> 'ICUAnalyzerConfig':
+        rules = await conn.get_property('tokenizer_import_normalisation')
+        normalizer = Transliterator.createFromRules("normalization", rules)
+
+        rules = await conn.get_property('tokenizer_import_transliteration')
+        transliterator = Transliterator.createFromRules("transliteration", rules)
+
+        preprocessing_rules = conn.config.load_sub_configuration('icu_tokenizer.yaml',
+                                                                 config='TOKENIZER_CONFIG')\
+                                         .get('query-preprocessing', [])
+
+        preprocessors: List[QueryProcessingFunc] = []

         for func in preprocessing_rules:
             if 'step' not in func:
                 raise UsageError("Preprocessing rule is missing the 'step' attribute.")
             if not isinstance(func['step'], str):
                 raise UsageError("'step' attribute must be a simple string.")

-            module = self.conn.config.load_plugin_module(
+            module = conn.config.load_plugin_module(
                 func['step'], 'nominatim_api.query_preprocessing')
-            self.preprocessors.append(
-                module.create(QueryConfig(func).set_normalizer(self.normalizer)))
+            preprocessors.append(
+                module.create(QueryConfig(func).set_normalizer(normalizer)))
+
+        return ICUAnalyzerConfig(PostcodeParser(conn.config),
+                                 normalizer, transliterator, preprocessors)
+
+
+class ICUQueryAnalyzer(AbstractQueryAnalyzer):
+    """ Converter for query strings into a tokenized query
+        using the tokens created by a ICU tokenizer.
+    """
+    def __init__(self, conn: SearchConnection, config: ICUAnalyzerConfig) -> None:
+        self.conn = conn
+        self.postcode_parser = config.postcode_parser
+        self.normalizer = config.normalizer
+        self.transliterator = config.transliterator
+        self.preprocessors = config.preprocessors
     async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
         """ Analyze the given list of phrases and return the
@@ -216,8 +172,9 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         if not query.source:
             return query

-        parts, words = self.split_query(query)
-        log().var_dump('Transliterated query', lambda: _dump_transliterated(query, parts))
+        self.split_query(query)
+        log().var_dump('Transliterated query', lambda: query.get_transliterated_query())
+        words = query.extract_words(base_penalty=PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD])

         for row in await self.lookup_in_db(list(words.keys())):
             for trange in words[row.word_token]:
@@ -234,8 +191,13 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                 else:
                     query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token)

-        self.add_extra_tokens(query, parts)
-        self.rerank_tokens(query, parts)
+        self.add_extra_tokens(query)
+        for start, end, pc in self.postcode_parser.parse(query):
+            query.add_token(qmod.TokenRange(start, end),
+                            qmod.TOKEN_POSTCODE,
+                            ICUToken(penalty=0.1, token=0, count=1, addr_count=1,
+                                     lookup_word=pc, word_token=pc, info=None))
+        self.rerank_tokens(query)

         log().table_dump('Word tokens', _dump_word_tokens(query))
@@ -248,16 +210,9 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
""" """
return cast(str, self.normalizer.transliterate(text)).strip('-: ') return cast(str, self.normalizer.transliterate(text)).strip('-: ')
def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: def split_query(self, query: qmod.QueryStruct) -> None:
""" Transliterate the phrases and split them into tokens. """ Transliterate the phrases and split them into tokens.
Returns the list of transliterated tokens together with their
normalized form and a dictionary of words for lookup together
with their position.
""" """
parts: QueryParts = []
phrase_start = 0
words: WordDict = defaultdict(list)
for phrase in query.source: for phrase in query.source:
query.nodes[-1].ptype = phrase.ptype query.nodes[-1].ptype = phrase.ptype
phrase_split = re.split('([ :-])', phrase.text) phrase_split = re.split('([ :-])', phrase.text)
@@ -272,38 +227,42 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
if trans: if trans:
for term in trans.split(' '): for term in trans.split(' '):
if term: if term:
parts.append(QueryPart(term, word, query.add_node(qmod.BREAK_TOKEN, phrase.ptype,
PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN])) PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN],
query.add_node(qmod.BREAK_TOKEN, phrase.ptype) term, word)
query.nodes[-1].btype = breakchar query.nodes[-1].adjust_break(breakchar,
parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[breakchar] PENALTY_IN_TOKEN_BREAK[breakchar])
extract_words(parts, phrase_start, words) query.nodes[-1].adjust_break(qmod.BREAK_END, PENALTY_IN_TOKEN_BREAK[qmod.BREAK_END])
phrase_start = len(parts)
query.nodes[-1].btype = qmod.BREAK_END
return parts, words
     async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
         """ Return the token information from the database for the
             given word tokens.
+
+            This function excludes postcode tokens.
         """
         t = self.conn.t.meta.tables['word']
-        return await self.conn.execute(t.select().where(t.c.word_token.in_(words)))
+        return await self.conn.execute(t.select()
+                                        .where(t.c.word_token.in_(words))
+                                        .where(t.c.type != 'P'))

-    def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
+    def add_extra_tokens(self, query: qmod.QueryStruct) -> None:
         """ Add tokens to query that are not saved in the database.
         """
-        for part, node, i in zip(parts, query.nodes, range(1000)):
-            if len(part.token) <= 4 and part.token.isdigit()\
-               and not node.has_tokens(i+1, qmod.TOKEN_HOUSENUMBER):
-                query.add_token(qmod.TokenRange(i, i+1), qmod.TOKEN_HOUSENUMBER,
-                                ICUToken(penalty=0.5, token=0,
-                                         count=1, addr_count=1, lookup_word=part.token,
-                                         word_token=part.token, info=None))
+        need_hnr = False
+        for i, node in enumerate(query.nodes):
+            is_full_token = node.btype not in (qmod.BREAK_TOKEN, qmod.BREAK_PART)
+            if need_hnr and is_full_token \
+                    and len(node.term_normalized) <= 4 and node.term_normalized.isdigit():
+                query.add_token(qmod.TokenRange(i-1, i), qmod.TOKEN_HOUSENUMBER,
+                                ICUToken(penalty=0.5, token=0,
+                                         count=1, addr_count=1,
+                                         lookup_word=node.term_lookup,
+                                         word_token=node.term_lookup, info=None))
+
+            need_hnr = is_full_token and not node.has_tokens(i+1, qmod.TOKEN_HOUSENUMBER)

-    def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
+    def rerank_tokens(self, query: qmod.QueryStruct) -> None:
         """ Add penalties to tokens that depend on presence of other token.
         """
         for i, node, tlist in query.iter_token_lists():
@@ -320,28 +279,22 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                     if repl.end == tlist.end and repl.ttype != qmod.TOKEN_HOUSENUMBER:
                         repl.add_penalty(0.5 - tlist.tokens[0].penalty)
             elif tlist.ttype not in (qmod.TOKEN_COUNTRY, qmod.TOKEN_PARTIAL):
-                norm = parts[i].normalized
-                for j in range(i + 1, tlist.end):
-                    if node.btype != qmod.BREAK_TOKEN:
-                        norm += ' ' + parts[j].normalized
+                norm = ' '.join(n.term_normalized for n in query.nodes[i + 1:tlist.end + 1]
+                                if n.btype != qmod.BREAK_TOKEN)
+                if not norm:
+                    # Can happen when the token only covers a partial term
+                    norm = query.nodes[i + 1].term_normalized
                 for token in tlist.tokens:
                     cast(ICUToken, token).rematch(norm)


-def _dump_transliterated(query: qmod.QueryStruct, parts: QueryParts) -> str:
-    out = query.nodes[0].btype
-    for node, part in zip(query.nodes[1:], parts):
-        out += part.token + node.btype
-    return out
-
-
 def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
-    yield ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
-    for node in query.nodes:
+    yield ['type', 'from', 'to', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
+    for i, node in enumerate(query.nodes):
         for tlist in node.starting:
             for token in tlist.tokens:
                 t = cast(ICUToken, token)
-                yield [tlist.ttype, t.token, t.word_token or '',
+                yield [tlist.ttype, str(i), str(tlist.end), t.token, t.word_token or '',
                        t.lookup_word or '', t.penalty, t.count, t.info]
@@ -349,7 +302,17 @@ async def create_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer
""" Create and set up a new query analyzer for a database based """ Create and set up a new query analyzer for a database based
on the ICU tokenizer. on the ICU tokenizer.
""" """
out = ICUQueryAnalyzer(conn) async def _get_config() -> ICUAnalyzerConfig:
await out.setup() if 'word' not in conn.t.meta.tables:
sa.Table('word', conn.t.meta,
sa.Column('word_id', sa.Integer),
sa.Column('word_token', sa.Text, nullable=False),
sa.Column('type', sa.Text, nullable=False),
sa.Column('word', sa.Text),
sa.Column('info', Json))
return out return await ICUAnalyzerConfig.create(conn)
config = await conn.get_cached_value('ICUTOK', 'config', _get_config)
return ICUQueryAnalyzer(conn, config)

View File

@@ -0,0 +1,103 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Handling of arbitrary postcode tokens in tokenized query string.
"""
from typing import Tuple, Set, Dict, List
import re
from collections import defaultdict
import yaml
from ..config import Configuration
from . import query as qmod
class PostcodeParser:
""" Pattern-based parser for postcodes in tokenized queries.
The postcode patterns are read from the country configuration.
The parser currently does not return country restrictions.
"""
def __init__(self, config: Configuration) -> None:
# skip over includes here to avoid loading the complete country name data
yaml.add_constructor('!include', lambda loader, node: [],
Loader=yaml.SafeLoader)
cdata = yaml.safe_load(config.find_config_file('country_settings.yaml')
.read_text(encoding='utf-8'))
unique_patterns: Dict[str, Dict[str, List[str]]] = {}
for cc, data in cdata.items():
if data.get('postcode'):
pat = data['postcode']['pattern'].replace('d', '[0-9]').replace('l', '[a-z]')
out = data['postcode'].get('output')
if pat not in unique_patterns:
unique_patterns[pat] = defaultdict(list)
unique_patterns[pat][out].append(cc)
self.global_pattern = re.compile(
'(?:(?P<cc>[a-z][a-z])(?P<space>[ -]?))?(?P<pc>(?:(?:'
+ ')|(?:'.join(unique_patterns) + '))[:, >].*)')
self.local_patterns = [(re.compile(f"{pat}[:, >]"), list(info.items()))
for pat, info in unique_patterns.items()]
def parse(self, query: qmod.QueryStruct) -> Set[Tuple[int, int, str]]:
""" Parse postcodes in the given list of query tokens taking into
account the list of breaks from the nodes.
The result is a sequence of tuples with
[start node id, end node id, postcode token]
"""
nodes = query.nodes
outcodes: Set[Tuple[int, int, str]] = set()
for i in range(query.num_token_slots()):
if nodes[i].btype in '<,: ' and nodes[i + 1].btype != '`' \
and (i == 0 or nodes[i - 1].ptype != qmod.PHRASE_POSTCODE):
if nodes[i].ptype == qmod.PHRASE_ANY:
word = nodes[i + 1].term_normalized + nodes[i + 1].btype
if word[-1] in ' -' and nodes[i + 2].btype != '`' \
and nodes[i + 1].ptype == qmod.PHRASE_ANY:
word += nodes[i + 2].term_normalized + nodes[i + 2].btype
if word[-1] in ' -' and nodes[i + 3].btype != '`' \
and nodes[i + 2].ptype == qmod.PHRASE_ANY:
word += nodes[i + 3].term_normalized + nodes[i + 3].btype
self._match_word(word, i, False, outcodes)
elif nodes[i].ptype == qmod.PHRASE_POSTCODE:
word = nodes[i + 1].term_normalized + nodes[i + 1].btype
for j in range(i + 1, query.num_token_slots()):
if nodes[j].ptype != qmod.PHRASE_POSTCODE:
break
word += nodes[j + 1].term_normalized + nodes[j + 1].btype
self._match_word(word, i, True, outcodes)
return outcodes
def _match_word(self, word: str, pos: int, fullmatch: bool,
outcodes: Set[Tuple[int, int, str]]) -> None:
# Use global pattern to check for presence of any postcode.
m = self.global_pattern.fullmatch(word)
if m:
# If there was a match, check against each pattern separately
# because multiple patterns might be matching at the end.
cc = m.group('cc')
pc_word = m.group('pc')
cc_spaces = len(m.group('space') or '')
for pattern, info in self.local_patterns:
lm = pattern.fullmatch(pc_word) if fullmatch else pattern.match(pc_word)
if lm:
trange = (pos, pos + cc_spaces + sum(c in ' ,-:>' for c in lm.group(0)))
for out, out_ccs in info:
if cc is None or cc in out_ccs:
if out:
outcodes.add((*trange, lm.expand(out).upper()))
else:
outcodes.add((*trange, lm.group(0)[:-1].upper()))

View File

@@ -7,8 +7,9 @@
""" """
Datastructures for a tokenized query. Datastructures for a tokenized query.
""" """
from typing import List, Tuple, Optional, Iterator from typing import Dict, List, Tuple, Optional, Iterator
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from collections import defaultdict
import dataclasses import dataclasses
@@ -171,11 +172,33 @@ class TokenList:
 @dataclasses.dataclass
 class QueryNode:
     """ A node of the query representing a break between terms.
+
+        The node also contains information on the source term
+        ending at the node. The tokens are created from this information.
     """
     btype: BreakType
     ptype: PhraseType

+    penalty: float
+    """ Penalty for the break at this node.
+    """
+    term_lookup: str
+    """ Transliterated term following this node.
+    """
+    term_normalized: str
+    """ Normalised form of term following this node.
+        When the token resulted from a split during transliteration,
+        then this string contains the complete source term.
+    """
+
     starting: List[TokenList] = dataclasses.field(default_factory=list)

+    def adjust_break(self, btype: BreakType, penalty: float) -> None:
+        """ Change the break type and penalty for this node.
+        """
+        self.btype = btype
+        self.penalty = penalty
+
     def has_tokens(self, end: int, *ttypes: TokenType) -> bool:
         """ Check if there are tokens of the given types ending at the
             given node.
@@ -218,19 +241,22 @@ class QueryStruct:
     def __init__(self, source: List[Phrase]) -> None:
         self.source = source
         self.nodes: List[QueryNode] = \
-            [QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY)]
+            [QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY,
+                       0.0, '', '')]

     def num_token_slots(self) -> int:
         """ Return the length of the query in vertice steps.
         """
         return len(self.nodes) - 1

-    def add_node(self, btype: BreakType, ptype: PhraseType) -> None:
+    def add_node(self, btype: BreakType, ptype: PhraseType,
+                 break_penalty: float = 0.0,
+                 term_lookup: str = '', term_normalized: str = '') -> None:
         """ Append a new break node with the given break type.
             The phrase type denotes the type for any tokens starting
             at the node.
         """
-        self.nodes.append(QueryNode(btype, ptype))
+        self.nodes.append(QueryNode(btype, ptype, break_penalty, term_lookup, term_normalized))

     def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
         """ Add a token to the query. 'start' and 'end' are the indexes of the
@@ -287,3 +313,42 @@ class QueryStruct:
                 if t.token == token:
                     return f"[{tlist.ttype}]{t.lookup_word}"
         return 'None'
def get_transliterated_query(self) -> str:
""" Return a string representation of the transliterated query
with the character representation of the different break types.
For debugging purposes only.
"""
return ''.join(''.join((n.term_lookup, n.btype)) for n in self.nodes)
def extract_words(self, base_penalty: float = 0.0,
start: int = 0,
endpos: Optional[int] = None) -> Dict[str, List[TokenRange]]:
""" Add all combinations of words that can be formed from the terms
between the given start and endnode. The terms are joined with
spaces for each break. Words can never go across a BREAK_PHRASE.
The functions returns a dictionary of possible words with their
position within the query and a penalty. The penalty is computed
from the base_penalty plus the penalty for each node the word
crosses.
"""
if endpos is None:
endpos = len(self.nodes)
words: Dict[str, List[TokenRange]] = defaultdict(list)
for first in range(start, endpos - 1):
word = self.nodes[first + 1].term_lookup
penalty = base_penalty
words[word].append(TokenRange(first, first + 1, penalty=penalty))
if self.nodes[first + 1].btype != BREAK_PHRASE:
for last in range(first + 2, min(first + 20, endpos)):
word = ' '.join((word, self.nodes[last].term_lookup))
penalty += self.nodes[last - 1].penalty
words[word].append(TokenRange(first, last, penalty=penalty))
if self.nodes[last].btype == BREAK_PHRASE:
break
return words
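extract_words() replaces the former module-level helper of the ICU tokenizer and works directly on the node list. A small worked example of the penalty bookkeeping (hypothetical terms; the real unit test for this method appears further down in this PR):

```python
from nominatim_api.search import query as nq

# Hypothetical three-term query "foo bar / baz": the BREAK_PHRASE after 'bar'
# stops words from being joined across the phrase boundary.
q = nq.QueryStruct([])
q.add_node(nq.BREAK_WORD, nq.PHRASE_ANY, 0.1, 'foo', 'foo')
q.add_node(nq.BREAK_PHRASE, nq.PHRASE_ANY, 0.2, 'bar', 'bar')
q.add_node(nq.BREAK_END, nq.PHRASE_ANY, 0.0, 'baz', 'baz')

words = q.extract_words(base_penalty=0.5)

# Single terms always appear; 'foo bar' pays the 0.1 break penalty between
# its terms; 'bar baz' is never formed because 'bar' ends at a BREAK_PHRASE.
assert set(words) == {'foo', 'bar', 'baz', 'foo bar'}
assert words['foo bar'] == [nq.TokenRange(0, 2, penalty=0.6)]
```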

View File

@@ -381,76 +381,15 @@ class ICUNameAnalyzer(AbstractAnalyzer):
         return postcode.strip().upper()

     def update_postcodes_from_db(self) -> None:
-        """ Update postcode tokens in the word table from the location_postcode
-            table.
+        """ Postcode update.
+
+            Removes all postcodes from the word table because they are not
+            needed. Postcodes are recognised by pattern.
         """
         assert self.conn is not None

-        analyzer = self.token_analysis.analysis.get('@postcode')
-
         with self.conn.cursor() as cur:
+            cur.execute("DELETE FROM word WHERE type = 'P'")
-            # First get all postcode names currently in the word table.
cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
word_entries = set((entry[0] for entry in cur))
# Then compute the required postcode names from the postcode table.
needed_entries = set()
cur.execute("SELECT country_code, postcode FROM location_postcode")
for cc, postcode in cur:
info = PlaceInfo({'country_code': cc,
'class': 'place', 'type': 'postcode',
'address': {'postcode': postcode}})
address = self.sanitizer.process_names(info)[1]
for place in address:
if place.kind == 'postcode':
if analyzer is None:
postcode_name = place.name.strip().upper()
variant_base = None
else:
postcode_name = analyzer.get_canonical_id(place)
variant_base = place.get_attr("variant")
if variant_base:
needed_entries.add(f'{postcode_name}@{variant_base}')
else:
needed_entries.add(postcode_name)
break
# Now update the word table.
self._delete_unused_postcode_words(word_entries - needed_entries)
self._add_missing_postcode_words(needed_entries - word_entries)
def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
assert self.conn is not None
if tokens:
with self.conn.cursor() as cur:
cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
(list(tokens), ))
def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
assert self.conn is not None
if not tokens:
return
analyzer = self.token_analysis.analysis.get('@postcode')
terms = []
for postcode_name in tokens:
if '@' in postcode_name:
term, variant = postcode_name.split('@', 2)
term = self._search_normalized(term)
if analyzer is None:
variants = [term]
else:
variants = analyzer.compute_variants(variant)
if term not in variants:
variants.append(term)
else:
variants = [self._search_normalized(postcode_name)]
terms.append((postcode_name, variants))
if terms:
with self.conn.cursor() as cur:
cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
     def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                                should_replace: bool) -> None:
@@ -718,32 +657,9 @@ class ICUNameAnalyzer(AbstractAnalyzer):
         analyzer = self.token_analysis.analysis.get('@postcode')
         if analyzer is None:
-            postcode_name = item.name.strip().upper()
-            variant_base = None
+            return item.name.strip().upper()
         else:
+            return analyzer.get_canonical_id(item)
-            postcode_name = analyzer.get_canonical_id(item)
variant_base = item.get_attr("variant")
if variant_base:
postcode = f'{postcode_name}@{variant_base}'
else:
postcode = postcode_name
if postcode not in self._cache.postcodes:
term = self._search_normalized(postcode_name)
if not term:
return None
variants = {term}
if analyzer is not None and variant_base:
variants.update(analyzer.compute_variants(variant_base))
with self.conn.cursor() as cur:
cur.execute("SELECT create_postcode_word(%s, %s)",
(postcode, list(variants)))
self._cache.postcodes.add(postcode)
return postcode_name
class _TokenInfo:
@@ -836,5 +752,4 @@ class _TokenCache:
         self.names: Dict[str, Tuple[int, List[int]]] = {}
         self.partials: Dict[str, int] = {}
         self.fulls: Dict[str, List[int]] = {}
-        self.postcodes: Set[str] = set()
         self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}

View File

@@ -3,9 +3,8 @@
Feature: Searches with postcodes
    Various searches involving postcodes

-    @Fail
    Scenario: US 5+4 ZIP codes are shortened to 5 ZIP codes if not found
-        When sending json search query "36067 1111, us" with address
+        When sending json search query "36067-1111, us" with address
        Then result addresses contain
            | postcode |
            | 36067 |

View File

@@ -170,7 +170,7 @@ Feature: Import of postcodes
            | object | postcode |
            | W93 | 11200 |

-    Scenario: Postcodes are added to the postcode and word table
+    Scenario: Postcodes are added to the postcode
Given the places Given the places
| osm | class | type | addr+postcode | addr+housenumber | geometry | | osm | class | type | addr+postcode | addr+housenumber | geometry |
| N34 | place | house | 01982 | 111 |country:de | | N34 | place | house | 01982 | 111 |country:de |
@@ -178,7 +178,6 @@ Feature: Import of postcodes
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| de | 01982 | country:de | | de | 01982 | country:de |
And there are word tokens for postcodes 01982
    @Fail
@@ -195,7 +194,7 @@ Feature: Import of postcodes
| E45 2 | gb | 23 | 5 | | E45 2 | gb | 23 | 5 |
| Y45 | gb | 21 | 5 | | Y45 | gb | 21 | 5 |
-    Scenario: Postcodes outside all countries are not added to the postcode and word table
+    Scenario: Postcodes outside all countries are not added to the postcode table
Given the places Given the places
| osm | class | type | addr+postcode | addr+housenumber | addr+place | geometry | | osm | class | type | addr+postcode | addr+housenumber | addr+place | geometry |
| N34 | place | house | 01982 | 111 | Null Island | 0 0.00001 | | N34 | place | house | 01982 | 111 | Null Island | 0 0.00001 |
@@ -205,7 +204,6 @@ Feature: Import of postcodes
When importing When importing
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
And there are no word tokens for postcodes 01982
When sending search query "111, 01982 Null Island" When sending search query "111, 01982 Null Island"
Then results contain Then results contain
| osm | display_name | | osm | display_name |

View File

@@ -2,7 +2,7 @@
Feature: Update of postcode
    Tests for updating of data related to postcodes

-    Scenario: A new postcode appears in the postcode and word table
+    Scenario: A new postcode appears in the postcode table
Given the places Given the places
| osm | class | type | addr+postcode | addr+housenumber | geometry | | osm | class | type | addr+postcode | addr+housenumber | geometry |
| N34 | place | house | 01982 | 111 |country:de | | N34 | place | house | 01982 | 111 |country:de |
@@ -18,9 +18,8 @@ Feature: Update of postcode
| country | postcode | geometry | | country | postcode | geometry |
| de | 01982 | country:de | | de | 01982 | country:de |
| ch | 4567 | country:ch | | ch | 4567 | country:ch |
And there are word tokens for postcodes 01982,4567
-    Scenario: When the last postcode is deleted, it is deleted from postcode and word
+    Scenario: When the last postcode is deleted, it is deleted from postcode
Given the places Given the places
| osm | class | type | addr+postcode | addr+housenumber | geometry | | osm | class | type | addr+postcode | addr+housenumber | geometry |
| N34 | place | house | 01982 | 111 |country:de | | N34 | place | house | 01982 | 111 |country:de |
@@ -31,10 +30,8 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| ch | 4567 | country:ch | | ch | 4567 | country:ch |
And there are word tokens for postcodes 4567
And there are no word tokens for postcodes 01982
-    Scenario: A postcode is not deleted from postcode and word when it exist in another country
+    Scenario: A postcode is not deleted from postcode when it exist in another country
Given the places Given the places
| osm | class | type | addr+postcode | addr+housenumber | geometry | | osm | class | type | addr+postcode | addr+housenumber | geometry |
| N34 | place | house | 01982 | 111 |country:de | | N34 | place | house | 01982 | 111 |country:de |
@@ -45,7 +42,6 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| fr | 01982 | country:fr | | fr | 01982 | country:fr |
And there are word tokens for postcodes 01982
Scenario: Updating a postcode is reflected in postcode table Scenario: Updating a postcode is reflected in postcode table
Given the places Given the places
@@ -59,7 +55,6 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| de | 20453 | country:de | | de | 20453 | country:de |
And there are word tokens for postcodes 20453
Scenario: When changing from a postcode type, the entry appears in placex Scenario: When changing from a postcode type, the entry appears in placex
When importing When importing
@@ -80,7 +75,6 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| de | 20453 | country:de | | de | 20453 | country:de |
And there are word tokens for postcodes 20453
Scenario: When changing to a postcode type, the entry disappears from placex Scenario: When changing to a postcode type, the entry disappears from placex
When importing When importing
@@ -101,7 +95,6 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| de | 01982 | country:de | | de | 01982 | country:de |
And there are word tokens for postcodes 01982
Scenario: When a parent is deleted, the postcode gets a new parent Scenario: When a parent is deleted, the postcode gets a new parent
Given the grid with origin DE Given the grid with origin DE

View File

@@ -21,6 +21,9 @@ def mktoken(tid: int):
     return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,
                    lookup_word='foo')


+@pytest.fixture
+def qnode():
+    return query.QueryNode(query.BREAK_PHRASE, query.PHRASE_ANY, 0.0, '', '')


 @pytest.mark.parametrize('ptype,ttype', [(query.PHRASE_ANY, 'W'),
                                          (query.PHRASE_AMENITY, 'Q'),
@@ -37,27 +40,24 @@ def test_phrase_incompatible(ptype):
     assert not query._phrase_compatible_with(ptype, query.TOKEN_PARTIAL, True)


-def test_query_node_empty():
-    qn = query.QueryNode(query.BREAK_PHRASE, query.PHRASE_ANY)
-
-    assert not qn.has_tokens(3, query.TOKEN_PARTIAL)
-    assert qn.get_tokens(3, query.TOKEN_WORD) is None
+def test_query_node_empty(qnode):
+    assert not qnode.has_tokens(3, query.TOKEN_PARTIAL)
+    assert qnode.get_tokens(3, query.TOKEN_WORD) is None


-def test_query_node_with_content():
-    qn = query.QueryNode(query.BREAK_PHRASE, query.PHRASE_ANY)
-    qn.starting.append(query.TokenList(2, query.TOKEN_PARTIAL, [mktoken(100), mktoken(101)]))
-    qn.starting.append(query.TokenList(2, query.TOKEN_WORD, [mktoken(1000)]))
+def test_query_node_with_content(qnode):
+    qnode.starting.append(query.TokenList(2, query.TOKEN_PARTIAL, [mktoken(100), mktoken(101)]))
+    qnode.starting.append(query.TokenList(2, query.TOKEN_WORD, [mktoken(1000)]))

-    assert not qn.has_tokens(3, query.TOKEN_PARTIAL)
-    assert not qn.has_tokens(2, query.TOKEN_COUNTRY)
-    assert qn.has_tokens(2, query.TOKEN_PARTIAL)
-    assert qn.has_tokens(2, query.TOKEN_WORD)
+    assert not qnode.has_tokens(3, query.TOKEN_PARTIAL)
+    assert not qnode.has_tokens(2, query.TOKEN_COUNTRY)
+    assert qnode.has_tokens(2, query.TOKEN_PARTIAL)
+    assert qnode.has_tokens(2, query.TOKEN_WORD)

-    assert qn.get_tokens(3, query.TOKEN_PARTIAL) is None
-    assert qn.get_tokens(2, query.TOKEN_COUNTRY) is None
-    assert len(qn.get_tokens(2, query.TOKEN_PARTIAL)) == 2
-    assert len(qn.get_tokens(2, query.TOKEN_WORD)) == 1
+    assert qnode.get_tokens(3, query.TOKEN_PARTIAL) is None
+    assert qnode.get_tokens(2, query.TOKEN_COUNTRY) is None
+    assert len(qnode.get_tokens(2, query.TOKEN_PARTIAL)) == 2
+    assert len(qnode.get_tokens(2, query.TOKEN_WORD)) == 1


 def test_query_struct_empty():

View File

@@ -102,12 +102,11 @@ async def test_splitting_in_transliteration(conn):
 @pytest.mark.asyncio
 @pytest.mark.parametrize('term,order', [('23456', ['P', 'H', 'W', 'w']),
-                                        ('3', ['H', 'P', 'W', 'w'])
+                                        ('3', ['H', 'W', 'w'])
                                         ])
 async def test_penalty_postcodes_and_housenumbers(conn, term, order):
     ana = await tok.create_query_analyzer(conn)

-    await add_word(conn, 1, term, 'P', None)
     await add_word(conn, 2, term, 'H', term)
     await add_word(conn, 3, term, 'w', term)
     await add_word(conn, 4, term, 'W', term)
@@ -179,8 +178,10 @@ async def test_add_unknown_housenumbers(conn):
     assert query.nodes[1].starting[0].ttype == qmod.TOKEN_HOUSENUMBER
     assert len(query.nodes[1].starting[0].tokens) == 1
     assert query.nodes[1].starting[0].tokens[0].token == 1
-    assert not query.nodes[2].starting
-    assert not query.nodes[3].starting
+    assert query.nodes[2].has_tokens(3, qmod.TOKEN_POSTCODE)
+    assert not query.nodes[2].has_tokens(3, qmod.TOKEN_HOUSENUMBER)
+    assert not query.nodes[2].has_tokens(4, qmod.TOKEN_HOUSENUMBER)
+    assert not query.nodes[3].has_tokens(4, qmod.TOKEN_HOUSENUMBER)


 @pytest.mark.asyncio

View File

@@ -0,0 +1,154 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Test for parsing of postcodes in queries.
"""
import re
from itertools import zip_longest
import pytest
from nominatim_api.search.postcode_parser import PostcodeParser
from nominatim_api.search.query import QueryStruct, PHRASE_ANY, PHRASE_POSTCODE, PHRASE_STREET
@pytest.fixture
def pc_config(project_env):
country_file = project_env.project_dir / 'country_settings.yaml'
country_file.write_text(r"""
ab:
postcode:
pattern: "ddddd ll"
ba:
postcode:
pattern: "ddddd"
de:
postcode:
pattern: "ddddd"
gr:
postcode:
pattern: "(ddd) ?(dd)"
output: \1 \2
in:
postcode:
pattern: "(ddd) ?(ddd)"
output: \1\2
mc:
postcode:
pattern: "980dd"
mz:
postcode:
pattern: "(dddd)(?:-dd)?"
bn:
postcode:
pattern: "(ll) ?(dddd)"
output: \1\2
ky:
postcode:
pattern: "(d)-(dddd)"
output: KY\1-\2
""")
return project_env
def mk_query(inp):
query = QueryStruct([])
phrase_split = re.split(r"([ ,:'-])", inp)
for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue='>'):
query.add_node(breakchar, PHRASE_ANY, 0.1, word, word)
return query
@pytest.mark.parametrize('query,pos', [('45325 Berlin', 0),
('45325:Berlin', 0),
('45325,Berlin', 0),
('Berlin 45325', 1),
('Berlin,45325', 1),
('Berlin:45325', 1),
('Hansastr,45325 Berlin', 1),
('Hansastr 45325 Berlin', 1)])
def test_simple_postcode(pc_config, query, pos):
parser = PostcodeParser(pc_config)
result = parser.parse(mk_query(query))
assert result == {(pos, pos + 1, '45325'), (pos, pos + 1, '453 25')}
def test_contained_postcode(pc_config):
parser = PostcodeParser(pc_config)
assert parser.parse(mk_query('12345 dx')) == {(0, 1, '12345'), (0, 1, '123 45'),
(0, 2, '12345 DX')}
@pytest.mark.parametrize('query,frm,to', [('345987', 0, 1), ('345 987', 0, 2),
('Aina 345 987', 1, 3),
('Aina 23 345 987 ff', 2, 4)])
def test_postcode_with_space(pc_config, query, frm, to):
parser = PostcodeParser(pc_config)
result = parser.parse(mk_query(query))
assert result == {(frm, to, '345987')}
def test_overlapping_postcode(pc_config):
parser = PostcodeParser(pc_config)
assert parser.parse(mk_query('123 456 78')) == {(0, 2, '123456'), (1, 3, '456 78')}
@pytest.mark.parametrize('query', ['45325-Berlin', "45325'Berlin",
'Berlin-45325', "Berlin'45325", '45325Berlin'
'345-987', "345'987", '345,987', '345:987'])
def test_not_a_postcode(pc_config, query):
parser = PostcodeParser(pc_config)
assert not parser.parse(mk_query(query))
@pytest.mark.parametrize('query', ['ba 12233', 'ba-12233'])
def test_postcode_with_country_prefix(pc_config, query):
parser = PostcodeParser(pc_config)
assert (0, 2, '12233') in parser.parse(mk_query(query))
def test_postcode_with_joined_country_prefix(pc_config):
parser = PostcodeParser(pc_config)
assert parser.parse(mk_query('ba12233')) == {(0, 1, '12233')}
def test_postcode_with_non_matching_country_prefix(pc_config):
parser = PostcodeParser(pc_config)
assert not parser.parse(mk_query('ky12233'))
def test_postcode_inside_postcode_phrase(pc_config):
parser = PostcodeParser(pc_config)
query = QueryStruct([])
query.nodes[-1].ptype = PHRASE_STREET
query.add_node(',', PHRASE_STREET, 0.1, '12345', '12345')
query.add_node(',', PHRASE_POSTCODE, 0.1, 'xz', 'xz')
query.add_node('>', PHRASE_POSTCODE, 0.1, '4444', '4444')
assert parser.parse(query) == {(2, 3, '4444')}
def test_partial_postcode_in_postcode_phrase(pc_config):
parser = PostcodeParser(pc_config)
query = QueryStruct([])
query.nodes[-1].ptype = PHRASE_POSTCODE
query.add_node(' ', PHRASE_POSTCODE, 0.1, '2224', '2224')
query.add_node('>', PHRASE_POSTCODE, 0.1, '12345', '12345')
assert not parser.parse(query)

View File

@@ -46,3 +46,20 @@ def test_token_range_unimplemented_ops():
         nq.TokenRange(1, 3) <= nq.TokenRange(10, 12)
     with pytest.raises(TypeError):
         nq.TokenRange(1, 3) >= nq.TokenRange(10, 12)
def test_query_extract_words():
q = nq.QueryStruct([])
q.add_node(nq.BREAK_WORD, nq.PHRASE_ANY, 0.1, '12', '')
q.add_node(nq.BREAK_TOKEN, nq.PHRASE_ANY, 0.0, 'ab', '')
q.add_node(nq.BREAK_PHRASE, nq.PHRASE_ANY, 0.0, '12', '')
q.add_node(nq.BREAK_END, nq.PHRASE_ANY, 0.5, 'hallo', '')
words = q.extract_words(base_penalty=1.0)
assert set(words.keys()) \
== {'12', 'ab', 'hallo', '12 ab', 'ab 12', '12 ab 12'}
assert sorted(words['12']) == [nq.TokenRange(0, 1, 1.0), nq.TokenRange(2, 3, 1.0)]
assert words['12 ab'] == [nq.TokenRange(0, 2, 1.1)]
assert words['hallo'] == [nq.TokenRange(3, 4, 1.0)]

View File

@@ -265,37 +265,13 @@ class TestPostcodes:
                                              'address': {'postcode': postcode}}))

-    def test_update_postcodes_from_db_empty(self, table_factory, word_table):
-        table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
-                      content=(('de', '12345'), ('se', '132 34'),
-                               ('bm', 'AB23'), ('fr', '12345')))
-
-        self.analyzer.update_postcodes_from_db()
-
-        assert word_table.count() == 5
-        assert word_table.get_postcodes() == {'12345', '132 34@132 34', 'AB 23@AB 23'}
-
-    def test_update_postcodes_from_db_ambigious(self, table_factory, word_table):
-        table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
-                      content=(('in', '123456'), ('sg', '123456')))
-
-        self.analyzer.update_postcodes_from_db()
-
-        assert word_table.count() == 3
-        assert word_table.get_postcodes() == {'123456', '123456@123 456'}
-
-    def test_update_postcodes_from_db_add_and_remove(self, table_factory, word_table):
-        table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
-                      content=(('ch', '1234'), ('bm', 'BC 45'), ('bm', 'XX45')))
-
+    def test_update_postcodes_deleted(self, word_table):
         word_table.add_postcode(' 1234', '1234')
         word_table.add_postcode(' 5678', '5678')

         self.analyzer.update_postcodes_from_db()

-        assert word_table.count() == 5
-        assert word_table.get_postcodes() == {'1234', 'BC 45@BC 45', 'XX 45@XX 45'}
+        assert word_table.count() == 0

     def test_process_place_postcode_simple(self, word_table):
@@ -303,16 +279,12 @@ class TestPostcodes:
         assert info['postcode'] == '12345'

-        assert word_table.get_postcodes() == {'12345', }
-
     def test_process_place_postcode_with_space(self, word_table):
         info = self.process_postcode('in', '123 567')

         assert info['postcode'] == '123567'

-        assert word_table.get_postcodes() == {'123567@123 567', }


 def test_update_special_phrase_empty_table(analyzer, word_table):
@@ -477,9 +449,9 @@ class TestPlaceAddress:
     @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
     def test_process_place_postcode(self, word_table, pcode):
-        self.process_address(postcode=pcode)
+        info = self.process_address(postcode=pcode)

-        assert word_table.get_postcodes() == {pcode, }
+        assert info['postcode'] == pcode

     @pytest.mark.parametrize('hnr', ['123a', '1', '101'])