Merge pull request #3665 from lonvia/pattern-matching-postcodes

Add full parsing of postcodes in query
Sarah Hoffmann (committed via GitHub)
2025-03-05 16:02:03 +01:00
13 changed files with 476 additions and 295 deletions

View File

@@ -1809,7 +1809,8 @@ us:
languages: en
names: !include country-names/us.yaml
postcode:
pattern: "ddddd"
pattern: "(ddddd)(?:-dddd)?"
output: \1
# Uruguay (Uruguay)
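The `pattern` syntax is a shorthand that the new parser expands into a regular expression (`d` stands for a digit, `l` for a lower-case letter); `output` is a replacement template applied to the match. A small sketch of what the new US rule does, using plain `re` semantics outside the parser (the expansion mirrors `PostcodeParser.__init__` further down):

```python
import re

# Expand the shorthand exactly as PostcodeParser does: 'd' -> digit class.
pattern = "(ddddd)(?:-dddd)?".replace('d', '[0-9]').replace('l', '[a-z]')

m = re.fullmatch(pattern, '36067-1111')
assert m and m.expand(r'\1') == '36067'   # 'output: \1' keeps only the 5-digit part
```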

View File

@@ -8,7 +8,6 @@
Implementation of query analysis for the ICU tokenizer.
"""
from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
from collections import defaultdict
import dataclasses
import difflib
import re
@@ -25,7 +24,9 @@ from ..connection import SearchConnection
from ..logging import log
from . import query as qmod
from ..query_preprocessing.config import QueryConfig
from ..query_preprocessing.base import QueryProcessingFunc
from .query_analyzer_factory import AbstractQueryAnalyzer
from .postcode_parser import PostcodeParser
DB_TO_TOKEN_TYPE = {
@@ -47,42 +48,6 @@ PENALTY_IN_TOKEN_BREAK = {
}
@dataclasses.dataclass
class QueryPart:
""" Normalized and transliterated form of a single term in the query.
When the term came out of a split during the transliteration,
the normalized string is the full word before transliteration.
Check the subsequent break type to figure out if the word is
continued.
Penalty is the break penalty for the break following the token.
"""
token: str
normalized: str
penalty: float
QueryParts = List[QueryPart]
WordDict = Dict[str, List[qmod.TokenRange]]
def extract_words(terms: List[QueryPart], start: int, words: WordDict) -> None:
""" Add all combinations of words in the terms list after the
given position to the word list.
"""
total = len(terms)
base_penalty = PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD]
for first in range(start, total):
word = terms[first].token
penalty = base_penalty
words[word].append(qmod.TokenRange(first, first + 1, penalty=penalty))
for last in range(first + 1, min(first + 20, total)):
word = ' '.join((word, terms[last].token))
penalty += terms[last - 1].penalty
words[word].append(qmod.TokenRange(first, last + 1, penalty=penalty))
@dataclasses.dataclass
class ICUToken(qmod.Token):
""" Specialised token for ICU tokenizer.
@@ -148,60 +113,51 @@ class ICUToken(qmod.Token):
addr_count=max(1, addr_count))
class ICUQueryAnalyzer(AbstractQueryAnalyzer):
""" Converter for query strings into a tokenized query
using the tokens created by an ICU tokenizer.
"""
def __init__(self, conn: SearchConnection) -> None:
self.conn = conn
@dataclasses.dataclass
class ICUAnalyzerConfig:
postcode_parser: PostcodeParser
normalizer: Transliterator
transliterator: Transliterator
preprocessors: List[QueryProcessingFunc]
async def setup(self) -> None:
""" Set up static data structures needed for the analysis.
"""
async def _make_normalizer() -> Any:
rules = await self.conn.get_property('tokenizer_import_normalisation')
return Transliterator.createFromRules("normalization", rules)
@staticmethod
async def create(conn: SearchConnection) -> 'ICUAnalyzerConfig':
rules = await conn.get_property('tokenizer_import_normalisation')
normalizer = Transliterator.createFromRules("normalization", rules)
self.normalizer = await self.conn.get_cached_value('ICUTOK', 'normalizer',
_make_normalizer)
rules = await conn.get_property('tokenizer_import_transliteration')
transliterator = Transliterator.createFromRules("transliteration", rules)
async def _make_transliterator() -> Any:
rules = await self.conn.get_property('tokenizer_import_transliteration')
return Transliterator.createFromRules("transliteration", rules)
self.transliterator = await self.conn.get_cached_value('ICUTOK', 'transliterator',
_make_transliterator)
await self._setup_preprocessing()
if 'word' not in self.conn.t.meta.tables:
sa.Table('word', self.conn.t.meta,
sa.Column('word_id', sa.Integer),
sa.Column('word_token', sa.Text, nullable=False),
sa.Column('type', sa.Text, nullable=False),
sa.Column('word', sa.Text),
sa.Column('info', Json))
async def _setup_preprocessing(self) -> None:
""" Load the rules for preprocessing and set up the handlers.
"""
rules = self.conn.config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG')
preprocessing_rules = rules.get('query-preprocessing', [])
self.preprocessors = []
preprocessing_rules = conn.config.load_sub_configuration('icu_tokenizer.yaml',
config='TOKENIZER_CONFIG')\
.get('query-preprocessing', [])
preprocessors: List[QueryProcessingFunc] = []
for func in preprocessing_rules:
if 'step' not in func:
raise UsageError("Preprocessing rule is missing the 'step' attribute.")
if not isinstance(func['step'], str):
raise UsageError("'step' attribute must be a simple string.")
module = self.conn.config.load_plugin_module(
module = conn.config.load_plugin_module(
func['step'], 'nominatim_api.query_preprocessing')
self.preprocessors.append(
module.create(QueryConfig(func).set_normalizer(self.normalizer)))
preprocessors.append(
module.create(QueryConfig(func).set_normalizer(normalizer)))
return ICUAnalyzerConfig(PostcodeParser(conn.config),
normalizer, transliterator, preprocessors)
class ICUQueryAnalyzer(AbstractQueryAnalyzer):
""" Converter for query strings into a tokenized query
using the tokens created by an ICU tokenizer.
"""
def __init__(self, conn: SearchConnection, config: ICUAnalyzerConfig) -> None:
self.conn = conn
self.postcode_parser = config.postcode_parser
self.normalizer = config.normalizer
self.transliterator = config.transliterator
self.preprocessors = config.preprocessors
async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
""" Analyze the given list of phrases and return the
@@ -216,8 +172,9 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
if not query.source:
return query
parts, words = self.split_query(query)
log().var_dump('Transliterated query', lambda: _dump_transliterated(query, parts))
self.split_query(query)
log().var_dump('Transliterated query', lambda: query.get_transliterated_query())
words = query.extract_words(base_penalty=PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD])
for row in await self.lookup_in_db(list(words.keys())):
for trange in words[row.word_token]:
@@ -234,8 +191,13 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
else:
query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token)
self.add_extra_tokens(query, parts)
self.rerank_tokens(query, parts)
self.add_extra_tokens(query)
for start, end, pc in self.postcode_parser.parse(query):
query.add_token(qmod.TokenRange(start, end),
qmod.TOKEN_POSTCODE,
ICUToken(penalty=0.1, token=0, count=1, addr_count=1,
lookup_word=pc, word_token=pc, info=None))
self.rerank_tokens(query)
log().table_dump('Word tokens', _dump_word_tokens(query))
@@ -248,16 +210,9 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
"""
return cast(str, self.normalizer.transliterate(text)).strip('-: ')
def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
def split_query(self, query: qmod.QueryStruct) -> None:
""" Transliterate the phrases and split them into tokens.
Returns the list of transliterated tokens together with their
normalized form and a dictionary of words for lookup together
with their position.
"""
parts: QueryParts = []
phrase_start = 0
words: WordDict = defaultdict(list)
for phrase in query.source:
query.nodes[-1].ptype = phrase.ptype
phrase_split = re.split('([ :-])', phrase.text)
@@ -272,38 +227,42 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
if trans:
for term in trans.split(' '):
if term:
parts.append(QueryPart(term, word,
PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN]))
query.add_node(qmod.BREAK_TOKEN, phrase.ptype)
query.nodes[-1].btype = breakchar
parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[breakchar]
query.add_node(qmod.BREAK_TOKEN, phrase.ptype,
PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN],
term, word)
query.nodes[-1].adjust_break(breakchar,
PENALTY_IN_TOKEN_BREAK[breakchar])
extract_words(parts, phrase_start, words)
phrase_start = len(parts)
query.nodes[-1].btype = qmod.BREAK_END
return parts, words
query.nodes[-1].adjust_break(qmod.BREAK_END, PENALTY_IN_TOKEN_BREAK[qmod.BREAK_END])
async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
""" Return the token information from the database for the
given word tokens.
This function excludes postcode tokens.
"""
t = self.conn.t.meta.tables['word']
return await self.conn.execute(t.select().where(t.c.word_token.in_(words)))
return await self.conn.execute(t.select()
.where(t.c.word_token.in_(words))
.where(t.c.type != 'P'))
def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
def add_extra_tokens(self, query: qmod.QueryStruct) -> None:
""" Add tokens to query that are not saved in the database.
"""
for part, node, i in zip(parts, query.nodes, range(1000)):
if len(part.token) <= 4 and part.token.isdigit()\
and not node.has_tokens(i+1, qmod.TOKEN_HOUSENUMBER):
query.add_token(qmod.TokenRange(i, i+1), qmod.TOKEN_HOUSENUMBER,
need_hnr = False
for i, node in enumerate(query.nodes):
is_full_token = node.btype not in (qmod.BREAK_TOKEN, qmod.BREAK_PART)
if need_hnr and is_full_token \
and len(node.term_normalized) <= 4 and node.term_normalized.isdigit():
query.add_token(qmod.TokenRange(i-1, i), qmod.TOKEN_HOUSENUMBER,
ICUToken(penalty=0.5, token=0,
count=1, addr_count=1, lookup_word=part.token,
word_token=part.token, info=None))
count=1, addr_count=1,
lookup_word=node.term_lookup,
word_token=node.term_lookup, info=None))
def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
need_hnr = is_full_token and not node.has_tokens(i+1, qmod.TOKEN_HOUSENUMBER)
def rerank_tokens(self, query: qmod.QueryStruct) -> None:
""" Add penalties to tokens that depend on presence of other token.
"""
for i, node, tlist in query.iter_token_lists():
@@ -320,28 +279,22 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
if repl.end == tlist.end and repl.ttype != qmod.TOKEN_HOUSENUMBER:
repl.add_penalty(0.5 - tlist.tokens[0].penalty)
elif tlist.ttype not in (qmod.TOKEN_COUNTRY, qmod.TOKEN_PARTIAL):
norm = parts[i].normalized
for j in range(i + 1, tlist.end):
if node.btype != qmod.BREAK_TOKEN:
norm += ' ' + parts[j].normalized
norm = ' '.join(n.term_normalized for n in query.nodes[i + 1:tlist.end + 1]
if n.btype != qmod.BREAK_TOKEN)
if not norm:
# Can happen when the token only covers a partial term
norm = query.nodes[i + 1].term_normalized
for token in tlist.tokens:
cast(ICUToken, token).rematch(norm)
def _dump_transliterated(query: qmod.QueryStruct, parts: QueryParts) -> str:
out = query.nodes[0].btype
for node, part in zip(query.nodes[1:], parts):
out += part.token + node.btype
return out
def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
yield ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
for node in query.nodes:
yield ['type', 'from', 'to', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
for i, node in enumerate(query.nodes):
for tlist in node.starting:
for token in tlist.tokens:
t = cast(ICUToken, token)
yield [tlist.ttype, t.token, t.word_token or '',
yield [tlist.ttype, str(i), str(tlist.end), t.token, t.word_token or '',
t.lookup_word or '', t.penalty, t.count, t.info]
@@ -349,7 +302,17 @@ async def create_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer
""" Create and set up a new query analyzer for a database based
on the ICU tokenizer.
"""
out = ICUQueryAnalyzer(conn)
await out.setup()
async def _get_config() -> ICUAnalyzerConfig:
if 'word' not in conn.t.meta.tables:
sa.Table('word', conn.t.meta,
sa.Column('word_id', sa.Integer),
sa.Column('word_token', sa.Text, nullable=False),
sa.Column('type', sa.Text, nullable=False),
sa.Column('word', sa.Text),
sa.Column('info', Json))
return out
return await ICUAnalyzerConfig.create(conn)
config = await conn.get_cached_value('ICUTOK', 'config', _get_config)
return ICUQueryAnalyzer(conn, config)
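`create_query_analyzer` now splits setup in two: the expensive, immutable pieces (normalizer, transliterator, preprocessors, postcode parser) are built once and shared through the connection's value cache, while each `ICUQueryAnalyzer` stays a cheap wrapper around them. A minimal sketch of that pattern; `ConnStub` is an illustrative stand-in for the caching semantics, not the real `SearchConnection` API:

```python
import asyncio
from typing import Any, Awaitable, Callable, Dict


class ConnStub:
    """ Illustrative stand-in for SearchConnection.get_cached_value(). """
    def __init__(self) -> None:
        self._cache: Dict[str, Any] = {}

    async def get_cached_value(self, scope: str, name: str,
                               factory: Callable[[], Awaitable[Any]]) -> Any:
        key = f'{scope}:{name}'
        if key not in self._cache:
            self._cache[key] = await factory()   # expensive setup runs once
        return self._cache[key]


async def main() -> None:
    conn = ConnStub()

    async def _build_config() -> dict:
        print('building config')                 # printed only on the first call
        return {'normalizer': object()}

    first = await conn.get_cached_value('ICUTOK', 'config', _build_config)
    second = await conn.get_cached_value('ICUTOK', 'config', _build_config)
    assert first is second                       # analyzers share one config


asyncio.run(main())
```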

View File

@@ -0,0 +1,103 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Handling of arbitrary postcode tokens in a tokenized query string.
"""
from typing import Tuple, Set, Dict, List
import re
from collections import defaultdict
import yaml
from ..config import Configuration
from . import query as qmod
class PostcodeParser:
""" Pattern-based parser for postcodes in tokenized queries.
The postcode patterns are read from the country configuration.
The parser currently does not return country restrictions.
"""
def __init__(self, config: Configuration) -> None:
# skip over includes here to avoid loading the complete country name data
yaml.add_constructor('!include', lambda loader, node: [],
Loader=yaml.SafeLoader)
cdata = yaml.safe_load(config.find_config_file('country_settings.yaml')
.read_text(encoding='utf-8'))
unique_patterns: Dict[str, Dict[str, List[str]]] = {}
for cc, data in cdata.items():
if data.get('postcode'):
pat = data['postcode']['pattern'].replace('d', '[0-9]').replace('l', '[a-z]')
out = data['postcode'].get('output')
if pat not in unique_patterns:
unique_patterns[pat] = defaultdict(list)
unique_patterns[pat][out].append(cc)
self.global_pattern = re.compile(
'(?:(?P<cc>[a-z][a-z])(?P<space>[ -]?))?(?P<pc>(?:(?:'
+ ')|(?:'.join(unique_patterns) + '))[:, >].*)')
self.local_patterns = [(re.compile(f"{pat}[:, >]"), list(info.items()))
for pat, info in unique_patterns.items()]
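Matching is two-staged: the single `global_pattern` is a cheap gate that asks whether *any* configured pattern occurs at the start of the candidate word, and only on a hit are the individual `local_patterns` consulted, because several countries' patterns may fit the same digits. A stand-alone illustration using just the German (`ddddd`) and Greek (`(ddd) ?(dd)`) patterns from the test configuration further down:

```python
import re

# Patterns as expanded in __init__; values name the owning country.
unique_patterns = {
    '[0-9][0-9][0-9][0-9][0-9]': 'de',
    '([0-9][0-9][0-9]) ?([0-9][0-9])': 'gr',
}

global_pattern = re.compile(
    '(?:(?P<cc>[a-z][a-z])(?P<space>[ -]?))?(?P<pc>(?:(?:'
    + ')|(?:'.join(unique_patterns) + '))[:, >].*)')

# Stage 1: one combined check. The trailing '>' is the end-of-query break.
m = global_pattern.fullmatch('45325 berlin>')
assert m is not None

# Stage 2: attribute the hit to each country pattern separately.
for pat, cc in unique_patterns.items():
    lm = re.match(f'{pat}[:, >]', m.group('pc'))
    if lm:
        print(cc, repr(lm.group(0)))   # both 'de' and 'gr' claim '45325 '
```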
def parse(self, query: qmod.QueryStruct) -> Set[Tuple[int, int, str]]:
""" Parse postcodes in the given list of query tokens taking into
account the list of breaks from the nodes.
The result is a sequence of tuples with
[start node id, end node id, postcode token]
"""
nodes = query.nodes
outcodes: Set[Tuple[int, int, str]] = set()
for i in range(query.num_token_slots()):
if nodes[i].btype in '<,: ' and nodes[i + 1].btype != '`' \
and (i == 0 or nodes[i - 1].ptype != qmod.PHRASE_POSTCODE):
if nodes[i].ptype == qmod.PHRASE_ANY:
word = nodes[i + 1].term_normalized + nodes[i + 1].btype
if word[-1] in ' -' and nodes[i + 2].btype != '`' \
and nodes[i + 1].ptype == qmod.PHRASE_ANY:
word += nodes[i + 2].term_normalized + nodes[i + 2].btype
if word[-1] in ' -' and nodes[i + 3].btype != '`' \
and nodes[i + 2].ptype == qmod.PHRASE_ANY:
word += nodes[i + 3].term_normalized + nodes[i + 3].btype
self._match_word(word, i, False, outcodes)
elif nodes[i].ptype == qmod.PHRASE_POSTCODE:
word = nodes[i + 1].term_normalized + nodes[i + 1].btype
for j in range(i + 1, query.num_token_slots()):
if nodes[j].ptype != qmod.PHRASE_POSTCODE:
break
word += nodes[j + 1].term_normalized + nodes[j + 1].btype
self._match_word(word, i, True, outcodes)
return outcodes
def _match_word(self, word: str, pos: int, fullmatch: bool,
outcodes: Set[Tuple[int, int, str]]) -> None:
# Use global pattern to check for presence of any postcode.
m = self.global_pattern.fullmatch(word)
if m:
# If there was a match, check against each pattern separately
because multiple patterns might match at the end.
cc = m.group('cc')
pc_word = m.group('pc')
cc_spaces = len(m.group('space') or '')
for pattern, info in self.local_patterns:
lm = pattern.fullmatch(pc_word) if fullmatch else pattern.match(pc_word)
if lm:
trange = (pos, pos + cc_spaces + sum(c in ' ,-:>' for c in lm.group(0)))
for out, out_ccs in info:
if cc is None or cc in out_ccs:
if out:
outcodes.add((*trange, lm.expand(out).upper()))
else:
outcodes.add((*trange, lm.group(0)[:-1].upper()))
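One subtlety in `_match_word`: the returned range is measured in query nodes, not characters. Every break character (space, `,`, `-`, `:`, `>`) inside the matched text closes one node, so counting them, plus any space consumed after a country prefix, converts the character match into a node range. A worked example for the Indian pattern against 'Aina 345 987' (cf. `test_postcode_with_space` in the new test file below):

```python
# group(0) of '([0-9][0-9][0-9]) ?([0-9][0-9][0-9])[:, >]' matched against
# the word assembled from the nodes after 'aina':
matched = '345 987>'
pos = 1                                    # match starts at node 1, after 'aina'
end = pos + sum(c in ' ,-:>' for c in matched)
assert (pos, end) == (1, 3)                # the expected (frm, to) in the test
```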

View File

@@ -7,8 +7,9 @@
"""
Datastructures for a tokenized query.
"""
from typing import List, Tuple, Optional, Iterator
from typing import Dict, List, Tuple, Optional, Iterator
from abc import ABC, abstractmethod
from collections import defaultdict
import dataclasses
@@ -171,11 +172,33 @@ class TokenList:
@dataclasses.dataclass
class QueryNode:
""" A node of the query representing a break between terms.
The node also contains information on the source term
ending at the node. The tokens are created from this information.
"""
btype: BreakType
ptype: PhraseType
penalty: float
""" Penalty for the break at this node.
"""
term_lookup: str
""" Transliterated term following this node.
"""
term_normalized: str
""" Normalised form of term following this node.
When the token resulted from a split during transliteration,
then this string contains the complete source term.
"""
starting: List[TokenList] = dataclasses.field(default_factory=list)
def adjust_break(self, btype: BreakType, penalty: float) -> None:
""" Change the break type and penalty for this node.
"""
self.btype = btype
self.penalty = penalty
def has_tokens(self, end: int, *ttypes: TokenType) -> bool:
""" Check if there are tokens of the given types ending at the
given node.
@@ -218,19 +241,22 @@ class QueryStruct:
def __init__(self, source: List[Phrase]) -> None:
self.source = source
self.nodes: List[QueryNode] = \
[QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY)]
[QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY,
0.0, '', '')]
def num_token_slots(self) -> int:
""" Return the length of the query in vertice steps.
"""
return len(self.nodes) - 1
def add_node(self, btype: BreakType, ptype: PhraseType) -> None:
def add_node(self, btype: BreakType, ptype: PhraseType,
break_penalty: float = 0.0,
term_lookup: str = '', term_normalized: str = '') -> None:
""" Append a new break node with the given break type.
The phrase type denotes the type for any tokens starting
at the node.
"""
self.nodes.append(QueryNode(btype, ptype))
self.nodes.append(QueryNode(btype, ptype, break_penalty, term_lookup, term_normalized))
def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
""" Add a token to the query. 'start' and 'end' are the indexes of the
@@ -287,3 +313,42 @@ class QueryStruct:
if t.token == token:
return f"[{tlist.ttype}]{t.lookup_word}"
return 'None'
def get_transliterated_query(self) -> str:
""" Return a string representation of the transliterated query
with the character representation of the different break types.
For debugging purposes only.
"""
return ''.join(''.join((n.term_lookup, n.btype)) for n in self.nodes)
def extract_words(self, base_penalty: float = 0.0,
start: int = 0,
endpos: Optional[int] = None) -> Dict[str, List[TokenRange]]:
""" Add all combinations of words that can be formed from the terms
between the given start and end node. The terms are joined with
spaces for each break. Words can never go across a BREAK_PHRASE.
The function returns a dictionary of possible words with their
position within the query and a penalty. The penalty is computed
from the base_penalty plus the penalty for each node the word
crosses.
"""
if endpos is None:
endpos = len(self.nodes)
words: Dict[str, List[TokenRange]] = defaultdict(list)
for first in range(start, endpos - 1):
word = self.nodes[first + 1].term_lookup
penalty = base_penalty
words[word].append(TokenRange(first, first + 1, penalty=penalty))
if self.nodes[first + 1].btype != BREAK_PHRASE:
for last in range(first + 2, min(first + 20, endpos)):
word = ' '.join((word, self.nodes[last].term_lookup))
penalty += self.nodes[last - 1].penalty
words[word].append(TokenRange(first, last, penalty=penalty))
if self.nodes[last].btype == BREAK_PHRASE:
break
return words
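The windowing is easier to see in isolation. This stand-alone sketch reproduces the enumeration and penalty accumulation over a plain list of (term, break penalty) pairs; the BREAK_PHRASE early exit and the node offsets of the real method are deliberately omitted:

```python
from collections import defaultdict
from typing import Dict, List, Tuple


def extract_words_demo(terms: List[Tuple[str, float]],
                       base_penalty: float) -> Dict[str, List[Tuple[int, int, float]]]:
    # terms: (lookup form, penalty of the break *following* the term)
    words: Dict[str, List[Tuple[int, int, float]]] = defaultdict(list)
    for first in range(len(terms)):
        word, penalty = terms[first][0], base_penalty
        words[word].append((first, first + 1, penalty))
        for last in range(first + 1, min(first + 19, len(terms))):
            word = ' '.join((word, terms[last][0]))
            penalty += terms[last - 1][1]      # pay for every break crossed
            words[word].append((first, last + 1, penalty))
    return words


words = extract_words_demo([('12', 0.1), ('ab', 0.0), ('12', 0.5)], base_penalty=1.0)
assert words['12 ab'] == [(0, 2, 1.1)]         # cf. test_query_extract_words below
```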

View File

@@ -381,76 +381,15 @@ class ICUNameAnalyzer(AbstractAnalyzer):
return postcode.strip().upper()
def update_postcodes_from_db(self) -> None:
""" Update postcode tokens in the word table from the location_postcode
table.
""" Postcode update.
Removes all postcodes from the word table because they are no longer
needed. Postcodes are now recognised by pattern.
"""
assert self.conn is not None
analyzer = self.token_analysis.analysis.get('@postcode')
with self.conn.cursor() as cur:
# First get all postcode names currently in the word table.
cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
word_entries = set((entry[0] for entry in cur))
# Then compute the required postcode names from the postcode table.
needed_entries = set()
cur.execute("SELECT country_code, postcode FROM location_postcode")
for cc, postcode in cur:
info = PlaceInfo({'country_code': cc,
'class': 'place', 'type': 'postcode',
'address': {'postcode': postcode}})
address = self.sanitizer.process_names(info)[1]
for place in address:
if place.kind == 'postcode':
if analyzer is None:
postcode_name = place.name.strip().upper()
variant_base = None
else:
postcode_name = analyzer.get_canonical_id(place)
variant_base = place.get_attr("variant")
if variant_base:
needed_entries.add(f'{postcode_name}@{variant_base}')
else:
needed_entries.add(postcode_name)
break
# Now update the word table.
self._delete_unused_postcode_words(word_entries - needed_entries)
self._add_missing_postcode_words(needed_entries - word_entries)
def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
assert self.conn is not None
if tokens:
with self.conn.cursor() as cur:
cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
(list(tokens), ))
def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
assert self.conn is not None
if not tokens:
return
analyzer = self.token_analysis.analysis.get('@postcode')
terms = []
for postcode_name in tokens:
if '@' in postcode_name:
term, variant = postcode_name.split('@', 2)
term = self._search_normalized(term)
if analyzer is None:
variants = [term]
else:
variants = analyzer.compute_variants(variant)
if term not in variants:
variants.append(term)
else:
variants = [self._search_normalized(postcode_name)]
terms.append((postcode_name, variants))
if terms:
with self.conn.cursor() as cur:
cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
cur.execute("DELETE FROM word WHERE type = 'P'")
def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
should_replace: bool) -> None:
@@ -718,32 +657,9 @@ class ICUNameAnalyzer(AbstractAnalyzer):
analyzer = self.token_analysis.analysis.get('@postcode')
if analyzer is None:
postcode_name = item.name.strip().upper()
variant_base = None
return item.name.strip().upper()
else:
postcode_name = analyzer.get_canonical_id(item)
variant_base = item.get_attr("variant")
if variant_base:
postcode = f'{postcode_name}@{variant_base}'
else:
postcode = postcode_name
if postcode not in self._cache.postcodes:
term = self._search_normalized(postcode_name)
if not term:
return None
variants = {term}
if analyzer is not None and variant_base:
variants.update(analyzer.compute_variants(variant_base))
with self.conn.cursor() as cur:
cur.execute("SELECT create_postcode_word(%s, %s)",
(postcode, list(variants)))
self._cache.postcodes.add(postcode)
return postcode_name
return analyzer.get_canonical_id(item)
class _TokenInfo:
@@ -836,5 +752,4 @@ class _TokenCache:
self.names: Dict[str, Tuple[int, List[int]]] = {}
self.partials: Dict[str, int] = {}
self.fulls: Dict[str, List[int]] = {}
self.postcodes: Set[str] = set()
self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}

View File

@@ -3,9 +3,8 @@
Feature: Searches with postcodes
Various searches involving postcodes
@Fail
Scenario: US 5+4 ZIP codes are shortened to 5-digit ZIP codes if not found
When sending json search query "36067 1111, us" with address
When sending json search query "36067-1111, us" with address
Then result addresses contain
| postcode |
| 36067 |

View File

@@ -170,7 +170,7 @@ Feature: Import of postcodes
| object | postcode |
| W93 | 11200 |
Scenario: Postcodes are added to the postcode and word table
Scenario: Postcodes are added to the postcode table
Given the places
| osm | class | type | addr+postcode | addr+housenumber | geometry |
| N34 | place | house | 01982 | 111 |country:de |
@@ -178,7 +178,6 @@ Feature: Import of postcodes
Then location_postcode contains exactly
| country | postcode | geometry |
| de | 01982 | country:de |
And there are word tokens for postcodes 01982
@Fail
@@ -195,7 +194,7 @@ Feature: Import of postcodes
| E45 2 | gb | 23 | 5 |
| Y45 | gb | 21 | 5 |
Scenario: Postcodes outside all countries are not added to the postcode and word table
Scenario: Postcodes outside all countries are not added to the postcode table
Given the places
| osm | class | type | addr+postcode | addr+housenumber | addr+place | geometry |
| N34 | place | house | 01982 | 111 | Null Island | 0 0.00001 |
@@ -205,7 +204,6 @@ Feature: Import of postcodes
When importing
Then location_postcode contains exactly
| country | postcode | geometry |
And there are no word tokens for postcodes 01982
When sending search query "111, 01982 Null Island"
Then results contain
| osm | display_name |

View File

@@ -2,7 +2,7 @@
Feature: Update of postcode
Tests for updating of data related to postcodes
Scenario: A new postcode appears in the postcode and word table
Scenario: A new postcode appears in the postcode table
Given the places
| osm | class | type | addr+postcode | addr+housenumber | geometry |
| N34 | place | house | 01982 | 111 |country:de |
@@ -18,9 +18,8 @@ Feature: Update of postcode
| country | postcode | geometry |
| de | 01982 | country:de |
| ch | 4567 | country:ch |
And there are word tokens for postcodes 01982,4567
Scenario: When the last postcode is deleted, it is deleted from postcode and word
Scenario: When the last postcode is deleted, it is deleted from the postcode table
Given the places
| osm | class | type | addr+postcode | addr+housenumber | geometry |
| N34 | place | house | 01982 | 111 |country:de |
@@ -31,10 +30,8 @@ Feature: Update of postcode
Then location_postcode contains exactly
| country | postcode | geometry |
| ch | 4567 | country:ch |
And there are word tokens for postcodes 4567
And there are no word tokens for postcodes 01982
Scenario: A postcode is not deleted from postcode and word when it exists in another country
Scenario: A postcode is not deleted from the postcode table when it exists in another country
Given the places
| osm | class | type | addr+postcode | addr+housenumber | geometry |
| N34 | place | house | 01982 | 111 |country:de |
@@ -45,7 +42,6 @@ Feature: Update of postcode
Then location_postcode contains exactly
| country | postcode | geometry |
| fr | 01982 | country:fr |
And there are word tokens for postcodes 01982
Scenario: Updating a postcode is reflected in postcode table
Given the places
@@ -59,7 +55,6 @@ Feature: Update of postcode
Then location_postcode contains exactly
| country | postcode | geometry |
| de | 20453 | country:de |
And there are word tokens for postcodes 20453
Scenario: When changing from a postcode type, the entry appears in placex
When importing
@@ -80,7 +75,6 @@ Feature: Update of postcode
Then location_postcode contains exactly
| country | postcode | geometry |
| de | 20453 | country:de |
And there are word tokens for postcodes 20453
Scenario: When changing to a postcode type, the entry disappears from placex
When importing
@@ -101,7 +95,6 @@ Feature: Update of postcode
Then location_postcode contains exactly
| country | postcode | geometry |
| de | 01982 | country:de |
And there are word tokens for postcodes 01982
Scenario: When a parent is deleted, the postcode gets a new parent
Given the grid with origin DE

View File

@@ -21,6 +21,9 @@ def mktoken(tid: int):
return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,
lookup_word='foo')
@pytest.fixture
def qnode():
return query.QueryNode(query.BREAK_PHRASE, query.PHRASE_ANY, 0.0, '', '')
@pytest.mark.parametrize('ptype,ttype', [(query.PHRASE_ANY, 'W'),
(query.PHRASE_AMENITY, 'Q'),
@@ -37,27 +40,24 @@ def test_phrase_incompatible(ptype):
assert not query._phrase_compatible_with(ptype, query.TOKEN_PARTIAL, True)
def test_query_node_empty():
qn = query.QueryNode(query.BREAK_PHRASE, query.PHRASE_ANY)
assert not qn.has_tokens(3, query.TOKEN_PARTIAL)
assert qn.get_tokens(3, query.TOKEN_WORD) is None
def test_query_node_empty(qnode):
assert not qnode.has_tokens(3, query.TOKEN_PARTIAL)
assert qnode.get_tokens(3, query.TOKEN_WORD) is None
def test_query_node_with_content():
qn = query.QueryNode(query.BREAK_PHRASE, query.PHRASE_ANY)
qn.starting.append(query.TokenList(2, query.TOKEN_PARTIAL, [mktoken(100), mktoken(101)]))
qn.starting.append(query.TokenList(2, query.TOKEN_WORD, [mktoken(1000)]))
def test_query_node_with_content(qnode):
qnode.starting.append(query.TokenList(2, query.TOKEN_PARTIAL, [mktoken(100), mktoken(101)]))
qnode.starting.append(query.TokenList(2, query.TOKEN_WORD, [mktoken(1000)]))
assert not qn.has_tokens(3, query.TOKEN_PARTIAL)
assert not qn.has_tokens(2, query.TOKEN_COUNTRY)
assert qn.has_tokens(2, query.TOKEN_PARTIAL)
assert qn.has_tokens(2, query.TOKEN_WORD)
assert not qnode.has_tokens(3, query.TOKEN_PARTIAL)
assert not qnode.has_tokens(2, query.TOKEN_COUNTRY)
assert qnode.has_tokens(2, query.TOKEN_PARTIAL)
assert qnode.has_tokens(2, query.TOKEN_WORD)
assert qn.get_tokens(3, query.TOKEN_PARTIAL) is None
assert qn.get_tokens(2, query.TOKEN_COUNTRY) is None
assert len(qn.get_tokens(2, query.TOKEN_PARTIAL)) == 2
assert len(qn.get_tokens(2, query.TOKEN_WORD)) == 1
assert qnode.get_tokens(3, query.TOKEN_PARTIAL) is None
assert qnode.get_tokens(2, query.TOKEN_COUNTRY) is None
assert len(qnode.get_tokens(2, query.TOKEN_PARTIAL)) == 2
assert len(qnode.get_tokens(2, query.TOKEN_WORD)) == 1
def test_query_struct_empty():

View File

@@ -102,12 +102,11 @@ async def test_splitting_in_transliteration(conn):
@pytest.mark.asyncio
@pytest.mark.parametrize('term,order', [('23456', ['P', 'H', 'W', 'w']),
('3', ['H', 'P', 'W', 'w'])
('3', ['H', 'W', 'w'])
])
async def test_penalty_postcodes_and_housenumbers(conn, term, order):
ana = await tok.create_query_analyzer(conn)
await add_word(conn, 1, term, 'P', None)
await add_word(conn, 2, term, 'H', term)
await add_word(conn, 3, term, 'w', term)
await add_word(conn, 4, term, 'W', term)
@@ -179,8 +178,10 @@ async def test_add_unknown_housenumbers(conn):
assert query.nodes[1].starting[0].ttype == qmod.TOKEN_HOUSENUMBER
assert len(query.nodes[1].starting[0].tokens) == 1
assert query.nodes[1].starting[0].tokens[0].token == 1
assert not query.nodes[2].starting
assert not query.nodes[3].starting
assert query.nodes[2].has_tokens(3, qmod.TOKEN_POSTCODE)
assert not query.nodes[2].has_tokens(3, qmod.TOKEN_HOUSENUMBER)
assert not query.nodes[2].has_tokens(4, qmod.TOKEN_HOUSENUMBER)
assert not query.nodes[3].has_tokens(4, qmod.TOKEN_HOUSENUMBER)
@pytest.mark.asyncio

View File

@@ -0,0 +1,154 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Test for parsing of postcodes in queries.
"""
import re
from itertools import zip_longest
import pytest
from nominatim_api.search.postcode_parser import PostcodeParser
from nominatim_api.search.query import QueryStruct, PHRASE_ANY, PHRASE_POSTCODE, PHRASE_STREET
@pytest.fixture
def pc_config(project_env):
country_file = project_env.project_dir / 'country_settings.yaml'
country_file.write_text(r"""
ab:
postcode:
pattern: "ddddd ll"
ba:
postcode:
pattern: "ddddd"
de:
postcode:
pattern: "ddddd"
gr:
postcode:
pattern: "(ddd) ?(dd)"
output: \1 \2
in:
postcode:
pattern: "(ddd) ?(ddd)"
output: \1\2
mc:
postcode:
pattern: "980dd"
mz:
postcode:
pattern: "(dddd)(?:-dd)?"
bn:
postcode:
pattern: "(ll) ?(dddd)"
output: \1\2
ky:
postcode:
pattern: "(d)-(dddd)"
output: KY\1-\2
""")
return project_env
def mk_query(inp):
query = QueryStruct([])
phrase_split = re.split(r"([ ,:'-])", inp)
for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue='>'):
query.add_node(breakchar, PHRASE_ANY, 0.1, word, word)
return query
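The `zip_longest(*[iter(...)]*2, ...)` line above is the grouper idiom: both zip arguments reference the *same* iterator, so consecutive items pair up as (term, following break), and the final term is padded with `'>'` (the end-of-query break). In isolation:

```python
import re
from itertools import zip_longest

parts = re.split(r"([ ,:'-])", '45325 berlin')      # ['45325', ' ', 'berlin']
pairs = list(zip_longest(*[iter(parts)] * 2, fillvalue='>'))
assert pairs == [('45325', ' '), ('berlin', '>')]
```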
@pytest.mark.parametrize('query,pos', [('45325 Berlin', 0),
('45325:Berlin', 0),
('45325,Berlin', 0),
('Berlin 45325', 1),
('Berlin,45325', 1),
('Berlin:45325', 1),
('Hansastr,45325 Berlin', 1),
('Hansastr 45325 Berlin', 1)])
def test_simple_postcode(pc_config, query, pos):
parser = PostcodeParser(pc_config)
result = parser.parse(mk_query(query))
assert result == {(pos, pos + 1, '45325'), (pos, pos + 1, '453 25')}
def test_contained_postcode(pc_config):
parser = PostcodeParser(pc_config)
assert parser.parse(mk_query('12345 dx')) == {(0, 1, '12345'), (0, 1, '123 45'),
(0, 2, '12345 DX')}
@pytest.mark.parametrize('query,frm,to', [('345987', 0, 1), ('345 987', 0, 2),
('Aina 345 987', 1, 3),
('Aina 23 345 987 ff', 2, 4)])
def test_postcode_with_space(pc_config, query, frm, to):
parser = PostcodeParser(pc_config)
result = parser.parse(mk_query(query))
assert result == {(frm, to, '345987')}
def test_overlapping_postcode(pc_config):
parser = PostcodeParser(pc_config)
assert parser.parse(mk_query('123 456 78')) == {(0, 2, '123456'), (1, 3, '456 78')}
@pytest.mark.parametrize('query', ['45325-Berlin', "45325'Berlin",
'Berlin-45325', "Berlin'45325", '45325Berlin'
'345-987', "345'987", '345,987', '345:987'])
def test_not_a_postcode(pc_config, query):
parser = PostcodeParser(pc_config)
assert not parser.parse(mk_query(query))
@pytest.mark.parametrize('query', ['ba 12233', 'ba-12233'])
def test_postcode_with_country_prefix(pc_config, query):
parser = PostcodeParser(pc_config)
assert (0, 2, '12233') in parser.parse(mk_query(query))
def test_postcode_with_joined_country_prefix(pc_config):
parser = PostcodeParser(pc_config)
assert parser.parse(mk_query('ba12233')) == {(0, 1, '12233')}
def test_postcode_with_non_matching_country_prefix(pc_config):
parser = PostcodeParser(pc_config)
assert not parser.parse(mk_query('ky12233'))
def test_postcode_inside_postcode_phrase(pc_config):
parser = PostcodeParser(pc_config)
query = QueryStruct([])
query.nodes[-1].ptype = PHRASE_STREET
query.add_node(',', PHRASE_STREET, 0.1, '12345', '12345')
query.add_node(',', PHRASE_POSTCODE, 0.1, 'xz', 'xz')
query.add_node('>', PHRASE_POSTCODE, 0.1, '4444', '4444')
assert parser.parse(query) == {(2, 3, '4444')}
def test_partial_postcode_in_postcode_phrase(pc_config):
parser = PostcodeParser(pc_config)
query = QueryStruct([])
query.nodes[-1].ptype = PHRASE_POSTCODE
query.add_node(' ', PHRASE_POSTCODE, 0.1, '2224', '2224')
query.add_node('>', PHRASE_POSTCODE, 0.1, '12345', '12345')
assert not parser.parse(query)

View File

@@ -46,3 +46,20 @@ def test_token_range_unimplemented_ops():
nq.TokenRange(1, 3) <= nq.TokenRange(10, 12)
with pytest.raises(TypeError):
nq.TokenRange(1, 3) >= nq.TokenRange(10, 12)
def test_query_extract_words():
q = nq.QueryStruct([])
q.add_node(nq.BREAK_WORD, nq.PHRASE_ANY, 0.1, '12', '')
q.add_node(nq.BREAK_TOKEN, nq.PHRASE_ANY, 0.0, 'ab', '')
q.add_node(nq.BREAK_PHRASE, nq.PHRASE_ANY, 0.0, '12', '')
q.add_node(nq.BREAK_END, nq.PHRASE_ANY, 0.5, 'hallo', '')
words = q.extract_words(base_penalty=1.0)
assert set(words.keys()) \
== {'12', 'ab', 'hallo', '12 ab', 'ab 12', '12 ab 12'}
assert sorted(words['12']) == [nq.TokenRange(0, 1, 1.0), nq.TokenRange(2, 3, 1.0)]
assert words['12 ab'] == [nq.TokenRange(0, 2, 1.1)]
assert words['hallo'] == [nq.TokenRange(3, 4, 1.0)]

View File

@@ -265,37 +265,13 @@ class TestPostcodes:
'address': {'postcode': postcode}}))
def test_update_postcodes_from_db_empty(self, table_factory, word_table):
table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
content=(('de', '12345'), ('se', '132 34'),
('bm', 'AB23'), ('fr', '12345')))
self.analyzer.update_postcodes_from_db()
assert word_table.count() == 5
assert word_table.get_postcodes() == {'12345', '132 34@132 34', 'AB 23@AB 23'}
def test_update_postcodes_from_db_ambigious(self, table_factory, word_table):
table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
content=(('in', '123456'), ('sg', '123456')))
self.analyzer.update_postcodes_from_db()
assert word_table.count() == 3
assert word_table.get_postcodes() == {'123456', '123456@123 456'}
def test_update_postcodes_from_db_add_and_remove(self, table_factory, word_table):
table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
content=(('ch', '1234'), ('bm', 'BC 45'), ('bm', 'XX45')))
def test_update_postcodes_deleted(self, word_table):
word_table.add_postcode(' 1234', '1234')
word_table.add_postcode(' 5678', '5678')
self.analyzer.update_postcodes_from_db()
assert word_table.count() == 5
assert word_table.get_postcodes() == {'1234', 'BC 45@BC 45', 'XX 45@XX 45'}
assert word_table.count() == 0
def test_process_place_postcode_simple(self, word_table):
@@ -303,16 +279,12 @@ class TestPostcodes:
assert info['postcode'] == '12345'
assert word_table.get_postcodes() == {'12345', }
def test_process_place_postcode_with_space(self, word_table):
info = self.process_postcode('in', '123 567')
assert info['postcode'] == '123567'
assert word_table.get_postcodes() == {'123567@123 567', }
def test_update_special_phrase_empty_table(analyzer, word_table):
@@ -477,9 +449,9 @@ class TestPlaceAddress:
@pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
def test_process_place_postcode(self, word_table, pcode):
self.process_address(postcode=pcode)
info = self.process_address(postcode=pcode)
assert word_table.get_postcodes() == {pcode, }
assert info['postcode'] == pcode
@pytest.mark.parametrize('hnr', ['123a', '1', '101'])