Merge pull request #3665 from lonvia/pattern-matching-postcodes

Add full parsing of postcodes in query
This commit is contained in:
Sarah Hoffmann
2025-03-05 16:02:03 +01:00
committed by GitHub
13 changed files with 476 additions and 295 deletions

View File

@@ -1809,7 +1809,8 @@ us:
     languages: en
     names: !include country-names/us.yaml
     postcode:
-      pattern: "ddddd"
+      pattern: "(ddddd)(?:-dddd)?"
+      output: \1

 # Uruguay (Uruguay)
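For reference, the simplified pattern syntax is expanded by the new PostcodeParser further down in this PR ('d' becomes a digit, 'l' a letter). A quick sketch of what the changed US rule does to a ZIP+4 code, using a plain `re.fullmatch` instead of the parser's node-based matching:

```python
import re

# Expand the simplified syntax the same way PostcodeParser does below:
# 'd' -> [0-9], 'l' -> [a-z].
pattern = "(ddddd)(?:-dddd)?".replace('d', '[0-9]').replace('l', '[a-z]')

m = re.fullmatch(pattern, "36067-1111")
assert m is not None
# The 'output: \1' rule keeps only the first group, shortening ZIP+4 codes.
print(m.expand(r"\1"))  # -> 36067
```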

View File

@@ -8,7 +8,6 @@
 Implementation of query analysis for the ICU tokenizer.
 """
 from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
-from collections import defaultdict
 import dataclasses
 import difflib
 import re
@@ -25,7 +24,9 @@ from ..connection import SearchConnection
 from ..logging import log
 from . import query as qmod
 from ..query_preprocessing.config import QueryConfig
+from ..query_preprocessing.base import QueryProcessingFunc
 from .query_analyzer_factory import AbstractQueryAnalyzer
+from .postcode_parser import PostcodeParser

 DB_TO_TOKEN_TYPE = {
@@ -47,42 +48,6 @@ PENALTY_IN_TOKEN_BREAK = {
 }


-@dataclasses.dataclass
-class QueryPart:
-    """ Normalized and transliterated form of a single term in the query.
-
-        When the term came out of a split during the transliteration,
-        the normalized string is the full word before transliteration.
-        Check the subsequent break type to figure out if the word is
-        continued.
-
-        Penalty is the break penalty for the break following the token.
-    """
-    token: str
-    normalized: str
-    penalty: float
-
-
-QueryParts = List[QueryPart]
-WordDict = Dict[str, List[qmod.TokenRange]]
-
-
-def extract_words(terms: List[QueryPart], start: int, words: WordDict) -> None:
-    """ Add all combinations of words in the terms list after the
-        given position to the word list.
-    """
-    total = len(terms)
-    base_penalty = PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD]
-    for first in range(start, total):
-        word = terms[first].token
-        penalty = base_penalty
-        words[word].append(qmod.TokenRange(first, first + 1, penalty=penalty))
-        for last in range(first + 1, min(first + 20, total)):
-            word = ' '.join((word, terms[last].token))
-            penalty += terms[last - 1].penalty
-            words[word].append(qmod.TokenRange(first, last + 1, penalty=penalty))


 @dataclasses.dataclass
 class ICUToken(qmod.Token):
     """ Specialised token for ICU tokenizer.
@@ -148,60 +113,51 @@ class ICUToken(qmod.Token):
                         addr_count=max(1, addr_count))


-class ICUQueryAnalyzer(AbstractQueryAnalyzer):
-    """ Converter for query strings into a tokenized query
-        using the tokens created by a ICU tokenizer.
-    """
-    def __init__(self, conn: SearchConnection) -> None:
-        self.conn = conn
-
-    async def setup(self) -> None:
-        """ Set up static data structures needed for the analysis.
-        """
-        async def _make_normalizer() -> Any:
-            rules = await self.conn.get_property('tokenizer_import_normalisation')
-            return Transliterator.createFromRules("normalization", rules)
-
-        self.normalizer = await self.conn.get_cached_value('ICUTOK', 'normalizer',
-                                                           _make_normalizer)
-
-        async def _make_transliterator() -> Any:
-            rules = await self.conn.get_property('tokenizer_import_transliteration')
-            return Transliterator.createFromRules("transliteration", rules)
-
-        self.transliterator = await self.conn.get_cached_value('ICUTOK', 'transliterator',
-                                                               _make_transliterator)
-
-        await self._setup_preprocessing()
-
-        if 'word' not in self.conn.t.meta.tables:
-            sa.Table('word', self.conn.t.meta,
-                     sa.Column('word_id', sa.Integer),
-                     sa.Column('word_token', sa.Text, nullable=False),
-                     sa.Column('type', sa.Text, nullable=False),
-                     sa.Column('word', sa.Text),
-                     sa.Column('info', Json))
-
-    async def _setup_preprocessing(self) -> None:
-        """ Load the rules for preprocessing and set up the handlers.
-        """
-        rules = self.conn.config.load_sub_configuration('icu_tokenizer.yaml',
-                                                        config='TOKENIZER_CONFIG')
-        preprocessing_rules = rules.get('query-preprocessing', [])
-
-        self.preprocessors = []
+@dataclasses.dataclass
+class ICUAnalyzerConfig:
+    postcode_parser: PostcodeParser
+    normalizer: Transliterator
+    transliterator: Transliterator
+    preprocessors: List[QueryProcessingFunc]
+
+    @staticmethod
+    async def create(conn: SearchConnection) -> 'ICUAnalyzerConfig':
+        rules = await conn.get_property('tokenizer_import_normalisation')
+        normalizer = Transliterator.createFromRules("normalization", rules)
+
+        rules = await conn.get_property('tokenizer_import_transliteration')
+        transliterator = Transliterator.createFromRules("transliteration", rules)
+
+        preprocessing_rules = conn.config.load_sub_configuration('icu_tokenizer.yaml',
+                                                                 config='TOKENIZER_CONFIG')\
+                                         .get('query-preprocessing', [])
+
+        preprocessors: List[QueryProcessingFunc] = []

         for func in preprocessing_rules:
             if 'step' not in func:
                 raise UsageError("Preprocessing rule is missing the 'step' attribute.")
             if not isinstance(func['step'], str):
                 raise UsageError("'step' attribute must be a simple string.")

-            module = self.conn.config.load_plugin_module(
+            module = conn.config.load_plugin_module(
                 func['step'], 'nominatim_api.query_preprocessing')
-            self.preprocessors.append(
-                module.create(QueryConfig(func).set_normalizer(self.normalizer)))
+            preprocessors.append(
+                module.create(QueryConfig(func).set_normalizer(normalizer)))
+
+        return ICUAnalyzerConfig(PostcodeParser(conn.config),
+                                 normalizer, transliterator, preprocessors)
+
+
+class ICUQueryAnalyzer(AbstractQueryAnalyzer):
+    """ Converter for query strings into a tokenized query
+        using the tokens created by a ICU tokenizer.
+    """
+    def __init__(self, conn: SearchConnection, config: ICUAnalyzerConfig) -> None:
+        self.conn = conn
+        self.postcode_parser = config.postcode_parser
+        self.normalizer = config.normalizer
+        self.transliterator = config.transliterator
+        self.preprocessors = config.preprocessors
     async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
         """ Analyze the given list of phrases and return the
@@ -216,8 +172,9 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         if not query.source:
             return query

-        parts, words = self.split_query(query)
-        log().var_dump('Transliterated query', lambda: _dump_transliterated(query, parts))
+        self.split_query(query)
+        log().var_dump('Transliterated query', lambda: query.get_transliterated_query())
+        words = query.extract_words(base_penalty=PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD])

         for row in await self.lookup_in_db(list(words.keys())):
             for trange in words[row.word_token]:
@@ -234,8 +191,13 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                 else:
                     query.add_token(trange, DB_TO_TOKEN_TYPE[row.type], token)

-        self.add_extra_tokens(query, parts)
-        self.rerank_tokens(query, parts)
+        self.add_extra_tokens(query)
+        for start, end, pc in self.postcode_parser.parse(query):
+            query.add_token(qmod.TokenRange(start, end),
+                            qmod.TOKEN_POSTCODE,
+                            ICUToken(penalty=0.1, token=0, count=1, addr_count=1,
+                                     lookup_word=pc, word_token=pc, info=None))
+        self.rerank_tokens(query)

         log().table_dump('Word tokens', _dump_word_tokens(query))
@@ -248,16 +210,9 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
""" """
return cast(str, self.normalizer.transliterate(text)).strip('-: ') return cast(str, self.normalizer.transliterate(text)).strip('-: ')
def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]: def split_query(self, query: qmod.QueryStruct) -> None:
""" Transliterate the phrases and split them into tokens. """ Transliterate the phrases and split them into tokens.
Returns the list of transliterated tokens together with their
normalized form and a dictionary of words for lookup together
with their position.
""" """
parts: QueryParts = []
phrase_start = 0
words: WordDict = defaultdict(list)
for phrase in query.source: for phrase in query.source:
query.nodes[-1].ptype = phrase.ptype query.nodes[-1].ptype = phrase.ptype
phrase_split = re.split('([ :-])', phrase.text) phrase_split = re.split('([ :-])', phrase.text)
@@ -272,38 +227,42 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
if trans: if trans:
for term in trans.split(' '): for term in trans.split(' '):
if term: if term:
parts.append(QueryPart(term, word, query.add_node(qmod.BREAK_TOKEN, phrase.ptype,
PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN])) PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN],
query.add_node(qmod.BREAK_TOKEN, phrase.ptype) term, word)
query.nodes[-1].btype = breakchar query.nodes[-1].adjust_break(breakchar,
parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[breakchar] PENALTY_IN_TOKEN_BREAK[breakchar])
extract_words(parts, phrase_start, words) query.nodes[-1].adjust_break(qmod.BREAK_END, PENALTY_IN_TOKEN_BREAK[qmod.BREAK_END])
phrase_start = len(parts)
query.nodes[-1].btype = qmod.BREAK_END
return parts, words
     async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
         """ Return the token information from the database for the
             given word tokens.
+
+            This function excludes postcode tokens.
         """
         t = self.conn.t.meta.tables['word']
-        return await self.conn.execute(t.select().where(t.c.word_token.in_(words)))
+        return await self.conn.execute(t.select()
+                                        .where(t.c.word_token.in_(words))
+                                        .where(t.c.type != 'P'))

-    def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
+    def add_extra_tokens(self, query: qmod.QueryStruct) -> None:
         """ Add tokens to query that are not saved in the database.
         """
-        for part, node, i in zip(parts, query.nodes, range(1000)):
-            if len(part.token) <= 4 and part.token.isdigit()\
-               and not node.has_tokens(i+1, qmod.TOKEN_HOUSENUMBER):
-                query.add_token(qmod.TokenRange(i, i+1), qmod.TOKEN_HOUSENUMBER,
-                                ICUToken(penalty=0.5, token=0,
-                                         count=1, addr_count=1, lookup_word=part.token,
-                                         word_token=part.token, info=None))
+        need_hnr = False
+        for i, node in enumerate(query.nodes):
+            is_full_token = node.btype not in (qmod.BREAK_TOKEN, qmod.BREAK_PART)
+            if need_hnr and is_full_token \
+                    and len(node.term_normalized) <= 4 and node.term_normalized.isdigit():
+                query.add_token(qmod.TokenRange(i-1, i), qmod.TOKEN_HOUSENUMBER,
+                                ICUToken(penalty=0.5, token=0,
+                                         count=1, addr_count=1,
+                                         lookup_word=node.term_lookup,
+                                         word_token=node.term_lookup, info=None))
+
+            need_hnr = is_full_token and not node.has_tokens(i+1, qmod.TOKEN_HOUSENUMBER)

-    def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
+    def rerank_tokens(self, query: qmod.QueryStruct) -> None:
         """ Add penalties to tokens that depend on presence of other token.
         """
         for i, node, tlist in query.iter_token_lists():
@@ -320,28 +279,22 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                     if repl.end == tlist.end and repl.ttype != qmod.TOKEN_HOUSENUMBER:
                         repl.add_penalty(0.5 - tlist.tokens[0].penalty)
             elif tlist.ttype not in (qmod.TOKEN_COUNTRY, qmod.TOKEN_PARTIAL):
-                norm = parts[i].normalized
-                for j in range(i + 1, tlist.end):
-                    if node.btype != qmod.BREAK_TOKEN:
-                        norm += ' ' + parts[j].normalized
+                norm = ' '.join(n.term_normalized for n in query.nodes[i + 1:tlist.end + 1]
+                                if n.btype != qmod.BREAK_TOKEN)
+                if not norm:
+                    # Can happen when the token only covers a partial term
+                    norm = query.nodes[i + 1].term_normalized
                 for token in tlist.tokens:
                     cast(ICUToken, token).rematch(norm)


-def _dump_transliterated(query: qmod.QueryStruct, parts: QueryParts) -> str:
-    out = query.nodes[0].btype
-    for node, part in zip(query.nodes[1:], parts):
-        out += part.token + node.btype
-    return out
-
-
 def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
-    yield ['type', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
-    for node in query.nodes:
+    yield ['type', 'from', 'to', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
+    for i, node in enumerate(query.nodes):
         for tlist in node.starting:
             for token in tlist.tokens:
                 t = cast(ICUToken, token)
-                yield [tlist.ttype, t.token, t.word_token or '',
+                yield [tlist.ttype, str(i), str(tlist.end), t.token, t.word_token or '',
                        t.lookup_word or '', t.penalty, t.count, t.info]
@@ -349,7 +302,17 @@ async def create_query_analyzer(conn: SearchConnection) -> AbstractQueryAnalyzer
""" Create and set up a new query analyzer for a database based """ Create and set up a new query analyzer for a database based
on the ICU tokenizer. on the ICU tokenizer.
""" """
out = ICUQueryAnalyzer(conn) async def _get_config() -> ICUAnalyzerConfig:
await out.setup() if 'word' not in conn.t.meta.tables:
sa.Table('word', conn.t.meta,
sa.Column('word_id', sa.Integer),
sa.Column('word_token', sa.Text, nullable=False),
sa.Column('type', sa.Text, nullable=False),
sa.Column('word', sa.Text),
sa.Column('info', Json))
return out return await ICUAnalyzerConfig.create(conn)
config = await conn.get_cached_value('ICUTOK', 'config', _get_config)
return ICUQueryAnalyzer(conn, config)

View File

@@ -0,0 +1,103 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Handling of arbitrary postcode tokens in tokenized query string.
"""
from typing import Tuple, Set, Dict, List
import re
from collections import defaultdict
import yaml
from ..config import Configuration
from . import query as qmod
class PostcodeParser:
""" Pattern-based parser for postcodes in tokenized queries.
The postcode patterns are read from the country configuration.
The parser currently does not return country restrictions.
"""
def __init__(self, config: Configuration) -> None:
# skip over includes here to avoid loading the complete country name data
yaml.add_constructor('!include', lambda loader, node: [],
Loader=yaml.SafeLoader)
cdata = yaml.safe_load(config.find_config_file('country_settings.yaml')
.read_text(encoding='utf-8'))
unique_patterns: Dict[str, Dict[str, List[str]]] = {}
for cc, data in cdata.items():
if data.get('postcode'):
pat = data['postcode']['pattern'].replace('d', '[0-9]').replace('l', '[a-z]')
out = data['postcode'].get('output')
if pat not in unique_patterns:
unique_patterns[pat] = defaultdict(list)
unique_patterns[pat][out].append(cc)
self.global_pattern = re.compile(
'(?:(?P<cc>[a-z][a-z])(?P<space>[ -]?))?(?P<pc>(?:(?:'
+ ')|(?:'.join(unique_patterns) + '))[:, >].*)')
self.local_patterns = [(re.compile(f"{pat}[:, >]"), list(info.items()))
for pat, info in unique_patterns.items()]
def parse(self, query: qmod.QueryStruct) -> Set[Tuple[int, int, str]]:
""" Parse postcodes in the given list of query tokens taking into
account the list of breaks from the nodes.
The result is a sequence of tuples with
[start node id, end node id, postcode token]
"""
nodes = query.nodes
outcodes: Set[Tuple[int, int, str]] = set()
for i in range(query.num_token_slots()):
if nodes[i].btype in '<,: ' and nodes[i + 1].btype != '`' \
and (i == 0 or nodes[i - 1].ptype != qmod.PHRASE_POSTCODE):
if nodes[i].ptype == qmod.PHRASE_ANY:
word = nodes[i + 1].term_normalized + nodes[i + 1].btype
if word[-1] in ' -' and nodes[i + 2].btype != '`' \
and nodes[i + 1].ptype == qmod.PHRASE_ANY:
word += nodes[i + 2].term_normalized + nodes[i + 2].btype
if word[-1] in ' -' and nodes[i + 3].btype != '`' \
and nodes[i + 2].ptype == qmod.PHRASE_ANY:
word += nodes[i + 3].term_normalized + nodes[i + 3].btype
self._match_word(word, i, False, outcodes)
elif nodes[i].ptype == qmod.PHRASE_POSTCODE:
word = nodes[i + 1].term_normalized + nodes[i + 1].btype
for j in range(i + 1, query.num_token_slots()):
if nodes[j].ptype != qmod.PHRASE_POSTCODE:
break
word += nodes[j + 1].term_normalized + nodes[j + 1].btype
self._match_word(word, i, True, outcodes)
return outcodes
def _match_word(self, word: str, pos: int, fullmatch: bool,
outcodes: Set[Tuple[int, int, str]]) -> None:
# Use global pattern to check for presence of any postcode.
m = self.global_pattern.fullmatch(word)
if m:
# If there was a match, check against each pattern separately
# because multiple patterns might be matching at the end.
cc = m.group('cc')
pc_word = m.group('pc')
cc_spaces = len(m.group('space') or '')
for pattern, info in self.local_patterns:
lm = pattern.fullmatch(pc_word) if fullmatch else pattern.match(pc_word)
if lm:
trange = (pos, pos + cc_spaces + sum(c in ' ,-:>' for c in lm.group(0)))
for out, out_ccs in info:
if cc is None or cc in out_ccs:
if out:
outcodes.add((*trange, lm.expand(out).upper()))
else:
outcodes.add((*trange, lm.group(0)[:-1].upper()))

View File

@@ -7,8 +7,9 @@
""" """
Datastructures for a tokenized query. Datastructures for a tokenized query.
""" """
from typing import List, Tuple, Optional, Iterator from typing import Dict, List, Tuple, Optional, Iterator
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from collections import defaultdict
import dataclasses import dataclasses
@@ -171,11 +172,33 @@ class TokenList:
 @dataclasses.dataclass
 class QueryNode:
     """ A node of the query representing a break between terms.
+
+        The node also contains information on the source term
+        ending at the node. The tokens are created from this information.
     """
     btype: BreakType
     ptype: PhraseType

+    penalty: float
+    """ Penalty for the break at this node.
+    """
+    term_lookup: str
+    """ Transliterated term following this node.
+    """
+    term_normalized: str
+    """ Normalised form of term following this node.
+        When the token resulted from a split during transliteration,
+        then this string contains the complete source term.
+    """
+
     starting: List[TokenList] = dataclasses.field(default_factory=list)

+    def adjust_break(self, btype: BreakType, penalty: float) -> None:
+        """ Change the break type and penalty for this node.
+        """
+        self.btype = btype
+        self.penalty = penalty
+
     def has_tokens(self, end: int, *ttypes: TokenType) -> bool:
         """ Check if there are tokens of the given types ending at the
             given node.
@@ -218,19 +241,22 @@ class QueryStruct:
     def __init__(self, source: List[Phrase]) -> None:
         self.source = source
         self.nodes: List[QueryNode] = \
-            [QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY)]
+            [QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY,
+                       0.0, '', '')]

     def num_token_slots(self) -> int:
         """ Return the length of the query in vertice steps.
         """
         return len(self.nodes) - 1

-    def add_node(self, btype: BreakType, ptype: PhraseType) -> None:
+    def add_node(self, btype: BreakType, ptype: PhraseType,
+                 break_penalty: float = 0.0,
+                 term_lookup: str = '', term_normalized: str = '') -> None:
         """ Append a new break node with the given break type.
             The phrase type denotes the type for any tokens starting
             at the node.
         """
-        self.nodes.append(QueryNode(btype, ptype))
+        self.nodes.append(QueryNode(btype, ptype, break_penalty, term_lookup, term_normalized))

     def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
         """ Add a token to the query. 'start' and 'end' are the indexes of the
@@ -287,3 +313,42 @@ class QueryStruct:
                 if t.token == token:
                     return f"[{tlist.ttype}]{t.lookup_word}"
         return 'None'
def get_transliterated_query(self) -> str:
""" Return a string representation of the transliterated query
with the character representation of the different break types.
For debugging purposes only.
"""
return ''.join(''.join((n.term_lookup, n.btype)) for n in self.nodes)
def extract_words(self, base_penalty: float = 0.0,
start: int = 0,
endpos: Optional[int] = None) -> Dict[str, List[TokenRange]]:
""" Add all combinations of words that can be formed from the terms
between the given start and endnode. The terms are joined with
spaces for each break. Words can never go across a BREAK_PHRASE.
The functions returns a dictionary of possible words with their
position within the query and a penalty. The penalty is computed
from the base_penalty plus the penalty for each node the word
crosses.
"""
if endpos is None:
endpos = len(self.nodes)
words: Dict[str, List[TokenRange]] = defaultdict(list)
for first in range(start, endpos - 1):
word = self.nodes[first + 1].term_lookup
penalty = base_penalty
words[word].append(TokenRange(first, first + 1, penalty=penalty))
if self.nodes[first + 1].btype != BREAK_PHRASE:
for last in range(first + 2, min(first + 20, endpos)):
word = ' '.join((word, self.nodes[last].term_lookup))
penalty += self.nodes[last - 1].penalty
words[word].append(TokenRange(first, last, penalty=penalty))
if self.nodes[last].btype == BREAK_PHRASE:
break
return words
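extract_words() replaces the former module-level helper of the ICU tokenizer and works directly on the node list. A small worked example of the penalty bookkeeping (hypothetical terms; the real unit test for this method appears further down in this PR):

```python
from nominatim_api.search import query as nq

# Hypothetical three-term query "foo bar / baz": the BREAK_PHRASE after 'bar'
# stops words from being joined across the phrase boundary.
q = nq.QueryStruct([])
q.add_node(nq.BREAK_WORD, nq.PHRASE_ANY, 0.1, 'foo', 'foo')
q.add_node(nq.BREAK_PHRASE, nq.PHRASE_ANY, 0.2, 'bar', 'bar')
q.add_node(nq.BREAK_END, nq.PHRASE_ANY, 0.0, 'baz', 'baz')

words = q.extract_words(base_penalty=0.5)

# Single terms always appear; 'foo bar' pays the 0.1 break penalty between
# its terms; 'bar baz' is never formed because 'bar' ends at a BREAK_PHRASE.
assert set(words) == {'foo', 'bar', 'baz', 'foo bar'}
assert words['foo bar'] == [nq.TokenRange(0, 2, penalty=0.6)]
```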

View File

@@ -381,76 +381,15 @@ class ICUNameAnalyzer(AbstractAnalyzer):
         return postcode.strip().upper()

     def update_postcodes_from_db(self) -> None:
-        """ Update postcode tokens in the word table from the location_postcode
-            table.
+        """ Postcode update.
+
+            Removes all postcodes from the word table because they are not
+            needed. Postcodes are recognised by pattern.
         """
         assert self.conn is not None

-        analyzer = self.token_analysis.analysis.get('@postcode')
-
         with self.conn.cursor() as cur:
+            cur.execute("DELETE FROM word WHERE type = 'P'")
-            # First get all postcode names currently in the word table.
cur.execute("SELECT DISTINCT word FROM word WHERE type = 'P'")
word_entries = set((entry[0] for entry in cur))
# Then compute the required postcode names from the postcode table.
needed_entries = set()
cur.execute("SELECT country_code, postcode FROM location_postcode")
for cc, postcode in cur:
info = PlaceInfo({'country_code': cc,
'class': 'place', 'type': 'postcode',
'address': {'postcode': postcode}})
address = self.sanitizer.process_names(info)[1]
for place in address:
if place.kind == 'postcode':
if analyzer is None:
postcode_name = place.name.strip().upper()
variant_base = None
else:
postcode_name = analyzer.get_canonical_id(place)
variant_base = place.get_attr("variant")
if variant_base:
needed_entries.add(f'{postcode_name}@{variant_base}')
else:
needed_entries.add(postcode_name)
break
# Now update the word table.
self._delete_unused_postcode_words(word_entries - needed_entries)
self._add_missing_postcode_words(needed_entries - word_entries)
def _delete_unused_postcode_words(self, tokens: Iterable[str]) -> None:
assert self.conn is not None
if tokens:
with self.conn.cursor() as cur:
cur.execute("DELETE FROM word WHERE type = 'P' and word = any(%s)",
(list(tokens), ))
def _add_missing_postcode_words(self, tokens: Iterable[str]) -> None:
assert self.conn is not None
if not tokens:
return
analyzer = self.token_analysis.analysis.get('@postcode')
terms = []
for postcode_name in tokens:
if '@' in postcode_name:
term, variant = postcode_name.split('@', 2)
term = self._search_normalized(term)
if analyzer is None:
variants = [term]
else:
variants = analyzer.compute_variants(variant)
if term not in variants:
variants.append(term)
else:
variants = [self._search_normalized(postcode_name)]
terms.append((postcode_name, variants))
if terms:
with self.conn.cursor() as cur:
cur.executemany("""SELECT create_postcode_word(%s, %s)""", terms)
     def update_special_phrases(self, phrases: Iterable[Tuple[str, str, str, str]],
                                should_replace: bool) -> None:
@@ -718,32 +657,9 @@ class ICUNameAnalyzer(AbstractAnalyzer):
         analyzer = self.token_analysis.analysis.get('@postcode')
         if analyzer is None:
-            postcode_name = item.name.strip().upper()
-            variant_base = None
+            return item.name.strip().upper()
         else:
+            return analyzer.get_canonical_id(item)
-            postcode_name = analyzer.get_canonical_id(item)
variant_base = item.get_attr("variant")
if variant_base:
postcode = f'{postcode_name}@{variant_base}'
else:
postcode = postcode_name
if postcode not in self._cache.postcodes:
term = self._search_normalized(postcode_name)
if not term:
return None
variants = {term}
if analyzer is not None and variant_base:
variants.update(analyzer.compute_variants(variant_base))
with self.conn.cursor() as cur:
cur.execute("SELECT create_postcode_word(%s, %s)",
(postcode, list(variants)))
self._cache.postcodes.add(postcode)
return postcode_name
class _TokenInfo:
@@ -836,5 +752,4 @@ class _TokenCache:
         self.names: Dict[str, Tuple[int, List[int]]] = {}
         self.partials: Dict[str, int] = {}
         self.fulls: Dict[str, List[int]] = {}
-        self.postcodes: Set[str] = set()
         self.housenumbers: Dict[str, Tuple[Optional[int], Optional[str]]] = {}

View File

@@ -3,9 +3,8 @@
Feature: Searches with postcodes
    Various searches involving postcodes

-    @Fail
    Scenario: US 5+4 ZIP codes are shortened to 5 ZIP codes if not found
-        When sending json search query "36067 1111, us" with address
+        When sending json search query "36067-1111, us" with address
        Then result addresses contain
            | postcode |
            | 36067 |

View File

@@ -170,7 +170,7 @@ Feature: Import of postcodes
            | object | postcode |
            | W93 | 11200 |

-    Scenario: Postcodes are added to the postcode and word table
+    Scenario: Postcodes are added to the postcode
Given the places Given the places
| osm | class | type | addr+postcode | addr+housenumber | geometry | | osm | class | type | addr+postcode | addr+housenumber | geometry |
| N34 | place | house | 01982 | 111 |country:de | | N34 | place | house | 01982 | 111 |country:de |
@@ -178,7 +178,6 @@ Feature: Import of postcodes
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| de | 01982 | country:de | | de | 01982 | country:de |
And there are word tokens for postcodes 01982
    @Fail
@@ -195,7 +194,7 @@ Feature: Import of postcodes
| E45 2 | gb | 23 | 5 | | E45 2 | gb | 23 | 5 |
| Y45 | gb | 21 | 5 | | Y45 | gb | 21 | 5 |
-    Scenario: Postcodes outside all countries are not added to the postcode and word table
+    Scenario: Postcodes outside all countries are not added to the postcode table
Given the places Given the places
| osm | class | type | addr+postcode | addr+housenumber | addr+place | geometry | | osm | class | type | addr+postcode | addr+housenumber | addr+place | geometry |
| N34 | place | house | 01982 | 111 | Null Island | 0 0.00001 | | N34 | place | house | 01982 | 111 | Null Island | 0 0.00001 |
@@ -205,7 +204,6 @@ Feature: Import of postcodes
When importing When importing
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
And there are no word tokens for postcodes 01982
When sending search query "111, 01982 Null Island" When sending search query "111, 01982 Null Island"
Then results contain Then results contain
| osm | display_name | | osm | display_name |

View File

@@ -2,7 +2,7 @@
Feature: Update of postcode
    Tests for updating of data related to postcodes

-    Scenario: A new postcode appears in the postcode and word table
+    Scenario: A new postcode appears in the postcode table
Given the places Given the places
| osm | class | type | addr+postcode | addr+housenumber | geometry | | osm | class | type | addr+postcode | addr+housenumber | geometry |
| N34 | place | house | 01982 | 111 |country:de | | N34 | place | house | 01982 | 111 |country:de |
@@ -18,9 +18,8 @@ Feature: Update of postcode
| country | postcode | geometry | | country | postcode | geometry |
| de | 01982 | country:de | | de | 01982 | country:de |
| ch | 4567 | country:ch | | ch | 4567 | country:ch |
And there are word tokens for postcodes 01982,4567
-    Scenario: When the last postcode is deleted, it is deleted from postcode and word
+    Scenario: When the last postcode is deleted, it is deleted from postcode
Given the places Given the places
| osm | class | type | addr+postcode | addr+housenumber | geometry | | osm | class | type | addr+postcode | addr+housenumber | geometry |
| N34 | place | house | 01982 | 111 |country:de | | N34 | place | house | 01982 | 111 |country:de |
@@ -31,10 +30,8 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| ch | 4567 | country:ch | | ch | 4567 | country:ch |
And there are word tokens for postcodes 4567
And there are no word tokens for postcodes 01982
-    Scenario: A postcode is not deleted from postcode and word when it exist in another country
+    Scenario: A postcode is not deleted from postcode when it exist in another country
Given the places Given the places
| osm | class | type | addr+postcode | addr+housenumber | geometry | | osm | class | type | addr+postcode | addr+housenumber | geometry |
| N34 | place | house | 01982 | 111 |country:de | | N34 | place | house | 01982 | 111 |country:de |
@@ -45,7 +42,6 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| fr | 01982 | country:fr | | fr | 01982 | country:fr |
And there are word tokens for postcodes 01982
Scenario: Updating a postcode is reflected in postcode table Scenario: Updating a postcode is reflected in postcode table
Given the places Given the places
@@ -59,7 +55,6 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| de | 20453 | country:de | | de | 20453 | country:de |
And there are word tokens for postcodes 20453
Scenario: When changing from a postcode type, the entry appears in placex Scenario: When changing from a postcode type, the entry appears in placex
When importing When importing
@@ -80,7 +75,6 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| de | 20453 | country:de | | de | 20453 | country:de |
And there are word tokens for postcodes 20453
Scenario: When changing to a postcode type, the entry disappears from placex Scenario: When changing to a postcode type, the entry disappears from placex
When importing When importing
@@ -101,7 +95,6 @@ Feature: Update of postcode
Then location_postcode contains exactly Then location_postcode contains exactly
| country | postcode | geometry | | country | postcode | geometry |
| de | 01982 | country:de | | de | 01982 | country:de |
And there are word tokens for postcodes 01982
Scenario: When a parent is deleted, the postcode gets a new parent Scenario: When a parent is deleted, the postcode gets a new parent
Given the grid with origin DE Given the grid with origin DE

View File

@@ -21,6 +21,9 @@ def mktoken(tid: int):
     return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,
                    lookup_word='foo')


+@pytest.fixture
+def qnode():
+    return query.QueryNode(query.BREAK_PHRASE, query.PHRASE_ANY, 0.0, '', '')


 @pytest.mark.parametrize('ptype,ttype', [(query.PHRASE_ANY, 'W'),
                                          (query.PHRASE_AMENITY, 'Q'),
@@ -37,27 +40,24 @@ def test_phrase_incompatible(ptype):
     assert not query._phrase_compatible_with(ptype, query.TOKEN_PARTIAL, True)


-def test_query_node_empty():
-    qn = query.QueryNode(query.BREAK_PHRASE, query.PHRASE_ANY)
-
-    assert not qn.has_tokens(3, query.TOKEN_PARTIAL)
-    assert qn.get_tokens(3, query.TOKEN_WORD) is None
+def test_query_node_empty(qnode):
+    assert not qnode.has_tokens(3, query.TOKEN_PARTIAL)
+    assert qnode.get_tokens(3, query.TOKEN_WORD) is None


-def test_query_node_with_content():
-    qn = query.QueryNode(query.BREAK_PHRASE, query.PHRASE_ANY)
-    qn.starting.append(query.TokenList(2, query.TOKEN_PARTIAL, [mktoken(100), mktoken(101)]))
-    qn.starting.append(query.TokenList(2, query.TOKEN_WORD, [mktoken(1000)]))
+def test_query_node_with_content(qnode):
+    qnode.starting.append(query.TokenList(2, query.TOKEN_PARTIAL, [mktoken(100), mktoken(101)]))
+    qnode.starting.append(query.TokenList(2, query.TOKEN_WORD, [mktoken(1000)]))

-    assert not qn.has_tokens(3, query.TOKEN_PARTIAL)
-    assert not qn.has_tokens(2, query.TOKEN_COUNTRY)
-    assert qn.has_tokens(2, query.TOKEN_PARTIAL)
-    assert qn.has_tokens(2, query.TOKEN_WORD)
+    assert not qnode.has_tokens(3, query.TOKEN_PARTIAL)
+    assert not qnode.has_tokens(2, query.TOKEN_COUNTRY)
+    assert qnode.has_tokens(2, query.TOKEN_PARTIAL)
+    assert qnode.has_tokens(2, query.TOKEN_WORD)

-    assert qn.get_tokens(3, query.TOKEN_PARTIAL) is None
-    assert qn.get_tokens(2, query.TOKEN_COUNTRY) is None
-    assert len(qn.get_tokens(2, query.TOKEN_PARTIAL)) == 2
-    assert len(qn.get_tokens(2, query.TOKEN_WORD)) == 1
+    assert qnode.get_tokens(3, query.TOKEN_PARTIAL) is None
+    assert qnode.get_tokens(2, query.TOKEN_COUNTRY) is None
+    assert len(qnode.get_tokens(2, query.TOKEN_PARTIAL)) == 2
+    assert len(qnode.get_tokens(2, query.TOKEN_WORD)) == 1


 def test_query_struct_empty():

View File

@@ -102,12 +102,11 @@ async def test_splitting_in_transliteration(conn):
 @pytest.mark.asyncio
 @pytest.mark.parametrize('term,order', [('23456', ['P', 'H', 'W', 'w']),
-                                        ('3', ['H', 'P', 'W', 'w'])
+                                        ('3', ['H', 'W', 'w'])
                                         ])
 async def test_penalty_postcodes_and_housenumbers(conn, term, order):
     ana = await tok.create_query_analyzer(conn)

-    await add_word(conn, 1, term, 'P', None)
     await add_word(conn, 2, term, 'H', term)
     await add_word(conn, 3, term, 'w', term)
     await add_word(conn, 4, term, 'W', term)
@@ -179,8 +178,10 @@ async def test_add_unknown_housenumbers(conn):
     assert query.nodes[1].starting[0].ttype == qmod.TOKEN_HOUSENUMBER
     assert len(query.nodes[1].starting[0].tokens) == 1
     assert query.nodes[1].starting[0].tokens[0].token == 1
-    assert not query.nodes[2].starting
-    assert not query.nodes[3].starting
+    assert query.nodes[2].has_tokens(3, qmod.TOKEN_POSTCODE)
+    assert not query.nodes[2].has_tokens(3, qmod.TOKEN_HOUSENUMBER)
+    assert not query.nodes[2].has_tokens(4, qmod.TOKEN_HOUSENUMBER)
+    assert not query.nodes[3].has_tokens(4, qmod.TOKEN_HOUSENUMBER)


 @pytest.mark.asyncio

View File

@@ -0,0 +1,154 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Test for parsing of postcodes in queries.
"""
import re
from itertools import zip_longest
import pytest
from nominatim_api.search.postcode_parser import PostcodeParser
from nominatim_api.search.query import QueryStruct, PHRASE_ANY, PHRASE_POSTCODE, PHRASE_STREET
@pytest.fixture
def pc_config(project_env):
country_file = project_env.project_dir / 'country_settings.yaml'
country_file.write_text(r"""
ab:
postcode:
pattern: "ddddd ll"
ba:
postcode:
pattern: "ddddd"
de:
postcode:
pattern: "ddddd"
gr:
postcode:
pattern: "(ddd) ?(dd)"
output: \1 \2
in:
postcode:
pattern: "(ddd) ?(ddd)"
output: \1\2
mc:
postcode:
pattern: "980dd"
mz:
postcode:
pattern: "(dddd)(?:-dd)?"
bn:
postcode:
pattern: "(ll) ?(dddd)"
output: \1\2
ky:
postcode:
pattern: "(d)-(dddd)"
output: KY\1-\2
""")
return project_env
def mk_query(inp):
query = QueryStruct([])
phrase_split = re.split(r"([ ,:'-])", inp)
for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue='>'):
query.add_node(breakchar, PHRASE_ANY, 0.1, word, word)
return query
@pytest.mark.parametrize('query,pos', [('45325 Berlin', 0),
('45325:Berlin', 0),
('45325,Berlin', 0),
('Berlin 45325', 1),
('Berlin,45325', 1),
('Berlin:45325', 1),
('Hansastr,45325 Berlin', 1),
('Hansastr 45325 Berlin', 1)])
def test_simple_postcode(pc_config, query, pos):
parser = PostcodeParser(pc_config)
result = parser.parse(mk_query(query))
assert result == {(pos, pos + 1, '45325'), (pos, pos + 1, '453 25')}
def test_contained_postcode(pc_config):
parser = PostcodeParser(pc_config)
assert parser.parse(mk_query('12345 dx')) == {(0, 1, '12345'), (0, 1, '123 45'),
(0, 2, '12345 DX')}
@pytest.mark.parametrize('query,frm,to', [('345987', 0, 1), ('345 987', 0, 2),
('Aina 345 987', 1, 3),
('Aina 23 345 987 ff', 2, 4)])
def test_postcode_with_space(pc_config, query, frm, to):
parser = PostcodeParser(pc_config)
result = parser.parse(mk_query(query))
assert result == {(frm, to, '345987')}
def test_overlapping_postcode(pc_config):
parser = PostcodeParser(pc_config)
assert parser.parse(mk_query('123 456 78')) == {(0, 2, '123456'), (1, 3, '456 78')}
@pytest.mark.parametrize('query', ['45325-Berlin', "45325'Berlin",
'Berlin-45325', "Berlin'45325", '45325Berlin'
'345-987', "345'987", '345,987', '345:987'])
def test_not_a_postcode(pc_config, query):
parser = PostcodeParser(pc_config)
assert not parser.parse(mk_query(query))
@pytest.mark.parametrize('query', ['ba 12233', 'ba-12233'])
def test_postcode_with_country_prefix(pc_config, query):
parser = PostcodeParser(pc_config)
assert (0, 2, '12233') in parser.parse(mk_query(query))
def test_postcode_with_joined_country_prefix(pc_config):
parser = PostcodeParser(pc_config)
assert parser.parse(mk_query('ba12233')) == {(0, 1, '12233')}
def test_postcode_with_non_matching_country_prefix(pc_config):
parser = PostcodeParser(pc_config)
assert not parser.parse(mk_query('ky12233'))
def test_postcode_inside_postcode_phrase(pc_config):
parser = PostcodeParser(pc_config)
query = QueryStruct([])
query.nodes[-1].ptype = PHRASE_STREET
query.add_node(',', PHRASE_STREET, 0.1, '12345', '12345')
query.add_node(',', PHRASE_POSTCODE, 0.1, 'xz', 'xz')
query.add_node('>', PHRASE_POSTCODE, 0.1, '4444', '4444')
assert parser.parse(query) == {(2, 3, '4444')}
def test_partial_postcode_in_postcode_phrase(pc_config):
parser = PostcodeParser(pc_config)
query = QueryStruct([])
query.nodes[-1].ptype = PHRASE_POSTCODE
query.add_node(' ', PHRASE_POSTCODE, 0.1, '2224', '2224')
query.add_node('>', PHRASE_POSTCODE, 0.1, '12345', '12345')
assert not parser.parse(query)

View File

@@ -46,3 +46,20 @@ def test_token_range_unimplemented_ops():
         nq.TokenRange(1, 3) <= nq.TokenRange(10, 12)
     with pytest.raises(TypeError):
         nq.TokenRange(1, 3) >= nq.TokenRange(10, 12)
def test_query_extract_words():
q = nq.QueryStruct([])
q.add_node(nq.BREAK_WORD, nq.PHRASE_ANY, 0.1, '12', '')
q.add_node(nq.BREAK_TOKEN, nq.PHRASE_ANY, 0.0, 'ab', '')
q.add_node(nq.BREAK_PHRASE, nq.PHRASE_ANY, 0.0, '12', '')
q.add_node(nq.BREAK_END, nq.PHRASE_ANY, 0.5, 'hallo', '')
words = q.extract_words(base_penalty=1.0)
assert set(words.keys()) \
== {'12', 'ab', 'hallo', '12 ab', 'ab 12', '12 ab 12'}
assert sorted(words['12']) == [nq.TokenRange(0, 1, 1.0), nq.TokenRange(2, 3, 1.0)]
assert words['12 ab'] == [nq.TokenRange(0, 2, 1.1)]
assert words['hallo'] == [nq.TokenRange(3, 4, 1.0)]

View File

@@ -265,37 +265,13 @@ class TestPostcodes:
                                              'address': {'postcode': postcode}}))

-    def test_update_postcodes_from_db_empty(self, table_factory, word_table):
-        table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
-                      content=(('de', '12345'), ('se', '132 34'),
-                               ('bm', 'AB23'), ('fr', '12345')))
-
-        self.analyzer.update_postcodes_from_db()
-
-        assert word_table.count() == 5
-        assert word_table.get_postcodes() == {'12345', '132 34@132 34', 'AB 23@AB 23'}
-
-    def test_update_postcodes_from_db_ambigious(self, table_factory, word_table):
-        table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
-                      content=(('in', '123456'), ('sg', '123456')))
-
-        self.analyzer.update_postcodes_from_db()
-
-        assert word_table.count() == 3
-        assert word_table.get_postcodes() == {'123456', '123456@123 456'}
-
-    def test_update_postcodes_from_db_add_and_remove(self, table_factory, word_table):
-        table_factory('location_postcode', 'country_code TEXT, postcode TEXT',
-                      content=(('ch', '1234'), ('bm', 'BC 45'), ('bm', 'XX45')))
-
+    def test_update_postcodes_deleted(self, word_table):
         word_table.add_postcode(' 1234', '1234')
         word_table.add_postcode(' 5678', '5678')

         self.analyzer.update_postcodes_from_db()

-        assert word_table.count() == 5
-        assert word_table.get_postcodes() == {'1234', 'BC 45@BC 45', 'XX 45@XX 45'}
+        assert word_table.count() == 0

     def test_process_place_postcode_simple(self, word_table):
@@ -303,16 +279,12 @@ class TestPostcodes:
         assert info['postcode'] == '12345'

-        assert word_table.get_postcodes() == {'12345', }
-
     def test_process_place_postcode_with_space(self, word_table):
         info = self.process_postcode('in', '123 567')

         assert info['postcode'] == '123567'

-        assert word_table.get_postcodes() == {'123567@123 567', }


 def test_update_special_phrase_empty_table(analyzer, word_table):
@@ -477,9 +449,9 @@ class TestPlaceAddress:
     @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
     def test_process_place_postcode(self, word_table, pcode):
-        self.process_address(postcode=pcode)
+        info = self.process_address(postcode=pcode)

-        assert word_table.get_postcodes() == {pcode, }
+        assert info['postcode'] == pcode

     @pytest.mark.parametrize('hnr', ['123a', '1', '101'])