From 4a9253a0a98808fecabd269002ea98dc0c43eb24 Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Wed, 9 Jul 2025 15:36:11 +0200
Subject: [PATCH 1/8] simplify QueryNode penalty and initial assignment

---
 src/nominatim_api/search/icu_tokenizer.py    | 40 ++++++++++++--------
 src/nominatim_api/search/query.py            | 30 +++++----------
 src/nominatim_api/search/token_assignment.py | 10 -----
 3 files changed, 33 insertions(+), 47 deletions(-)

diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py
index 35171344..15a5e2ab 100644
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@ -37,17 +37,16 @@ DB_TO_TOKEN_TYPE = {
     'C': qmod.TOKEN_COUNTRY
 }
 
-PENALTY_IN_TOKEN_BREAK = {
-    qmod.BREAK_START: 0.5,
-    qmod.BREAK_END: 0.5,
-    qmod.BREAK_PHRASE: 0.5,
-    qmod.BREAK_SOFT_PHRASE: 0.5,
-    qmod.BREAK_WORD: 0.1,
-    qmod.BREAK_PART: 0.0,
-    qmod.BREAK_TOKEN: 0.0
+PENALTY_BREAK = {
+    qmod.BREAK_START: -0.5,
+    qmod.BREAK_END: -0.5,
+    qmod.BREAK_PHRASE: -0.5,
+    qmod.BREAK_SOFT_PHRASE: -0.5,
+    qmod.BREAK_WORD: 0.0,
+    qmod.BREAK_PART: 0.2,
+    qmod.BREAK_TOKEN: 0.4
 }
 
-
 @dataclasses.dataclass
 class ICUToken(qmod.Token):
     """ Specialised token for ICU tokenizer.
@@ -78,13 +77,13 @@ class ICUToken(qmod.Token):
             self.penalty += (distance/len(self.lookup_word))
 
     @staticmethod
-    def from_db_row(row: SaRow, base_penalty: float = 0.0) -> 'ICUToken':
+    def from_db_row(row: SaRow) -> 'ICUToken':
        """ Create a ICUToken from the row of the word table.
        """
        count = 1 if row.info is None else row.info.get('count', 1)
        addr_count = 1 if row.info is None else row.info.get('addr_count', 1)
 
-        penalty = base_penalty
+        penalty = 0.0
        if row.type == 'w':
            penalty += 0.3
        elif row.type == 'W':
@@ -174,11 +173,14 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         self.split_query(query)
         log().var_dump('Transliterated query', lambda: query.get_transliterated_query())
 
-        words = query.extract_words(base_penalty=PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD])
+        words = query.extract_words()
 
         for row in await self.lookup_in_db(list(words.keys())):
             for trange in words[row.word_token]:
-                token = ICUToken.from_db_row(row, trange.penalty or 0.0)
+                # Create a new token for each position because the token
+                # penalty can vary depending on the position in the query.
+                # (See rerank_tokens() below.)
+                token = ICUToken.from_db_row(row)
                 if row.type == 'S':
                     if row.info['op'] in ('in', 'near'):
                         if trange.start == 0:
@@ -200,6 +202,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                                              lookup_word=pc,
                                              word_token=term, info=None))
 
         self.rerank_tokens(query)
+        self.compute_break_penalties(query)
 
         log().table_dump('Word tokens', _dump_word_tokens(query))
@@ -232,10 +235,9 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                     query.add_node(qmod.BREAK_TOKEN, phrase.ptype,
                                    PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN],
                                    term, word)
-            query.nodes[-1].adjust_break(breakchar,
-                                         PENALTY_IN_TOKEN_BREAK[breakchar])
+            query.nodes[-1].btype = breakchar
 
-        query.nodes[-1].adjust_break(qmod.BREAK_END, PENALTY_IN_TOKEN_BREAK[qmod.BREAK_END])
+        query.nodes[-1].btype = qmod.BREAK_END
 
     async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
         """ Return the token information from the database for the
@@ -300,6 +302,12 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
             for token in tokens:
                 cast(ICUToken, token).rematch(norm)
 
+    def compute_break_penalties(self, query: qmod.QueryStruct) -> None:
+        """ Set the break penalties for the nodes in the query.
+        """
+        for node in query.nodes:
+            node.penalty = PENALTY_BREAK[node.btype]
+
 
 def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
     yield ['type', 'from', 'to', 'token', 'word_token', 'lookup_word',
            'penalty', 'count', 'info']
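
Taken together, the first patch trades the old per-lookup penalty plumbing (PENALTY_IN_TOKEN_BREAK threaded through add_node() and extract_words(), PENALTY_TOKENCHANGE applied later during assignment) for a single signed PENALTY_BREAK table written onto the nodes in one pass. A minimal sketch of the resulting convention, using simplified stand-in types rather than the real qmod.QueryNode and its single-character break constants:

    import dataclasses

    # Illustrative stand-in for qmod.QueryNode (not the patch's own code).
    @dataclasses.dataclass
    class Node:
        btype: str
        penalty: float = 0.0

    PENALTY_BREAK = {
        'start': -0.5, 'end': -0.5, 'phrase': -0.5, 'soft_phrase': -0.5,
        'word': 0.0, 'part': 0.2, 'token': 0.4,
    }

    def compute_break_penalties(nodes):
        # One pass after tokenization replaces the penalty threading
        # that previously went through add_node() and extract_words().
        for node in nodes:
            node.penalty = PENALTY_BREAK[node.btype]

    nodes = [Node('word'), Node('token'), Node('phrase')]
    compute_break_penalties(nodes)
    # Negative values mark positions where ending a word is the expected
    # case; positive values mark breaks inside a single word.
    assert [n.penalty for n in nodes] == [0.0, 0.4, -0.5]
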
diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py
index 092bd586..e89ed2dc 100644
--- a/src/nominatim_api/search/query.py
+++ b/src/nominatim_api/search/query.py
@@ -191,7 +191,9 @@ class QueryNode:
     ptype: PhraseType
 
     penalty: float
-    """ Penalty for the break at this node.
+    """ Penalty for having a word break at this position. The penalty
+        may be negative when a word break is more likely than continuing
+        the word after the node.
     """
     term_lookup: str
     """ Transliterated term ending at this node.
@@ -221,12 +223,6 @@ class QueryNode:
         return self.partial.count / (self.partial.count + self.partial.addr_count)
 
-    def adjust_break(self, btype: BreakType, penalty: float) -> None:
-        """ Change the break type and penalty for this node.
-        """
-        self.btype = btype
-        self.penalty = penalty
-
     def has_tokens(self, end: int, *ttypes: TokenType) -> bool:
         """ Check if there are tokens of the given types ending at the
             given node.
         """
@@ -277,8 +273,7 @@ class QueryStruct:
         self.source = source
         self.dir_penalty = 0.0
         self.nodes: List[QueryNode] = \
-            [QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY,
-                       0.0, '', '')]
+            [QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY)]
 
     def num_token_slots(self) -> int:
         """ Return the length of the query in vertice steps.
@@ -286,13 +281,12 @@ class QueryStruct:
         return len(self.nodes) - 1
 
     def add_node(self, btype: BreakType, ptype: PhraseType,
-                 break_penalty: float = 0.0,
                  term_lookup: str = '', term_normalized: str = '') -> None:
         """ Append a new break node with the given break type.
             The phrase type denotes the type for any tokens starting
             at the node.
         """
-        self.nodes.append(QueryNode(btype, ptype, break_penalty, term_lookup, term_normalized))
+        self.nodes.append(QueryNode(btype, ptype, 0.0, term_lookup, term_normalized))
 
     def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
         """ Add a token to the query. 'start' and 'end' are the indexes of the
@@ -386,17 +380,14 @@ class QueryStruct:
         """
         return ''.join(''.join((n.term_lookup, n.btype)) for n in self.nodes)
 
-    def extract_words(self, base_penalty: float = 0.0,
-                      start: int = 0,
+    def extract_words(self, start: int = 0,
                       endpos: Optional[int] = None) -> Dict[str, List[TokenRange]]:
         """ Add all combinations of words that can be formed from the terms
             between the given start and endnode. The terms are joined with
             spaces for each break. Words can never go across a BREAK_PHRASE.
 
             The functions returns a dictionary of possible words with their
-            position within the query and a penalty. The penalty is computed
-            from the base_penalty plus the penalty for each node the word
-            crosses.
+            position within the query.
         """
         if endpos is None:
             endpos = len(self.nodes)
@@ -405,16 +396,13 @@ class QueryStruct:
 
         for first, first_node in enumerate(self.nodes[start + 1:endpos], start):
             word = first_node.term_lookup
-            penalty = base_penalty
-            words[word].append(TokenRange(first, first + 1, penalty=penalty))
+            words[word].append(TokenRange(first, first + 1))
             if first_node.btype != BREAK_PHRASE:
-                penalty += first_node.penalty
                 max_last = min(first + 20, endpos)
                 for last, last_node in enumerate(self.nodes[first + 2:max_last], first + 2):
                     word = ' '.join((word, last_node.term_lookup))
-                    words[word].append(TokenRange(first, last, penalty=penalty))
+                    words[word].append(TokenRange(first, last))
                     if last_node.btype == BREAK_PHRASE:
                         break
-                    penalty += last_node.penalty
 
         return words

diff --git a/src/nominatim_api/search/token_assignment.py b/src/nominatim_api/search/token_assignment.py
index 4247158c..85c411b9 100644
--- a/src/nominatim_api/search/token_assignment.py
+++ b/src/nominatim_api/search/token_assignment.py
@@ -23,16 +23,6 @@ class TypedRange:
     trange: qmod.TokenRange
 
 
-PENALTY_TOKENCHANGE = {
-    qmod.BREAK_START: 0.0,
-    qmod.BREAK_END: 0.0,
-    qmod.BREAK_PHRASE: 0.0,
-    qmod.BREAK_SOFT_PHRASE: 0.0,
-    qmod.BREAK_WORD: 0.1,
-    qmod.BREAK_PART: 0.2,
-    qmod.BREAK_TOKEN: 0.4
-}
-
 TypedRangeSeq = List[TypedRange]
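
The word combinations that extract_words() now produces, stripped of penalties, are easiest to see on a toy query. A self-contained sketch of the combination rule (joining terms with spaces, never crossing a phrase break, hypothetical helper rather than the real method):

    from collections import defaultdict

    # terms with the break type that FOLLOWS each term; 'phrase' stops
    # word building, anything else lets the word grow (up to a cap).
    terms = [('new', 'word'), ('york', 'phrase'), ('usa', 'end')]

    words = defaultdict(list)
    for first in range(len(terms)):
        word = terms[first][0]
        words[word].append((first, first + 1))
        if terms[first][1] == 'phrase':
            continue
        for last in range(first + 1, len(terms)):
            word = ' '.join((word, terms[last][0]))
            words[word].append((first, last + 1))
            if terms[last][1] == 'phrase':
                break

    # 'new york' is formed, but nothing spans the phrase break after 'york'.
    assert set(words) == {'new', 'york', 'usa', 'new york'}
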
""" if endpos is None: endpos = len(self.nodes) @@ -405,16 +396,13 @@ class QueryStruct: for first, first_node in enumerate(self.nodes[start + 1:endpos], start): word = first_node.term_lookup - penalty = base_penalty - words[word].append(TokenRange(first, first + 1, penalty=penalty)) + words[word].append(TokenRange(first, first + 1)) if first_node.btype != BREAK_PHRASE: - penalty += first_node.penalty max_last = min(first + 20, endpos) for last, last_node in enumerate(self.nodes[first + 2:max_last], first + 2): word = ' '.join((word, last_node.term_lookup)) - words[word].append(TokenRange(first, last, penalty=penalty)) + words[word].append(TokenRange(first, last)) if last_node.btype == BREAK_PHRASE: break - penalty += last_node.penalty return words diff --git a/src/nominatim_api/search/token_assignment.py b/src/nominatim_api/search/token_assignment.py index 4247158c..85c411b9 100644 --- a/src/nominatim_api/search/token_assignment.py +++ b/src/nominatim_api/search/token_assignment.py @@ -23,16 +23,6 @@ class TypedRange: trange: qmod.TokenRange -PENALTY_TOKENCHANGE = { - qmod.BREAK_START: 0.0, - qmod.BREAK_END: 0.0, - qmod.BREAK_PHRASE: 0.0, - qmod.BREAK_SOFT_PHRASE: 0.0, - qmod.BREAK_WORD: 0.1, - qmod.BREAK_PART: 0.2, - qmod.BREAK_TOKEN: 0.4 -} - TypedRangeSeq = List[TypedRange] From 4634ad0720ce97973b48adbe21b55ce1e6b2c8a7 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 9 Jul 2025 20:35:15 +0200 Subject: [PATCH 2/8] rebalance word transition penalties --- src/nominatim_api/search/db_search_builder.py | 43 ++++++++----------- src/nominatim_api/search/icu_tokenizer.py | 5 +-- src/nominatim_api/search/query.py | 23 +++++++++- src/nominatim_api/search/token_assignment.py | 17 +++++--- 4 files changed, 52 insertions(+), 36 deletions(-) diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py index 9cb263fd..34f6b6c2 100644 --- a/src/nominatim_api/search/db_search_builder.py +++ b/src/nominatim_api/search/db_search_builder.py @@ -282,10 +282,14 @@ class SearchBuilder: """ Create a ranking expression for a name term in the given range. 
""" name_fulls = self.query.get_tokens(trange, qmod.TOKEN_WORD) - ranks = [dbf.RankedTokens(t.penalty, [t.token]) for t in name_fulls] + full_word_penalty = self.query.get_in_word_penalty(trange) + ranks = [dbf.RankedTokens(t.penalty + full_word_penalty, [t.token]) + for t in name_fulls] ranks.sort(key=lambda r: r.penalty) # Fallback, sum of penalty for partials - default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2 + default = sum(t.penalty for t in self.query.iter_partials(trange)) + default += sum(n.word_break_penalty + for n in self.query.nodes[trange.start + 1:trange.end]) return dbf.FieldRanking(db_field, default, ranks) def get_addr_ranking(self, trange: qmod.TokenRange) -> dbf.FieldRanking: @@ -303,7 +307,7 @@ class SearchBuilder: if partial is not None: if pos + 1 < trange.end: penalty = rank.penalty + partial.penalty \ - + PENALTY_WORDCHANGE[self.query.nodes[pos + 1].btype] + + self.query.nodes[pos + 1].word_break_penalty heapq.heappush(todo, (neglen - 1, pos + 1, dbf.RankedTokens(penalty, rank.tokens))) else: @@ -313,7 +317,9 @@ class SearchBuilder: for tlist in self.query.nodes[pos].starting: if tlist.ttype == qmod.TOKEN_WORD: if tlist.end < trange.end: - chgpenalty = PENALTY_WORDCHANGE[self.query.nodes[tlist.end].btype] + chgpenalty = self.query.nodes[tlist.end].word_break_penalty \ + + self.query.get_in_word_penalty( + qmod.TokenRange(pos, tlist.end)) for t in tlist.tokens: heapq.heappush(todo, (neglen - 1, tlist.end, rank.with_token(t, chgpenalty))) @@ -323,7 +329,9 @@ class SearchBuilder: if len(ranks) >= 10: # Too many variants, bail out and only add # Worst-case Fallback: sum of penalty of partials - default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2 + default = sum(t.penalty for t in self.query.iter_partials(trange)) + default += sum(n.word_break_penalty + for n in self.query.nodes[trange.start + 1:trange.end]) ranks.append(dbf.RankedTokens(rank.penalty + default, [])) # Bail out of outer loop break @@ -346,6 +354,7 @@ class SearchBuilder: if not tokens: return None sdata.set_strings('countries', tokens) + sdata.penalty += self.query.get_in_word_penalty(assignment.country) elif self.details.countries: sdata.countries = dbf.WeightedStrings(self.details.countries, [0.0] * len(self.details.countries)) @@ -353,29 +362,24 @@ class SearchBuilder: sdata.set_strings('housenumbers', self.query.get_tokens(assignment.housenumber, qmod.TOKEN_HOUSENUMBER)) + sdata.penalty += self.query.get_in_word_penalty(assignment.housenumber) if assignment.postcode: sdata.set_strings('postcodes', self.query.get_tokens(assignment.postcode, qmod.TOKEN_POSTCODE)) + sdata.penalty += self.query.get_in_word_penalty(assignment.postcode) if assignment.qualifier: tokens = self.get_qualifier_tokens(assignment.qualifier) if not tokens: return None sdata.set_qualifiers(tokens) + sdata.penalty += self.query.get_in_word_penalty(assignment.qualifier) elif self.details.categories: sdata.qualifiers = dbf.WeightedCategories(self.details.categories, [0.0] * len(self.details.categories)) if assignment.address: - if not assignment.name and assignment.housenumber: - # housenumber search: the first item needs to be handled like - # a name in ranking or penalties are not comparable with - # normal searches. 
From e4b671f8b1047b182500ddc7e9134f02eb3df99b Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Wed, 9 Jul 2025 22:27:12 +0200
Subject: [PATCH 4/8] reinstate penalty for partial only matches

---
 src/nominatim_api/search/db_search_builder.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py
index 34f6b6c2..cd734409 100644
--- a/src/nominatim_api/search/db_search_builder.py
+++ b/src/nominatim_api/search/db_search_builder.py
@@ -287,7 +287,7 @@ class SearchBuilder:
                  for t in name_fulls]
         ranks.sort(key=lambda r: r.penalty)
         # Fallback, sum of penalty for partials
-        default = sum(t.penalty for t in self.query.iter_partials(trange))
+        default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
         default += sum(n.word_break_penalty
                        for n in self.query.nodes[trange.start + 1:trange.end])
         return dbf.FieldRanking(db_field, default, ranks)
@@ -329,7 +329,7 @@ class SearchBuilder:
                 if len(ranks) >= 10:
                     # Too many variants, bail out and only add
                     # Worst-case Fallback: sum of penalty of partials
-                    default = sum(t.penalty for t in self.query.iter_partials(trange))
+                    default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
                     default += sum(n.word_break_penalty
                                    for n in self.query.nodes[trange.start + 1:trange.end])
                     ranks.append(dbf.RankedTokens(rank.penalty + default, []))
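
With this patch the worst-case fallback for a range matched only through partial tokens is back to: the sum of the partial penalties, plus the word-break penalty of every inner node, plus a flat 0.2 that keeps full-word matches ahead. A worked toy calculation (all inputs hypothetical):

    # Hypothetical two-term range: partial penalties and one inner
    # BREAK_WORD node (word_break_penalty 0.1 after patch 3).
    partial_penalties = [0.3, 0.25]
    inner_word_break_penalties = [0.1]

    default = sum(partial_penalties) + 0.2       # flat partial-only malus
    default += sum(inner_word_break_penalties)   # per-transition cost
    assert round(default, 2) == 0.85             # worst-case fallback rank

From 71025f3f4396e390bccfaca6640eb3b8f9c90abf Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Wed, 9 Jul 2025 23:22:20 +0200
Subject: [PATCH 5/8] fix order of address rankings

preferring longest words
---
 src/nominatim_api/search/db_search_builder.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py
index cd734409..7974a0c4 100644
--- a/src/nominatim_api/search/db_search_builder.py
+++ b/src/nominatim_api/search/db_search_builder.py
@@ -301,14 +301,14 @@ class SearchBuilder:
         ranks: List[dbf.RankedTokens] = []
 
         while todo:
-            neglen, pos, rank = heapq.heappop(todo)
+            _, pos, rank = heapq.heappop(todo)
             # partial node
             partial = self.query.nodes[pos].partial
             if partial is not None:
                 if pos + 1 < trange.end:
                     penalty = rank.penalty + partial.penalty \
                               + self.query.nodes[pos + 1].word_break_penalty
-                    heapq.heappush(todo, (neglen - 1, pos + 1,
+                    heapq.heappush(todo, (-(pos + 1), pos + 1,
                                           dbf.RankedTokens(penalty, rank.tokens)))
                 else:
                     ranks.append(dbf.RankedTokens(rank.penalty + partial.penalty,
@@ -321,7 +321,7 @@ class SearchBuilder:
             for tlist in self.query.nodes[pos].starting:
                 if tlist.ttype == qmod.TOKEN_WORD:
                     if tlist.end < trange.end:
                         chgpenalty = self.query.nodes[tlist.end].word_break_penalty \
                                      + self.query.get_in_word_penalty(
                                          qmod.TokenRange(pos, tlist.end))
                         for t in tlist.tokens:
-                            heapq.heappush(todo, (neglen - 1, tlist.end,
+                            heapq.heappush(todo, (-tlist.end, tlist.end,
                                                   rank.with_token(t, chgpenalty)))
                 elif tlist.end == trange.end:
                     ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)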
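
The heap entries in get_addr_ranking() were previously keyed by a decreasing counter (neglen), which advanced the frontier roughly breadth-first. Keying on the negated end position instead pops the states that reach furthest into the range first, so rankings built from the longest words are generated before the ten-variant cutoff hits. The ordering effect in isolation (hypothetical entries):

    import heapq

    # Entries are (key, end_position); heapq pops the smallest key first.
    # Keying by -position advances depth-first along the longest words
    # instead of breadth-first by insertion depth.
    todo = []
    heapq.heappush(todo, (-1, 1))   # one-term word ending at node 1
    heapq.heappush(todo, (-3, 3))   # three-term word ending at node 3

    assert heapq.heappop(todo) == (-3, 3)  # longest extension wins
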
diff --git a/src/nominatim_api/search/token_assignment.py b/src/nominatim_api/search/token_assignment.py index 85c411b9..798ee546 100644 --- a/src/nominatim_api/search/token_assignment.py +++ b/src/nominatim_api/search/token_assignment.py @@ -182,7 +182,7 @@ class _TokenSequence: return None def advance(self, ttype: qmod.TokenType, end_pos: int, - btype: qmod.BreakType) -> Optional['_TokenSequence']: + force_break: bool, break_penalty: float) -> Optional['_TokenSequence']: """ Return a new token sequence state with the given token type extended. """ @@ -195,7 +195,7 @@ class _TokenSequence: new_penalty = 0.0 else: last = self.seq[-1] - if btype != qmod.BREAK_PHRASE and last.ttype == ttype: + if not force_break and last.ttype == ttype: # extend the existing range newseq = self.seq[:-1] + [TypedRange(ttype, last.trange.replace_end(end_pos))] new_penalty = 0.0 @@ -203,7 +203,7 @@ class _TokenSequence: # start a new range newseq = list(self.seq) + [TypedRange(ttype, qmod.TokenRange(last.trange.end, end_pos))] - new_penalty = PENALTY_TOKENCHANGE[btype] + new_penalty = break_penalty return _TokenSequence(newseq, newdir, self.penalty + new_penalty) @@ -307,7 +307,7 @@ class _TokenSequence: name, addr = first.split(i) log().comment(f'split first word = name ({i - first.start})') yield dataclasses.replace(base, name=name, address=[addr] + base.address[1:], - penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype]) + penalty=penalty + query.nodes[i].word_break_penalty) def _get_assignments_address_backward(self, base: TokenAssignment, query: qmod.QueryStruct) -> Iterator[TokenAssignment]: @@ -352,7 +352,7 @@ class _TokenSequence: addr, name = last.split(i) log().comment(f'split last word = name ({i - last.start})') yield dataclasses.replace(base, name=name, address=base.address[:-1] + [addr], - penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype]) + penalty=penalty + query.nodes[i].word_break_penalty) def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]: """ Yield possible assignments for the current sequence. 
@@ -412,12 +412,15 @@ def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment for tlist in node.starting: yield from _append_state_to_todo( query, todo, - state.advance(tlist.ttype, tlist.end, node.btype)) + state.advance(tlist.ttype, tlist.end, + True, node.word_break_penalty)) if node.partial is not None: yield from _append_state_to_todo( query, todo, - state.advance(qmod.TOKEN_PARTIAL, state.end_pos + 1, node.btype)) + state.advance(qmod.TOKEN_PARTIAL, state.end_pos + 1, + node.btype == qmod.BREAK_PHRASE, + node.word_break_penalty)) def _append_state_to_todo(query: qmod.QueryStruct, todo: List[_TokenSequence], From 7ebd121abcf9642ec4fc0a73eafb73495e1361d5 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 9 Jul 2025 22:25:40 +0200 Subject: [PATCH 3/8] give word break slight advantage towards continuation prefers longer words --- src/nominatim_api/search/icu_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nominatim_api/search/icu_tokenizer.py b/src/nominatim_api/search/icu_tokenizer.py index 2bb9ce93..ef6dba28 100644 --- a/src/nominatim_api/search/icu_tokenizer.py +++ b/src/nominatim_api/search/icu_tokenizer.py @@ -42,7 +42,7 @@ PENALTY_BREAK = { qmod.BREAK_END: -0.5, qmod.BREAK_PHRASE: -0.5, qmod.BREAK_SOFT_PHRASE: -0.5, - qmod.BREAK_WORD: 0.0, + qmod.BREAK_WORD: 0.1, qmod.BREAK_PART: 0.2, qmod.BREAK_TOKEN: 0.4 } From e4b671f8b1047b182500ddc7e9134f02eb3df99b Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 9 Jul 2025 22:27:12 +0200 Subject: [PATCH 4/8] reinstate penalty for partial only matches --- src/nominatim_api/search/db_search_builder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py index 34f6b6c2..cd734409 100644 --- a/src/nominatim_api/search/db_search_builder.py +++ b/src/nominatim_api/search/db_search_builder.py @@ -287,7 +287,7 @@ class SearchBuilder: for t in name_fulls] ranks.sort(key=lambda r: r.penalty) # Fallback, sum of penalty for partials - default = sum(t.penalty for t in self.query.iter_partials(trange)) + default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2 default += sum(n.word_break_penalty for n in self.query.nodes[trange.start + 1:trange.end]) return dbf.FieldRanking(db_field, default, ranks) @@ -329,7 +329,7 @@ class SearchBuilder: if len(ranks) >= 10: # Too many variants, bail out and only add # Worst-case Fallback: sum of penalty of partials - default = sum(t.penalty for t in self.query.iter_partials(trange)) + default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2 default += sum(n.word_break_penalty for n in self.query.nodes[trange.start + 1:trange.end]) ranks.append(dbf.RankedTokens(rank.penalty + default, [])) From 71025f3f4396e390bccfaca6640eb3b8f9c90abf Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 9 Jul 2025 23:22:20 +0200 Subject: [PATCH 5/8] fix order of address rankings prefering longest words --- src/nominatim_api/search/db_search_builder.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nominatim_api/search/db_search_builder.py b/src/nominatim_api/search/db_search_builder.py index cd734409..7974a0c4 100644 --- a/src/nominatim_api/search/db_search_builder.py +++ b/src/nominatim_api/search/db_search_builder.py @@ -301,14 +301,14 @@ class SearchBuilder: ranks: List[dbf.RankedTokens] = [] while todo: - neglen, pos, rank = heapq.heappop(todo) + _, pos, rank = heapq.heappop(todo) # 
From fe30663b218b6a5b1679596e8b2ae290b0bf27ca Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Fri, 11 Jul 2025 10:57:35 +0200
Subject: [PATCH 8/8] remove penalty from TokenRanges

The parameter is no longer needed.
---
 src/nominatim_api/search/query.py | 1 -
 1 file changed, 1 deletion(-)
diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py
index 3ce9db21..da7099c0 100644
--- a/src/nominatim_api/search/query.py
+++ b/src/nominatim_api/search/query.py
@@ -134,7 +134,6 @@ class TokenRange:
     """
     start: int
     end: int
-    penalty: Optional[float] = None
 
     def __lt__(self, other: 'TokenRange') -> bool:
         return self.end <= other.start