Compare commits

15 Commits

Author SHA1 Message Date
Sarah Hoffmann
621d8e785b Merge pull request #3779 from lonvia/fix-zero-devision-direction
Fix direction factor computation on empty strings
2025-07-11 14:51:00 +02:00
Sarah Hoffmann
830307484b Merge pull request #3777 from lonvia/harmonize-transition-penalties
Clean up word transition penalty assignment for searches
2025-07-11 14:17:48 +02:00
Sarah Hoffmann
5d6967a1d0 Merge pull request #3778 from lonvia/remove-log-db-setting
Remove defaults and documentations for LOG_DB setting
2025-07-11 14:17:24 +02:00
Sarah Hoffmann
26903aec0b add BDD test for empty queries 2025-07-11 14:16:48 +02:00
Sarah Hoffmann
c39183e3a5 remove any references to website setup or refresh
It no longer exists.
2025-07-11 11:51:49 +02:00
Sarah Hoffmann
21ef3be433 fix direction factor computation on empty strings 2025-07-11 11:25:14 +02:00
Sarah Hoffmann
99562a197e remove LOG_DB setting, not implemented anymore 2025-07-11 11:15:41 +02:00
Sarah Hoffmann
fe30663b21 remove penalty from TokenRanges
The parameter is no longer needed.
2025-07-11 11:01:22 +02:00
Sarah Hoffmann
73ee17af95 adapt tests for new function signatures 2025-07-11 11:01:22 +02:00
Sarah Hoffmann
b9252cc348 reduce maximum number of SQL queries per search 2025-07-11 11:01:22 +02:00
Sarah Hoffmann
71025f3f43 fix order of address rankings preferring longest words 2025-07-11 11:01:21 +02:00
Sarah Hoffmann
e4b671f8b1 reinstate penalty for partial only matches 2025-07-11 11:01:21 +02:00
Sarah Hoffmann
7ebd121abc give word break slight advantage towards continuation
prefers longer words
2025-07-11 11:01:21 +02:00
Sarah Hoffmann
4634ad0720 rebalance word transition penalties 2025-07-11 11:01:21 +02:00
Sarah Hoffmann
4a9253a0a9 simplify QueryNode penalty and initial assignment 2025-07-11 11:01:09 +02:00
11 changed files with 105 additions and 130 deletions

View File

@@ -641,24 +641,6 @@ See also [NOMINATIM_DEFAULT_LANGUAGE](#nominatim_default_language).
 
 ### Logging Settings
 
-#### NOMINATIM_LOG_DB
-
-| Summary | |
-| -------------- | --------------------------------------------------- |
-| **Description:** | Log requests into the database |
-| **Format:** | boolean |
-| **Default:** | no |
-| **After Changes:** | run `nominatim refresh --website` |
-
-Enable logging requests into a database table with this setting. The logs
-can be found in the table `new_query_log`.
-
-When using this logging method, it is advisable to set up a job that
-regularly clears out old logging information. Nominatim will not do that
-on its own.
-
-Can be used as the same time as NOMINATIM_LOG_FILE.
-
 #### NOMINATIM_LOG_FILE
 
 | Summary | |
@@ -682,8 +664,6 @@ given in seconds and includes the entire time the query was queued and executed
 in the frontend.
 type contains the name of the endpoint used.
 
-Can be used as the same time as NOMINATIM_LOG_DB.
-
 #### NOMINATIM_DEBUG_SQL
 
 | Summary | |

View File

@@ -5,7 +5,6 @@
 # Database connection string.
 # Add host, port, user etc through additional semicolon-separated attributes.
 # e.g. ;host=...;port=...;user=...;password=...
-# Changing this variable requires to run 'nominatim refresh --website'.
 NOMINATIM_DATABASE_DSN="pgsql:dbname=nominatim"
 
 # Database web user.
@@ -36,11 +35,11 @@ NOMINATIM_TOKENIZER_CONFIG=
 
 # Search in the Tiger house number data for the US.
 # Note: The tables must already exist or queries will throw errors.
-# Changing this value requires to run ./utils/setup --create-functions --setup-website.
+# Changing this value requires to run ./utils/setup --create-functions.
 NOMINATIM_USE_US_TIGER_DATA=no
 
 # Search in the auxiliary housenumber table.
-# Changing this value requires to run ./utils/setup --create-functions --setup-website.
+# Changing this value requires to run ./utils/setup --create-functions.
 NOMINATIM_USE_AUX_LOCATION_DATA=no
 
 # Proxy settings
@@ -143,8 +142,7 @@ NOMINATIM_REPLICATION_RECHECK_INTERVAL=60
 
 ### API settings
 #
-# The following settings configure the API responses. You must rerun
-# 'nominatim refresh --website' after changing any of them.
+# The following settings configure the API responses.
 
 # Send permissive CORS access headers.
 # When enabled, send CORS headers to allow access to everybody.
@@ -202,13 +200,7 @@ NOMINATIM_OUTPUT_NAMES=name:XX,name,brand,official_name:XX,short_name:XX,officia
 
 ### Log settings
 #
 # The following options allow to enable logging of API requests.
-# You must rerun 'nominatim refresh --website' after changing any of them.
 #
-# Enable logging of requests into the DB.
-# The request will be logged into the new_query_log table.
-# You should set up a cron job that regularly clears out this table.
-NOMINATIM_LOG_DB=no
-
 # Enable logging of requests into a file.
 # To enable logging set this setting to the file to log to.
 NOMINATIM_LOG_FILE=

View File

@@ -282,10 +282,14 @@ class SearchBuilder:
         """ Create a ranking expression for a name term in the given range.
         """
         name_fulls = self.query.get_tokens(trange, qmod.TOKEN_WORD)
-        ranks = [dbf.RankedTokens(t.penalty, [t.token]) for t in name_fulls]
+        full_word_penalty = self.query.get_in_word_penalty(trange)
+        ranks = [dbf.RankedTokens(t.penalty + full_word_penalty, [t.token])
+                 for t in name_fulls]
         ranks.sort(key=lambda r: r.penalty)
         # Fallback, sum of penalty for partials
         default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
+        default += sum(n.word_break_penalty
+                       for n in self.query.nodes[trange.start + 1:trange.end])
         return dbf.FieldRanking(db_field, default, ranks)
 
     def get_addr_ranking(self, trange: qmod.TokenRange) -> dbf.FieldRanking:
@@ -297,14 +301,14 @@
         ranks: List[dbf.RankedTokens] = []
 
         while todo:
-            neglen, pos, rank = heapq.heappop(todo)
+            _, pos, rank = heapq.heappop(todo)
 
             # partial node
             partial = self.query.nodes[pos].partial
             if partial is not None:
                 if pos + 1 < trange.end:
                     penalty = rank.penalty + partial.penalty \
-                              + PENALTY_WORDCHANGE[self.query.nodes[pos + 1].btype]
-                    heapq.heappush(todo, (neglen - 1, pos + 1,
+                              + self.query.nodes[pos + 1].word_break_penalty
+                    heapq.heappush(todo, (-(pos + 1), pos + 1,
                                           dbf.RankedTokens(penalty, rank.tokens)))
                 else:
                     ranks.append(dbf.RankedTokens(rank.penalty + partial.penalty,
@@ -313,9 +317,11 @@
             for tlist in self.query.nodes[pos].starting:
                 if tlist.ttype == qmod.TOKEN_WORD:
                     if tlist.end < trange.end:
-                        chgpenalty = PENALTY_WORDCHANGE[self.query.nodes[tlist.end].btype]
+                        chgpenalty = self.query.nodes[tlist.end].word_break_penalty \
+                                     + self.query.get_in_word_penalty(
+                                           qmod.TokenRange(pos, tlist.end))
                         for t in tlist.tokens:
-                            heapq.heappush(todo, (neglen - 1, tlist.end,
+                            heapq.heappush(todo, (-tlist.end, tlist.end,
                                                   rank.with_token(t, chgpenalty)))
                     elif tlist.end == trange.end:
                         ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)
@@ -324,6 +330,8 @@
                 # Too many variants, bail out and only add
                 # Worst-case Fallback: sum of penalty of partials
                 default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
+                default += sum(n.word_break_penalty
+                               for n in self.query.nodes[trange.start + 1:trange.end])
                 ranks.append(dbf.RankedTokens(rank.penalty + default, []))
                 # Bail out of outer loop
                 break
@@ -346,6 +354,7 @@
             if not tokens:
                 return None
             sdata.set_strings('countries', tokens)
+            sdata.penalty += self.query.get_in_word_penalty(assignment.country)
         elif self.details.countries:
             sdata.countries = dbf.WeightedStrings(self.details.countries,
                                                   [0.0] * len(self.details.countries))
@@ -353,28 +362,23 @@
             sdata.set_strings('housenumbers',
                               self.query.get_tokens(assignment.housenumber,
                                                     qmod.TOKEN_HOUSENUMBER))
+            sdata.penalty += self.query.get_in_word_penalty(assignment.housenumber)
         if assignment.postcode:
             sdata.set_strings('postcodes',
                               self.query.get_tokens(assignment.postcode,
                                                     qmod.TOKEN_POSTCODE))
+            sdata.penalty += self.query.get_in_word_penalty(assignment.postcode)
         if assignment.qualifier:
             tokens = self.get_qualifier_tokens(assignment.qualifier)
             if not tokens:
                 return None
             sdata.set_qualifiers(tokens)
+            sdata.penalty += self.query.get_in_word_penalty(assignment.qualifier)
         elif self.details.categories:
             sdata.qualifiers = dbf.WeightedCategories(self.details.categories,
                                                       [0.0] * len(self.details.categories))
 
         if assignment.address:
-            if not assignment.name and assignment.housenumber:
-                # housenumber search: the first item needs to be handled like
-                # a name in ranking or penalties are not comparable with
-                # normal searches.
-                sdata.set_ranking([self.get_name_ranking(assignment.address[0],
-                                                         db_field='nameaddress_vector')]
-                                  + [self.get_addr_ranking(r) for r in assignment.address[1:]])
-            else:
-                sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
+            sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
         else:
             sdata.rankings = []
@@ -421,14 +425,3 @@
                 return dbf.WeightedCategories(list(tokens.keys()), list(tokens.values()))
 
         return None
-
-
-PENALTY_WORDCHANGE = {
-    qmod.BREAK_START: 0.0,
-    qmod.BREAK_END: 0.0,
-    qmod.BREAK_PHRASE: 0.0,
-    qmod.BREAK_SOFT_PHRASE: 0.0,
-    qmod.BREAK_WORD: 0.1,
-    qmod.BREAK_PART: 0.2,
-    qmod.BREAK_TOKEN: 0.4
-}
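
The reworked fallback ranking is easiest to see in isolation. The sketch below only mirrors the two-line `default` computation added above; the `Node` stand-in and all numbers are made up for illustration and are not part of the SearchBuilder API.

```python
from dataclasses import dataclass
from typing import List


@dataclass
class Node:
    """ Stand-in for QueryNode: only the signed break penalty matters here. """
    penalty: float

    @property
    def word_break_penalty(self) -> float:
        # cost of ending a word at this node (negative penalties cost nothing)
        return max(0, self.penalty)


def fallback_default(partial_penalties: List[float], nodes: List[Node],
                     start: int, end: int) -> float:
    """ Worst-case penalty for a token range: penalties of the partial
        tokens, a flat 0.2, plus the word-break penalty of every inner node.
    """
    default = sum(partial_penalties) + 0.2
    default += sum(n.word_break_penalty for n in nodes[start + 1:end])
    return default


# hypothetical three-term range covering nodes 0..3 with two word breaks inside
nodes = [Node(-0.5), Node(0.1), Node(0.1), Node(-0.5)]
print(round(fallback_default([0.3, 0.05, 0.3], nodes, start=0, end=3), 2))  # 1.05
```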

View File

@@ -83,7 +83,7 @@ class ForwardGeocoder:
         min_ranking = searches[0].penalty + 2.0
         prev_penalty = 0.0
         for i, search in enumerate(searches):
-            if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 20):
+            if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 15):
                 break
             log().table_dump(f"{i + 1}. Search", _dump_searches([search], query))
         log().var_dump('Params', self.params)

View File

@@ -37,14 +37,14 @@ DB_TO_TOKEN_TYPE = {
     'C': qmod.TOKEN_COUNTRY
 }
 
-PENALTY_IN_TOKEN_BREAK = {
-    qmod.BREAK_START: 0.5,
-    qmod.BREAK_END: 0.5,
-    qmod.BREAK_PHRASE: 0.5,
-    qmod.BREAK_SOFT_PHRASE: 0.5,
+PENALTY_BREAK = {
+    qmod.BREAK_START: -0.5,
+    qmod.BREAK_END: -0.5,
+    qmod.BREAK_PHRASE: -0.5,
+    qmod.BREAK_SOFT_PHRASE: -0.5,
     qmod.BREAK_WORD: 0.1,
-    qmod.BREAK_PART: 0.0,
-    qmod.BREAK_TOKEN: 0.0
+    qmod.BREAK_PART: 0.2,
+    qmod.BREAK_TOKEN: 0.4
 }
@@ -78,13 +78,13 @@ class ICUToken(qmod.Token):
         self.penalty += (distance/len(self.lookup_word))
 
     @staticmethod
-    def from_db_row(row: SaRow, base_penalty: float = 0.0) -> 'ICUToken':
+    def from_db_row(row: SaRow) -> 'ICUToken':
         """ Create a ICUToken from the row of the word table.
         """
         count = 1 if row.info is None else row.info.get('count', 1)
         addr_count = 1 if row.info is None else row.info.get('addr_count', 1)
 
-        penalty = base_penalty
+        penalty = 0.0
         if row.type == 'w':
             penalty += 0.3
         elif row.type == 'W':
@@ -174,11 +174,14 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         self.split_query(query)
         log().var_dump('Transliterated query', lambda: query.get_transliterated_query())
 
-        words = query.extract_words(base_penalty=PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD])
+        words = query.extract_words()
 
         for row in await self.lookup_in_db(list(words.keys())):
             for trange in words[row.word_token]:
-                token = ICUToken.from_db_row(row, trange.penalty or 0.0)
+                # Create a new token for each position because the token
+                # penalty can vary depending on the position in the query.
+                # (See rerank_tokens() below.)
+                token = ICUToken.from_db_row(row)
                 if row.type == 'S':
                     if row.info['op'] in ('in', 'near'):
                         if trange.start == 0:
@@ -200,6 +203,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                                                  lookup_word=pc, word_token=term,
                                                  info=None))
 
         self.rerank_tokens(query)
+        self.compute_break_penalties(query)
 
         log().table_dump('Word tokens', _dump_word_tokens(query))
@@ -229,13 +233,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                 if trans:
                     for term in trans.split(' '):
                         if term:
-                            query.add_node(qmod.BREAK_TOKEN, phrase.ptype,
-                                           PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN],
-                                           term, word)
-                    query.nodes[-1].adjust_break(breakchar,
-                                                 PENALTY_IN_TOKEN_BREAK[breakchar])
+                            query.add_node(qmod.BREAK_TOKEN, phrase.ptype, term, word)
+                    query.nodes[-1].btype = breakchar
 
-        query.nodes[-1].adjust_break(qmod.BREAK_END, PENALTY_IN_TOKEN_BREAK[qmod.BREAK_END])
+        query.nodes[-1].btype = qmod.BREAK_END
@@ -300,6 +301,12 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         for token in tokens:
             cast(ICUToken, token).rematch(norm)
 
+    def compute_break_penalties(self, query: qmod.QueryStruct) -> None:
+        """ Set the break penalties for the nodes in the query.
+        """
+        for node in query.nodes:
+            node.penalty = PENALTY_BREAK[node.btype]
+
 
 def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
     yield ['type', 'from', 'to', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
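
A note on reading the new PENALTY_BREAK values: the sign encodes a preference. Together with the `word_break_penalty`/`word_continuation_penalty` properties added to QueryNode further down, a positive value is the cost of ending a word at that break, while a negative value is the cost of continuing a word across it. A minimal sketch, where the string keys merely stand in for the `qmod.BREAK_*` constants:

```python
# stand-in keys for qmod.BREAK_START, BREAK_END, BREAK_PHRASE, ...
PENALTY_BREAK = {
    'start': -0.5, 'end': -0.5, 'phrase': -0.5, 'soft_phrase': -0.5,
    'word': 0.1, 'part': 0.2, 'token': 0.4,
}

for btype, penalty in PENALTY_BREAK.items():
    word_break = max(0, penalty)      # cost of ending a word at this break
    continuation = max(0, -penalty)   # cost of a word spanning this break
    print(f'{btype:12} break={word_break:.1f}  continuation={continuation:.1f}')

# start/end/phrase/soft_phrase: ending a word is free, spanning it costs 0.5
# word/part/token: spanning is free, ending a word costs 0.1/0.2/0.4
```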

View File

@@ -134,7 +134,6 @@ class TokenRange:
     """
     start: int
     end: int
-    penalty: Optional[float] = None
 
     def __lt__(self, other: 'TokenRange') -> bool:
         return self.end <= other.start
@@ -191,7 +190,9 @@ class QueryNode:
     ptype: PhraseType
 
     penalty: float
-    """ Penalty for the break at this node.
+    """ Penalty for having a word break at this position. The penalty
+        may be negative, when a word break is more likely than continuing
+        the word after the node.
     """
     term_lookup: str
     """ Transliterated term ending at this node.
@@ -212,6 +213,19 @@
         types of tokens spanning over the gap.
     """
 
+    @property
+    def word_break_penalty(self) -> float:
+        """ Penalty to apply when a words ends at this node.
+        """
+        return max(0, self.penalty)
+
+    @property
+    def word_continuation_penalty(self) -> float:
+        """ Penalty to apply when a word continues over this node
+            (i.e. is a multi-term word).
+        """
+        return max(0, -self.penalty)
+
     def name_address_ratio(self) -> float:
         """ Return the propability that the partial token belonging to
             this node forms part of a name (as opposed of part of the address).
@@ -221,12 +235,6 @@
         return self.partial.count / (self.partial.count + self.partial.addr_count)
 
-    def adjust_break(self, btype: BreakType, penalty: float) -> None:
-        """ Change the break type and penalty for this node.
-        """
-        self.btype = btype
-        self.penalty = penalty
-
     def has_tokens(self, end: int, *ttypes: TokenType) -> bool:
         """ Check if there are tokens of the given types ending at the
             given node.
@@ -286,13 +294,12 @@
         return len(self.nodes) - 1
 
     def add_node(self, btype: BreakType, ptype: PhraseType,
-                 break_penalty: float = 0.0,
                  term_lookup: str = '', term_normalized: str = '') -> None:
         """ Append a new break node with the given break type.
             The phrase type denotes the type for any tokens starting
             at the node.
         """
-        self.nodes.append(QueryNode(btype, ptype, break_penalty, term_lookup, term_normalized))
+        self.nodes.append(QueryNode(btype, ptype, 0.0, term_lookup, term_normalized))
 
     def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
         """ Add a token to the query. 'start' and 'end' are the indexes of the
@@ -324,7 +331,7 @@
             of each node.
         """
         n = len(self.nodes) - 1
-        if n == 1 or n >= 50:
+        if n <= 1 or n >= 50:
             self.dir_penalty = 0
         elif n == 2:
             self.dir_penalty = (self.nodes[1].name_address_ratio()
@@ -344,6 +351,13 @@
             assert ttype != TOKEN_PARTIAL
         return self.nodes[trange.start].get_tokens(trange.end, ttype) or []
 
+    def get_in_word_penalty(self, trange: TokenRange) -> float:
+        """ Gets the sum of penalties for all token transitions
+            within the given range.
+        """
+        return sum(n.word_continuation_penalty
+                   for n in self.nodes[trange.start + 1:trange.end])
+
     def iter_partials(self, trange: TokenRange) -> Iterator[Token]:
         """ Iterate over the partial tokens between the given nodes.
             Missing partials are ignored.
@@ -386,17 +400,14 @@
         """
         return ''.join(''.join((n.term_lookup, n.btype)) for n in self.nodes)
 
-    def extract_words(self, base_penalty: float = 0.0,
-                      start: int = 0,
+    def extract_words(self, start: int = 0,
                       endpos: Optional[int] = None) -> Dict[str, List[TokenRange]]:
         """ Add all combinations of words that can be formed from the terms
             between the given start and endnode. The terms are joined with
             spaces for each break. Words can never go across a BREAK_PHRASE.
 
             The functions returns a dictionary of possible words with their
-            position within the query and a penalty. The penalty is computed
-            from the base_penalty plus the penalty for each node the word
-            crosses.
+            position within the query.
         """
         if endpos is None:
             endpos = len(self.nodes)
@@ -405,16 +416,13 @@
         for first, first_node in enumerate(self.nodes[start + 1:endpos], start):
             word = first_node.term_lookup
-            penalty = base_penalty
-            words[word].append(TokenRange(first, first + 1, penalty=penalty))
+            words[word].append(TokenRange(first, first + 1))
             if first_node.btype != BREAK_PHRASE:
-                penalty += first_node.penalty
                 max_last = min(first + 20, endpos)
                 for last, last_node in enumerate(self.nodes[first + 2:max_last], first + 2):
                     word = ' '.join((word, last_node.term_lookup))
-                    words[word].append(TokenRange(first, last, penalty=penalty))
+                    words[word].append(TokenRange(first, last))
                     if last_node.btype == BREAK_PHRASE:
                         break
-                    penalty += last_node.penalty
 
         return words
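
For illustration, here is a self-contained approximation of the slimmed-down `extract_words()`: it now records only word positions (no penalties), joins consecutive terms with spaces, never extends past a phrase break and caps multi-term words at roughly 20 terms. It works on plain `(term, break type)` pairs instead of QueryNode objects and is not the real implementation.

```python
from collections import defaultdict
from typing import Dict, List, Tuple

BREAK_PHRASE = ','   # stand-in for qmod.BREAK_PHRASE


def extract_words(nodes: List[Tuple[str, str]]) -> Dict[str, List[Tuple[int, int]]]:
    """ Return every candidate word with its (start, end) node positions.
        nodes[i] is the (term, break type) of the term ending at node i + 1.
    """
    words: Dict[str, List[Tuple[int, int]]] = defaultdict(list)
    for first, (term, btype) in enumerate(nodes):
        word = term
        words[word].append((first, first + 1))
        if btype == BREAK_PHRASE:
            continue                      # never extend across a phrase break
        for last in range(first + 1, min(first + 20, len(nodes))):
            next_term, next_btype = nodes[last]
            word = ' '.join((word, next_term))
            words[word].append((first, last + 1))
            if next_btype == BREAK_PHRASE:
                break
    return words


# 'rue de' is offered as a multi-term word, but nothing extends past the
# phrase break after 'de', so 'de paris' is not.
print(dict(extract_words([('rue', ' '), ('de', ','), ('paris', '>')])))
```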

View File

@@ -23,16 +23,6 @@ class TypedRange:
     trange: qmod.TokenRange
 
 
-PENALTY_TOKENCHANGE = {
-    qmod.BREAK_START: 0.0,
-    qmod.BREAK_END: 0.0,
-    qmod.BREAK_PHRASE: 0.0,
-    qmod.BREAK_SOFT_PHRASE: 0.0,
-    qmod.BREAK_WORD: 0.1,
-    qmod.BREAK_PART: 0.2,
-    qmod.BREAK_TOKEN: 0.4
-}
-
 TypedRangeSeq = List[TypedRange]
@@ -192,7 +182,7 @@ class _TokenSequence:
         return None
 
     def advance(self, ttype: qmod.TokenType, end_pos: int,
-                btype: qmod.BreakType) -> Optional['_TokenSequence']:
+                force_break: bool, break_penalty: float) -> Optional['_TokenSequence']:
         """ Return a new token sequence state with the given token type
             extended.
         """
@@ -205,7 +195,7 @@
             new_penalty = 0.0
         else:
             last = self.seq[-1]
-            if btype != qmod.BREAK_PHRASE and last.ttype == ttype:
+            if not force_break and last.ttype == ttype:
                 # extend the existing range
                 newseq = self.seq[:-1] + [TypedRange(ttype, last.trange.replace_end(end_pos))]
                 new_penalty = 0.0
@@ -213,7 +203,7 @@
                 # start a new range
                 newseq = list(self.seq) + [TypedRange(ttype,
                                                       qmod.TokenRange(last.trange.end, end_pos))]
-                new_penalty = PENALTY_TOKENCHANGE[btype]
+                new_penalty = break_penalty
 
         return _TokenSequence(newseq, newdir, self.penalty + new_penalty)
@@ -317,7 +307,7 @@
                 name, addr = first.split(i)
                 log().comment(f'split first word = name ({i - first.start})')
                 yield dataclasses.replace(base, name=name, address=[addr] + base.address[1:],
-                                          penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
+                                          penalty=penalty + query.nodes[i].word_break_penalty)
 
     def _get_assignments_address_backward(self, base: TokenAssignment,
                                           query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
@@ -362,7 +352,7 @@
                 addr, name = last.split(i)
                 log().comment(f'split last word = name ({i - last.start})')
                 yield dataclasses.replace(base, name=name, address=base.address[:-1] + [addr],
-                                          penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
+                                          penalty=penalty + query.nodes[i].word_break_penalty)
 
     def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
         """ Yield possible assignments for the current sequence.
@@ -422,12 +412,15 @@ def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment
         for tlist in node.starting:
             yield from _append_state_to_todo(
                 query, todo,
-                state.advance(tlist.ttype, tlist.end, node.btype))
+                state.advance(tlist.ttype, tlist.end,
+                              True, node.word_break_penalty))
 
         if node.partial is not None:
             yield from _append_state_to_todo(
                 query, todo,
-                state.advance(qmod.TOKEN_PARTIAL, state.end_pos + 1, node.btype))
+                state.advance(qmod.TOKEN_PARTIAL, state.end_pos + 1,
+                              node.btype == qmod.BREAK_PHRASE,
+                              node.word_break_penalty))
 
 
 def _append_state_to_todo(query: qmod.QueryStruct, todo: List[_TokenSequence],
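
The core rule of the reworked `advance()` condenses to a few lines: when the token type is unchanged and no break is forced, the current range is extended for free; otherwise a new range is opened and the break penalty of the node is paid. The tuple-based state below is a simplification for illustration and omits the direction handling and validity checks of `_TokenSequence`.

```python
from typing import List, Tuple

# state: list of (token type, start, end) ranges plus the accumulated penalty
SeqState = Tuple[List[Tuple[str, int, int]], float]


def advance(state: SeqState, ttype: str, end_pos: int,
            force_break: bool, break_penalty: float) -> SeqState:
    ranges, penalty = state
    if not ranges:
        return [(ttype, 0, end_pos)], penalty
    last_ttype, last_start, last_end = ranges[-1]
    if not force_break and last_ttype == ttype:
        # extend the existing range, no extra penalty
        return ranges[:-1] + [(ttype, last_start, end_pos)], penalty
    # start a new range and pay the word-break penalty of the node
    return ranges + [(ttype, last_end, end_pos)], penalty + break_penalty


state: SeqState = ([], 0.0)
state = advance(state, 'partial', 1, False, 0.0)
state = advance(state, 'partial', 2, False, 0.1)   # same type, no forced break: free
state = advance(state, 'postcode', 3, True, 0.1)   # new range: +0.1
print(state)   # ([('partial', 0, 2), ('postcode', 2, 3)], 0.1)
```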

View File

@@ -21,6 +21,7 @@ Feature: Simple Tests
             | %#$@*&l;der#$! |
             | 234.23.14.5 |
             | aussenstelle universitat lichtenstein wachterhaus aussenstelle universitat lichtenstein wachterhaus aussenstelle universitat lichtenstein wachterhaus aussenstelle universitat lichtenstein wachterhaus |
+            | . |
 
     Scenario: Empty XML search
         When sending v1/search with format xml

View File

@@ -68,7 +68,7 @@ def mk_query(inp):
     phrase_split = re.split(r"([ ,:'-])", inp)
 
     for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue='>'):
-        query.add_node(breakchar, PHRASE_ANY, 0.1, word, word)
+        query.add_node(breakchar, PHRASE_ANY, word, word)
 
     return query
@@ -153,9 +153,9 @@ def test_postcode_inside_postcode_phrase(pc_config):
 
     query = QueryStruct([])
     query.nodes[-1].ptype = PHRASE_STREET
-    query.add_node(',', PHRASE_STREET, 0.1, '12345', '12345')
-    query.add_node(',', PHRASE_POSTCODE, 0.1, 'xz', 'xz')
-    query.add_node('>', PHRASE_POSTCODE, 0.1, '4444', '4444')
+    query.add_node(',', PHRASE_STREET, '12345', '12345')
+    query.add_node(',', PHRASE_POSTCODE, 'xz', 'xz')
+    query.add_node('>', PHRASE_POSTCODE, '4444', '4444')
 
     assert parser.parse(query) == {(2, 3, '4444')}
@@ -165,7 +165,7 @@ def test_partial_postcode_in_postcode_phrase(pc_config):
 
     query = QueryStruct([])
     query.nodes[-1].ptype = PHRASE_POSTCODE
-    query.add_node(' ', PHRASE_POSTCODE, 0.1, '2224', '2224')
-    query.add_node('>', PHRASE_POSTCODE, 0.1, '12345', '12345')
+    query.add_node(' ', PHRASE_POSTCODE, '2224', '2224')
+    query.add_node('>', PHRASE_POSTCODE, '12345', '12345')
 
     assert not parser.parse(query)

View File

@@ -51,15 +51,15 @@ def test_token_range_unimplemented_ops():
 
 def test_query_extract_words():
     q = nq.QueryStruct([])
-    q.add_node(nq.BREAK_WORD, nq.PHRASE_ANY, 0.1, '12', '')
-    q.add_node(nq.BREAK_TOKEN, nq.PHRASE_ANY, 0.0, 'ab', '')
-    q.add_node(nq.BREAK_PHRASE, nq.PHRASE_ANY, 0.0, '12', '')
-    q.add_node(nq.BREAK_END, nq.PHRASE_ANY, 0.5, 'hallo', '')
+    q.add_node(nq.BREAK_WORD, nq.PHRASE_ANY, '12', '')
+    q.add_node(nq.BREAK_TOKEN, nq.PHRASE_ANY, 'ab', '')
+    q.add_node(nq.BREAK_PHRASE, nq.PHRASE_ANY, '12', '')
+    q.add_node(nq.BREAK_END, nq.PHRASE_ANY, 'hallo', '')
 
-    words = q.extract_words(base_penalty=1.0)
+    words = q.extract_words()
 
     assert set(words.keys()) \
         == {'12', 'ab', 'hallo', '12 ab', 'ab 12', '12 ab 12'}
 
-    assert sorted(words['12']) == [nq.TokenRange(0, 1, 1.0), nq.TokenRange(2, 3, 1.0)]
-    assert words['12 ab'] == [nq.TokenRange(0, 2, 1.1)]
-    assert words['hallo'] == [nq.TokenRange(3, 4, 1.0)]
+    assert sorted(words['12']) == [nq.TokenRange(0, 1), nq.TokenRange(2, 3)]
+    assert words['12 ab'] == [nq.TokenRange(0, 2)]
+    assert words['hallo'] == [nq.TokenRange(3, 4)]

View File

@@ -12,8 +12,8 @@ import pytest
 
 from nominatim_api.search.query import QueryStruct, Phrase, TokenRange, Token
 import nominatim_api.search.query as qmod
 from nominatim_api.search.token_assignment import (yield_token_assignments,
-                                                   TokenAssignment,
-                                                   PENALTY_TOKENCHANGE)
+                                                   TokenAssignment)
+from nominatim_api.search.icu_tokenizer import PENALTY_BREAK
 
 
 class MyToken(Token):
@@ -28,6 +28,7 @@ def make_query(*args):
 
     for btype, ptype, _ in args[1:]:
         q.add_node(btype, ptype)
+        q.nodes[-1].penalty = PENALTY_BREAK[btype]
     q.add_node(qmod.BREAK_END, qmod.PHRASE_ANY)
 
     for start, t in enumerate(args):
@@ -94,7 +95,7 @@ def test_multiple_simple_words(btype):
                    (btype, qmod.PHRASE_ANY, [(2, qmod.TOKEN_PARTIAL)]),
                    (btype, qmod.PHRASE_ANY, [(3, qmod.TOKEN_PARTIAL)]))
 
-    penalty = PENALTY_TOKENCHANGE[btype]
+    penalty = PENALTY_BREAK[btype]
 
     check_assignments(yield_token_assignments(q),
                       TokenAssignment(name=TokenRange(0, 3)),