replace BreakType enum with simple char constants

2025-02-21 09:57:48 +01:00
parent 9bf1428d81
commit 4577669213
8 changed files with 150 additions and 147 deletions
--- a/src/nominatim_api/search/db_search_builder.py
+++ b/src/nominatim_api/search/db_search_builder.py
@@ -429,11 +429,11 @@ class SearchBuilder:


 PENALTY_WORDCHANGE = {
-    qmod.BreakType.START: 0.0,
-    qmod.BreakType.END: 0.0,
-    qmod.BreakType.PHRASE: 0.0,
-    qmod.BreakType.SOFT_PHRASE: 0.0,
-    qmod.BreakType.WORD: 0.1,
-    qmod.BreakType.PART: 0.2,
-    qmod.BreakType.TOKEN: 0.4
+    qmod.BREAK_START: 0.0,
+    qmod.BREAK_END: 0.0,
+    qmod.BREAK_PHRASE: 0.0,
+    qmod.BREAK_SOFT_PHRASE: 0.0,
+    qmod.BREAK_WORD: 0.1,
+    qmod.BREAK_PART: 0.2,
+    qmod.BREAK_TOKEN: 0.4
 }
--- a/src/nominatim_api/search/icu_tokenizer.py
+++ b/src/nominatim_api/search/icu_tokenizer.py
@@ -37,13 +37,13 @@ DB_TO_TOKEN_TYPE = {
 }

 PENALTY_IN_TOKEN_BREAK = {
-     qmod.BreakType.START: 0.5,
-     qmod.BreakType.END: 0.5,
-     qmod.BreakType.PHRASE: 0.5,
-     qmod.BreakType.SOFT_PHRASE: 0.5,
-     qmod.BreakType.WORD: 0.1,
-     qmod.BreakType.PART: 0.0,
-     qmod.BreakType.TOKEN: 0.0
+     qmod.BREAK_START: 0.5,
+     qmod.BREAK_END: 0.5,
+     qmod.BREAK_PHRASE: 0.5,
+     qmod.BREAK_SOFT_PHRASE: 0.5,
+     qmod.BREAK_WORD: 0.1,
+     qmod.BREAK_PART: 0.0,
+     qmod.BREAK_TOKEN: 0.0
 }


@@ -72,7 +72,7 @@ def extract_words(terms: List[QueryPart], start: int,  words: WordDict) -> None:
        given position to the word list.
    """
    total = len(terms)
-    base_penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType.WORD]
+    base_penalty = PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD]
    for first in range(start, total):
        word = terms[first].token
        penalty = base_penalty
@@ -273,15 +273,15 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                    for term in trans.split(' '):
                        if term:
                            parts.append(QueryPart(term, word,
-                                                   PENALTY_IN_TOKEN_BREAK[qmod.BreakType.TOKEN]))
-                            query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
-                    query.nodes[-1].btype = qmod.BreakType(breakchar)
-                    parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType(breakchar)]
+                                                   PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN]))
+                            query.add_node(qmod.BREAK_TOKEN, phrase.ptype)
+                    query.nodes[-1].btype = breakchar
+                    parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[breakchar]

            extract_words(parts, phrase_start, words)

            phrase_start = len(parts)
-        query.nodes[-1].btype = qmod.BreakType.END
+        query.nodes[-1].btype = qmod.BREAK_END

        return parts, words

@@ -322,16 +322,16 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
            elif tlist.ttype not in (qmod.TokenType.COUNTRY, qmod.TokenType.PARTIAL):
                norm = parts[i].normalized
                for j in range(i + 1, tlist.end):
-                    if node.btype != qmod.BreakType.TOKEN:
+                    if node.btype != qmod.BREAK_TOKEN:
                        norm += '  ' + parts[j].normalized
                for token in tlist.tokens:
                    cast(ICUToken, token).rematch(norm)


 def _dump_transliterated(query: qmod.QueryStruct, parts: QueryParts) -> str:
-    out = query.nodes[0].btype.value
+    out = query.nodes[0].btype
    for node, part in zip(query.nodes[1:], parts):
-        out += part.token + node.btype.value
+        out += part.token + node.btype
    return out


--- a/src/nominatim_api/search/query.py
+++ b/src/nominatim_api/search/query.py
@@ -13,29 +13,29 @@ import dataclasses
 import enum


-class BreakType(enum.Enum):
-    """ Type of break between tokens.
-    """
-    START = '<'
-    """ Begin of the query. """
-    END = '>'
-    """ End of the query. """
-    PHRASE = ','
-    """ Hard break between two phrases. Address parts cannot cross hard
-        phrase boundaries."""
-    SOFT_PHRASE = ':'
-    """ Likely break between two phrases. Address parts should not cross soft
-        phrase boundaries. Soft breaks can be inserted by a preprocessor
-        that is analysing the input string.
-    """
-    WORD = ' '
-    """ Break between words. """
-    PART = '-'
-    """ Break inside a word, for example a hyphen or apostrophe. """
-    TOKEN = '`'
-    """ Break created as a result of tokenization.
-        This may happen in languages without spaces between words.
-    """
+BreakType = str
+""" Type of break between tokens.
+"""
+BREAK_START = '<'
+""" Beginning of the query. """
+BREAK_END = '>'
+""" End of the query. """
+BREAK_PHRASE = ','
+""" Hard break between two phrases. Address parts cannot cross hard
+    phrase boundaries."""
+BREAK_SOFT_PHRASE = ':'
+""" Likely break between two phrases. Address parts should not cross soft
+    phrase boundaries. Soft breaks can be inserted by a preprocessor
+    that is analysing the input string.
+"""
+BREAK_WORD = ' '
+""" Break between words. """
+BREAK_PART = '-'
+""" Break inside a word, for example a hyphen or apostrophe. """
+BREAK_TOKEN = '`'
+""" Break created as a result of tokenization.
+    This may happen in languages without spaces between words.
+"""


 class TokenType(enum.Enum):
@@ -218,7 +218,7 @@ class QueryStruct:
    def __init__(self, source: List[Phrase]) -> None:
        self.source = source
        self.nodes: List[QueryNode] = \
-            [QueryNode(BreakType.START, source[0].ptype if source else PhraseType.NONE)]
+            [QueryNode(BREAK_START, source[0].ptype if source else PhraseType.NONE)]

    def num_token_slots(self) -> int:
        """ Return the length of the query in vertice steps.
@@ -243,8 +243,8 @@ class QueryStruct:
            be added to, then the token is silently dropped.
        """
        snode = self.nodes[trange.start]
-        full_phrase = snode.btype in (BreakType.START, BreakType.PHRASE)\
-            and self.nodes[trange.end].btype in (BreakType.PHRASE, BreakType.END)
+        full_phrase = snode.btype in (BREAK_START, BREAK_PHRASE)\
+            and self.nodes[trange.end].btype in (BREAK_PHRASE, BREAK_END)
        if snode.ptype.compatible_with(ttype, full_phrase):
            tlist = snode.get_tokens(trange.end, ttype)
            if tlist is None:
--- a/src/nominatim_api/search/token_assignment.py
+++ b/src/nominatim_api/search/token_assignment.py
@@ -24,13 +24,13 @@ class TypedRange:


 PENALTY_TOKENCHANGE = {
-    qmod.BreakType.START: 0.0,
-    qmod.BreakType.END: 0.0,
-    qmod.BreakType.PHRASE: 0.0,
-    qmod.BreakType.SOFT_PHRASE: 0.0,
-    qmod.BreakType.WORD: 0.1,
-    qmod.BreakType.PART: 0.2,
-    qmod.BreakType.TOKEN: 0.4
+    qmod.BREAK_START: 0.0,
+    qmod.BREAK_END: 0.0,
+    qmod.BREAK_PHRASE: 0.0,
+    qmod.BREAK_SOFT_PHRASE: 0.0,
+    qmod.BREAK_WORD: 0.1,
+    qmod.BREAK_PART: 0.2,
+    qmod.BREAK_TOKEN: 0.4
 }

 TypedRangeSeq = List[TypedRange]
@@ -205,7 +205,7 @@ class _TokenSequence:
            new_penalty = 0.0
        else:
            last = self.seq[-1]
-            if btype != qmod.BreakType.PHRASE and last.ttype == ttype:
+            if btype != qmod.BREAK_PHRASE and last.ttype == ttype:
                # extend the existing range
                newseq = self.seq[:-1] + [TypedRange(ttype, last.trange.replace_end(end_pos))]
                new_penalty = 0.0