add SOFT_PHRASE break and enable parsing

Also enables parsing of PART breaks.
This commit is contained in:
Sarah Hoffmann
2025-01-06 17:10:24 +01:00
parent 14ecfc7834
commit 499110f549
4 changed files with 20 additions and 4 deletions

View File

@@ -433,6 +433,7 @@ PENALTY_WORDCHANGE = {
BreakType.START: 0.0,
BreakType.END: 0.0,
BreakType.PHRASE: 0.0,
BreakType.SOFT_PHRASE: 0.0,
BreakType.WORD: 0.1,
BreakType.PART: 0.2,
BreakType.TOKEN: 0.4

View File

@@ -11,6 +11,8 @@ from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
from collections import defaultdict
import dataclasses
import difflib
import re
from itertools import zip_longest
from icu import Transliterator
@@ -242,16 +244,22 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
wordnr = 0
for phrase in query.source:
query.nodes[-1].ptype = phrase.ptype
for word in phrase.text.split(' '):
phrase_split = re.split('([ :-])', phrase.text)
# The zip construct will give us the pairs of word/break from
# the regular expression split. As the split array ends on the
# final word, we simply use the fillvalue to even out the list and
# add the phrase break at the end.
for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue=','):
if not word:
continue
trans = self.transliterator.transliterate(word)
if trans:
for term in trans.split(' '):
if term:
parts.append(QueryPart(term, word, wordnr))
query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
query.nodes[-1].btype = qmod.BreakType.WORD
query.nodes[-1].btype = qmod.BreakType(breakchar)
wordnr += 1
query.nodes[-1].btype = qmod.BreakType.PHRASE
for word, wrange in yield_words(parts, phrase_start):
words[word].append(wrange)

View File

@@ -21,7 +21,13 @@ class BreakType(enum.Enum):
END = '>'
""" End of the query. """
PHRASE = ','
""" Break between two phrases. """
""" Hard break between two phrases. Address parts cannot cross hard
phrase boundaries."""
SOFT_PHRASE = ':'
""" Likely break between two phrases. Address parts should not cross soft
phrase boundaries. Soft breaks can be inserted by a preprocessor
that is analysing the input string.
"""
WORD = ' '
""" Break between words. """
PART = '-'

View File

@@ -27,6 +27,7 @@ PENALTY_TOKENCHANGE = {
qmod.BreakType.START: 0.0,
qmod.BreakType.END: 0.0,
qmod.BreakType.PHRASE: 0.0,
qmod.BreakType.SOFT_PHRASE: 0.0,
qmod.BreakType.WORD: 0.1,
qmod.BreakType.PART: 0.2,
qmod.BreakType.TOKEN: 0.4