Add SOFT_PHRASE break type and enable parsing of it

Also enables parsing of PART breaks.
This commit is contained in:
Sarah Hoffmann
2025-01-06 17:10:24 +01:00
parent 14ecfc7834
commit 499110f549
4 changed files with 20 additions and 4 deletions

View File

@@ -433,6 +433,7 @@ PENALTY_WORDCHANGE = {
BreakType.START: 0.0, BreakType.START: 0.0,
BreakType.END: 0.0, BreakType.END: 0.0,
BreakType.PHRASE: 0.0, BreakType.PHRASE: 0.0,
BreakType.SOFT_PHRASE: 0.0,
BreakType.WORD: 0.1, BreakType.WORD: 0.1,
BreakType.PART: 0.2, BreakType.PART: 0.2,
BreakType.TOKEN: 0.4 BreakType.TOKEN: 0.4

View File

@@ -11,6 +11,8 @@ from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
from collections import defaultdict from collections import defaultdict
import dataclasses import dataclasses
import difflib import difflib
import re
from itertools import zip_longest
from icu import Transliterator from icu import Transliterator
@@ -242,16 +244,22 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
wordnr = 0 wordnr = 0
for phrase in query.source: for phrase in query.source:
query.nodes[-1].ptype = phrase.ptype query.nodes[-1].ptype = phrase.ptype
for word in phrase.text.split(' '): phrase_split = re.split('([ :-])', phrase.text)
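# re.split() with a capturing group returns words and break characters
# alternately: [word, break, word, ..., word]. Zipping the same iterator
# with itself groups that list into (word, break) pairs. Because the
# list always ends on a word, the fillvalue ',' supplies the missing
# last break character, which marks the phrase break at the end.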
for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue=','):
if not word:
continue
trans = self.transliterator.transliterate(word) trans = self.transliterator.transliterate(word)
if trans: if trans:
for term in trans.split(' '): for term in trans.split(' '):
if term: if term:
parts.append(QueryPart(term, word, wordnr)) parts.append(QueryPart(term, word, wordnr))
query.add_node(qmod.BreakType.TOKEN, phrase.ptype) query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
query.nodes[-1].btype = qmod.BreakType.WORD query.nodes[-1].btype = qmod.BreakType(breakchar)
wordnr += 1 wordnr += 1
query.nodes[-1].btype = qmod.BreakType.PHRASE
for word, wrange in yield_words(parts, phrase_start): for word, wrange in yield_words(parts, phrase_start):
words[word].append(wrange) words[word].append(wrange)

View File

@@ -21,7 +21,13 @@ class BreakType(enum.Enum):
END = '>' END = '>'
""" End of the query. """ """ End of the query. """
PHRASE = ',' PHRASE = ','
""" Break between two phrases. """ """ Hard break between two phrases. Address parts cannot cross hard
phrase boundaries."""
SOFT_PHRASE = ':'
""" Likely break between two phrases. Address parts should not cross soft
phrase boundaries. Soft breaks can be inserted by a preprocessor
that is analysing the input string.
"""
WORD = ' ' WORD = ' '
""" Break between words. """ """ Break between words. """
PART = '-' PART = '-'

View File

@@ -27,6 +27,7 @@ PENALTY_TOKENCHANGE = {
qmod.BreakType.START: 0.0, qmod.BreakType.START: 0.0,
qmod.BreakType.END: 0.0, qmod.BreakType.END: 0.0,
qmod.BreakType.PHRASE: 0.0, qmod.BreakType.PHRASE: 0.0,
qmod.BreakType.SOFT_PHRASE: 0.0,
qmod.BreakType.WORD: 0.1, qmod.BreakType.WORD: 0.1,
qmod.BreakType.PART: 0.2, qmod.BreakType.PART: 0.2,
qmod.BreakType.TOKEN: 0.4 qmod.BreakType.TOKEN: 0.4