mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-14 10:27:57 +00:00
add SOFT_PHRASE break and enable parsing
Also enables parsing of PART breaks.
This commit is contained in:
@@ -433,6 +433,7 @@ PENALTY_WORDCHANGE = {
|
||||
BreakType.START: 0.0,
|
||||
BreakType.END: 0.0,
|
||||
BreakType.PHRASE: 0.0,
|
||||
BreakType.SOFT_PHRASE: 0.0,
|
||||
BreakType.WORD: 0.1,
|
||||
BreakType.PART: 0.2,
|
||||
BreakType.TOKEN: 0.4
|
||||
|
||||
@@ -11,6 +11,8 @@ from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
|
||||
from collections import defaultdict
|
||||
import dataclasses
|
||||
import difflib
|
||||
import re
|
||||
from itertools import zip_longest
|
||||
|
||||
from icu import Transliterator
|
||||
|
||||
@@ -242,16 +244,22 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
|
||||
wordnr = 0
|
||||
for phrase in query.source:
|
||||
query.nodes[-1].ptype = phrase.ptype
|
||||
for word in phrase.text.split(' '):
|
||||
phrase_split = re.split('([ :-])', phrase.text)
|
||||
# The zip construct will give us the pairs of word/break from
|
||||
# the regular expression split. As the split array ends on the
|
||||
# final word, we simply use the fillvalue to even out the list and
|
||||
# add the phrase break at the end.
|
||||
for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue=','):
|
||||
if not word:
|
||||
continue
|
||||
trans = self.transliterator.transliterate(word)
|
||||
if trans:
|
||||
for term in trans.split(' '):
|
||||
if term:
|
||||
parts.append(QueryPart(term, word, wordnr))
|
||||
query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
|
||||
query.nodes[-1].btype = qmod.BreakType.WORD
|
||||
query.nodes[-1].btype = qmod.BreakType(breakchar)
|
||||
wordnr += 1
|
||||
query.nodes[-1].btype = qmod.BreakType.PHRASE
|
||||
|
||||
for word, wrange in yield_words(parts, phrase_start):
|
||||
words[word].append(wrange)
|
||||
|
||||
@@ -21,7 +21,13 @@ class BreakType(enum.Enum):
|
||||
END = '>'
|
||||
""" End of the query. """
|
||||
PHRASE = ','
|
||||
""" Break between two phrases. """
|
||||
""" Hard break between two phrases. Address parts cannot cross hard
|
||||
phrase boundaries."""
|
||||
SOFT_PHRASE = ':'
|
||||
""" Likely break between two phrases. Address parts should not cross soft
|
||||
phrase boundaries. Soft breaks can be inserted by a preprocessor
|
||||
that is analysing the input string.
|
||||
"""
|
||||
WORD = ' '
|
||||
""" Break between words. """
|
||||
PART = '-'
|
||||
|
||||
@@ -27,6 +27,7 @@ PENALTY_TOKENCHANGE = {
|
||||
qmod.BreakType.START: 0.0,
|
||||
qmod.BreakType.END: 0.0,
|
||||
qmod.BreakType.PHRASE: 0.0,
|
||||
qmod.BreakType.SOFT_PHRASE: 0.0,
|
||||
qmod.BreakType.WORD: 0.1,
|
||||
qmod.BreakType.PART: 0.2,
|
||||
qmod.BreakType.TOKEN: 0.4
|
||||
|
||||
Reference in New Issue
Block a user