mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-16 15:47:58 +00:00
add SOFT_PHRASE break and enable parsing
Also enables parsing of PART breaks.
This commit is contained in:
@@ -433,6 +433,7 @@ PENALTY_WORDCHANGE = {
|
|||||||
BreakType.START: 0.0,
|
BreakType.START: 0.0,
|
||||||
BreakType.END: 0.0,
|
BreakType.END: 0.0,
|
||||||
BreakType.PHRASE: 0.0,
|
BreakType.PHRASE: 0.0,
|
||||||
|
BreakType.SOFT_PHRASE: 0.0,
|
||||||
BreakType.WORD: 0.1,
|
BreakType.WORD: 0.1,
|
||||||
BreakType.PART: 0.2,
|
BreakType.PART: 0.2,
|
||||||
BreakType.TOKEN: 0.4
|
BreakType.TOKEN: 0.4
|
||||||
|
|||||||
@@ -11,6 +11,8 @@ from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
|
|||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import dataclasses
|
import dataclasses
|
||||||
import difflib
|
import difflib
|
||||||
|
import re
|
||||||
|
from itertools import zip_longest
|
||||||
|
|
||||||
from icu import Transliterator
|
from icu import Transliterator
|
||||||
|
|
||||||
@@ -242,16 +244,22 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
|
|||||||
wordnr = 0
|
wordnr = 0
|
||||||
for phrase in query.source:
|
for phrase in query.source:
|
||||||
query.nodes[-1].ptype = phrase.ptype
|
query.nodes[-1].ptype = phrase.ptype
|
||||||
for word in phrase.text.split(' '):
|
phrase_split = re.split('([ :-])', phrase.text)
|
||||||
|
# The zip construct will give us the pairs of word/break from
|
||||||
|
# the regular expression split. As the split array ends on the
|
||||||
|
# final word, we simply use the fillvalue to even out the list and
|
||||||
|
# add the phrase break at the end.
|
||||||
|
for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue=','):
|
||||||
|
if not word:
|
||||||
|
continue
|
||||||
trans = self.transliterator.transliterate(word)
|
trans = self.transliterator.transliterate(word)
|
||||||
if trans:
|
if trans:
|
||||||
for term in trans.split(' '):
|
for term in trans.split(' '):
|
||||||
if term:
|
if term:
|
||||||
parts.append(QueryPart(term, word, wordnr))
|
parts.append(QueryPart(term, word, wordnr))
|
||||||
query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
|
query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
|
||||||
query.nodes[-1].btype = qmod.BreakType.WORD
|
query.nodes[-1].btype = qmod.BreakType(breakchar)
|
||||||
wordnr += 1
|
wordnr += 1
|
||||||
query.nodes[-1].btype = qmod.BreakType.PHRASE
|
|
||||||
|
|
||||||
for word, wrange in yield_words(parts, phrase_start):
|
for word, wrange in yield_words(parts, phrase_start):
|
||||||
words[word].append(wrange)
|
words[word].append(wrange)
|
||||||
|
|||||||
@@ -21,7 +21,13 @@ class BreakType(enum.Enum):
|
|||||||
END = '>'
|
END = '>'
|
||||||
""" End of the query. """
|
""" End of the query. """
|
||||||
PHRASE = ','
|
PHRASE = ','
|
||||||
""" Break between two phrases. """
|
""" Hard break between two phrases. Address parts cannot cross hard
|
||||||
|
phrase boundaries."""
|
||||||
|
SOFT_PHRASE = ':'
|
||||||
|
""" Likely break between two phrases. Address parts should not cross soft
|
||||||
|
phrase boundaries. Soft breaks can be inserted by a preprocessor
|
||||||
|
that is analysing the input string.
|
||||||
|
"""
|
||||||
WORD = ' '
|
WORD = ' '
|
||||||
""" Break between words. """
|
""" Break between words. """
|
||||||
PART = '-'
|
PART = '-'
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ PENALTY_TOKENCHANGE = {
|
|||||||
qmod.BreakType.START: 0.0,
|
qmod.BreakType.START: 0.0,
|
||||||
qmod.BreakType.END: 0.0,
|
qmod.BreakType.END: 0.0,
|
||||||
qmod.BreakType.PHRASE: 0.0,
|
qmod.BreakType.PHRASE: 0.0,
|
||||||
|
qmod.BreakType.SOFT_PHRASE: 0.0,
|
||||||
qmod.BreakType.WORD: 0.1,
|
qmod.BreakType.WORD: 0.1,
|
||||||
qmod.BreakType.PART: 0.2,
|
qmod.BreakType.PART: 0.2,
|
||||||
qmod.BreakType.TOKEN: 0.4
|
qmod.BreakType.TOKEN: 0.4
|
||||||
|
|||||||
Reference in New Issue
Block a user