mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
Split the normalized word when its transliteration is split up
This commit is contained in:
@@ -173,7 +173,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
|
|||||||
return query
|
return query
|
||||||
|
|
||||||
self.split_query(query)
|
self.split_query(query)
|
||||||
log().var_dump('Transliterated query', lambda: query.get_transliterated_query())
|
log().var_dump('Transliterated query',
|
||||||
|
lambda: ''.join(f"{n.term_lookup}{n.btype}" for n in query.nodes)
|
||||||
|
+ ' / '
|
||||||
|
+ ''.join(f"{n.term_normalized}{n.btype}" for n in query.nodes))
|
||||||
words = query.extract_words()
|
words = query.extract_words()
|
||||||
|
|
||||||
for row in await self.lookup_in_db(list(words.keys())):
|
for row in await self.lookup_in_db(list(words.keys())):
|
||||||
@@ -216,6 +219,34 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
|
|||||||
"""
|
"""
|
||||||
return cast(str, self.normalizer.transliterate(text)).strip('-: ')
|
return cast(str, self.normalizer.transliterate(text)).strip('-: ')
|
||||||
|
|
||||||
|
def split_transliteration(self, trans: str, word: str) -> list[tuple[str, str]]:
    """ Split the given transliteration string into sub-words and
        return them together with the original part of the word.

        `trans` is split on single blanks. When that yields only one
        piece, the result is the single pair `(trans, word)`. Otherwise
        `word` is consumed letter by letter: the collected letters are
        transliterated with `self.transliterator` and, as soon as the
        right-stripped result equals the next non-empty sub-word, the pair
        `(sub-word, collected letters)` is emitted and collection restarts.

        Letters left over at the end of `word` are paired with the
        sub-word that was being matched at that point. Collection stops
        once the last sub-word was matched.

        Returns an empty list when `trans` contains blanks only.
    """
    subwords = trans.split(' ')

    if len(subwords) == 1:
        return [(trans, word)]

    tlist: list[tuple[str, str]] = []
    # Consecutive or surrounding blanks produce empty pieces; skip them.
    titer = filter(None, subwords)
    current_trans: Optional[str] = next(titer, None)
    if current_trans is None:
        # The transliteration consisted of blanks only, so there is no
        # term to hand back. Without the default, next() would raise
        # StopIteration here.
        return tlist

    current_word = ''
    for letter in word:
        current_word += letter
        # The transliteration of a partial word may carry trailing blanks
        # which are not part of the sub-word.
        if self.transliterator.transliterate(current_word).rstrip() == current_trans:
            tlist.append((current_trans, current_word))
            current_trans = next(titer, None)
            if current_trans is None:
                return tlist
            current_word = ''

    # The word ended before the current sub-word could be matched exactly.
    if current_word:
        tlist.append((current_trans, current_word))

    return tlist
|
||||||
|
|
||||||
def split_query(self, query: qmod.QueryStruct) -> None:
|
def split_query(self, query: qmod.QueryStruct) -> None:
|
||||||
""" Transliterate the phrases and split them into tokens.
|
""" Transliterate the phrases and split them into tokens.
|
||||||
"""
|
"""
|
||||||
@@ -229,11 +260,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
|
|||||||
for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue=','):
|
for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue=','):
|
||||||
if not word:
|
if not word:
|
||||||
continue
|
continue
|
||||||
trans = self.transliterator.transliterate(word)
|
if trans := self.transliterator.transliterate(word):
|
||||||
if trans:
|
for term, term_word in self.split_transliteration(trans, word):
|
||||||
for term in trans.split(' '):
|
|
||||||
if term:
|
if term:
|
||||||
query.add_node(qmod.BREAK_TOKEN, phrase.ptype, term, word)
|
query.add_node(qmod.BREAK_TOKEN, phrase.ptype, term, term_word)
|
||||||
query.nodes[-1].btype = breakchar
|
query.nodes[-1].btype = breakchar
|
||||||
|
|
||||||
query.nodes[-1].btype = qmod.BREAK_END
|
query.nodes[-1].btype = qmod.BREAK_END
|
||||||
@@ -291,11 +321,8 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
|
|||||||
token.penalty += penalty
|
token.penalty += penalty
|
||||||
|
|
||||||
# rerank tokens against the normalized form
|
# rerank tokens against the normalized form
|
||||||
norm = ' '.join(n.term_normalized for n in query.nodes[start + 1:end + 1]
|
norm = ''.join(f"{n.term_normalized}{'' if n.btype == qmod.BREAK_TOKEN else ' '}"
|
||||||
if n.btype != qmod.BREAK_TOKEN)
|
for n in query.nodes[start + 1:end + 1]).strip()
|
||||||
if not norm:
|
|
||||||
# Can happen when the token only covers a partial term
|
|
||||||
norm = query.nodes[start + 1].term_normalized
|
|
||||||
for ttype, tokens in tlist.items():
|
for ttype, tokens in tlist.items():
|
||||||
if ttype != qmod.TOKEN_COUNTRY:
|
if ttype != qmod.TOKEN_COUNTRY:
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
|
|||||||
@@ -199,8 +199,6 @@ class QueryNode:
|
|||||||
"""
|
"""
|
||||||
term_normalized: str
|
term_normalized: str
|
||||||
""" Normalised form of term ending at this node.
|
""" Normalised form of term ending at this node.
|
||||||
When the token resulted from a split during transliteration,
|
|
||||||
then this string contains the complete source term.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
starting: List[TokenList] = dataclasses.field(default_factory=list)
|
starting: List[TokenList] = dataclasses.field(default_factory=list)
|
||||||
@@ -392,14 +390,6 @@ class QueryStruct:
|
|||||||
return f"[{tlist.ttype}]{t.lookup_word}"
|
return f"[{tlist.ttype}]{t.lookup_word}"
|
||||||
return 'None'
|
return 'None'
|
||||||
|
|
||||||
def get_transliterated_query(self) -> str:
    """ Return a string representation of the transliterated query
        with the character representation of the different break types.

        For every node, in order, the lookup term is written first and
        the node's break type directly after it.

        For debugging purposes only.
    """
    pieces = []
    for node in self.nodes:
        pieces.append(node.term_lookup)
        pieces.append(node.btype)
    return ''.join(pieces)
|
|
||||||
|
|
||||||
def extract_words(self, start: int = 0,
|
def extract_words(self, start: int = 0,
|
||||||
endpos: Optional[int] = None) -> Dict[str, List[TokenRange]]:
|
endpos: Optional[int] = None) -> Dict[str, List[TokenRange]]:
|
||||||
""" Add all combinations of words that can be formed from the terms
|
""" Add all combinations of words that can be formed from the terms
|
||||||
|
|||||||
Reference in New Issue
Block a user