Compare commits

15 Commits

Author SHA1 Message Date
Sarah Hoffmann
621d8e785b Merge pull request #3779 from lonvia/fix-zero-devision-direction
Fix direction factor computation on empty strings
2025-07-11 14:51:00 +02:00
Sarah Hoffmann
830307484b Merge pull request #3777 from lonvia/harmonize-transition-penalties
Clean up word transition penalty assignment for searches
2025-07-11 14:17:48 +02:00
Sarah Hoffmann
5d6967a1d0 Merge pull request #3778 from lonvia/remove-log-db-setting
Remove defaults and documentations for LOG_DB setting
2025-07-11 14:17:24 +02:00
Sarah Hoffmann
26903aec0b add BDD test for empty queries 2025-07-11 14:16:48 +02:00
Sarah Hoffmann
c39183e3a5 remove any references to website setup or refresh
It no longer exists.
2025-07-11 11:51:49 +02:00
Sarah Hoffmann
21ef3be433 fix direction factor computation on empty strings 2025-07-11 11:25:14 +02:00
Sarah Hoffmann
99562a197e remove LOG_DB setting, not implemented anymore 2025-07-11 11:15:41 +02:00
Sarah Hoffmann
fe30663b21 remove penalty from TokenRanges
The parameter is no longer needed.
2025-07-11 11:01:22 +02:00
Sarah Hoffmann
73ee17af95 adapt tests for new function signatures 2025-07-11 11:01:22 +02:00
Sarah Hoffmann
b9252cc348 reduce maximum number of SQL queries per search 2025-07-11 11:01:22 +02:00
Sarah Hoffmann
71025f3f43 fix order of address rankings preferring longest words 2025-07-11 11:01:21 +02:00
Sarah Hoffmann
e4b671f8b1 reinstate penalty for partial only matches 2025-07-11 11:01:21 +02:00
Sarah Hoffmann
7ebd121abc give word break slight advantage towards continuation
prefers longer words
2025-07-11 11:01:21 +02:00
Sarah Hoffmann
4634ad0720 rebalance word transition penalties 2025-07-11 11:01:21 +02:00
Sarah Hoffmann
4a9253a0a9 simplify QueryNode penalty and initial assignment 2025-07-11 11:01:09 +02:00
11 changed files with 105 additions and 130 deletions

View File

@@ -641,24 +641,6 @@ See also [NOMINATIM_DEFAULT_LANGUAGE](#nominatim_default_language).
 
 ### Logging Settings
 
-#### NOMINATIM_LOG_DB
-
-| Summary | |
-| -------------- | --------------------------------------------------- |
-| **Description:** | Log requests into the database |
-| **Format:** | boolean |
-| **Default:** | no |
-| **After Changes:** | run `nominatim refresh --website` |
-
-Enable logging requests into a database table with this setting. The logs
-can be found in the table `new_query_log`.
-
-When using this logging method, it is advisable to set up a job that
-regularly clears out old logging information. Nominatim will not do that
-on its own.
-
-Can be used as the same time as NOMINATIM_LOG_FILE.
-
 #### NOMINATIM_LOG_FILE
 
 | Summary | |
@@ -682,8 +664,6 @@ given in seconds and includes the entire time the query was queued and executed
 in the frontend.
 type contains the name of the endpoint used.
 
-Can be used as the same time as NOMINATIM_LOG_DB.
-
 #### NOMINATIM_DEBUG_SQL
 
 | Summary | |

View File

@@ -5,7 +5,6 @@
 # Database connection string.
 # Add host, port, user etc through additional semicolon-separated attributes.
 # e.g. ;host=...;port=...;user=...;password=...
-# Changing this variable requires to run 'nominatim refresh --website'.
 NOMINATIM_DATABASE_DSN="pgsql:dbname=nominatim"
 
 # Database web user.
@@ -36,11 +35,11 @@ NOMINATIM_TOKENIZER_CONFIG=
 
 # Search in the Tiger house number data for the US.
 # Note: The tables must already exist or queries will throw errors.
-# Changing this value requires to run ./utils/setup --create-functions --setup-website.
+# Changing this value requires to run ./utils/setup --create-functions.
 NOMINATIM_USE_US_TIGER_DATA=no
 
 # Search in the auxiliary housenumber table.
-# Changing this value requires to run ./utils/setup --create-functions --setup-website.
+# Changing this value requires to run ./utils/setup --create-functions.
 NOMINATIM_USE_AUX_LOCATION_DATA=no
 
 # Proxy settings
@@ -143,8 +142,7 @@ NOMINATIM_REPLICATION_RECHECK_INTERVAL=60
 
 ### API settings
 #
-# The following settings configure the API responses. You must rerun
-# 'nominatim refresh --website' after changing any of them.
+# The following settings configure the API responses.
 
 # Send permissive CORS access headers.
 # When enabled, send CORS headers to allow access to everybody.
@@ -202,13 +200,7 @@ NOMINATIM_OUTPUT_NAMES=name:XX,name,brand,official_name:XX,short_name:XX,officia
 
 ### Log settings
 #
 # The following options allow to enable logging of API requests.
-# You must rerun 'nominatim refresh --website' after changing any of them.
 #
-# Enable logging of requests into the DB.
-# The request will be logged into the new_query_log table.
-# You should set up a cron job that regularly clears out this table.
-NOMINATIM_LOG_DB=no
-
 # Enable logging of requests into a file.
 # To enable logging set this setting to the file to log to.
 NOMINATIM_LOG_FILE=

View File

@@ -282,10 +282,14 @@ class SearchBuilder:
         """ Create a ranking expression for a name term in the given range.
         """
         name_fulls = self.query.get_tokens(trange, qmod.TOKEN_WORD)
-        ranks = [dbf.RankedTokens(t.penalty, [t.token]) for t in name_fulls]
+        full_word_penalty = self.query.get_in_word_penalty(trange)
+        ranks = [dbf.RankedTokens(t.penalty + full_word_penalty, [t.token])
+                 for t in name_fulls]
         ranks.sort(key=lambda r: r.penalty)
         # Fallback, sum of penalty for partials
         default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
+        default += sum(n.word_break_penalty
+                       for n in self.query.nodes[trange.start + 1:trange.end])
         return dbf.FieldRanking(db_field, default, ranks)
 
     def get_addr_ranking(self, trange: qmod.TokenRange) -> dbf.FieldRanking:
@@ -297,14 +301,14 @@
         ranks: List[dbf.RankedTokens] = []
 
         while todo:
-            neglen, pos, rank = heapq.heappop(todo)
+            _, pos, rank = heapq.heappop(todo)
 
             # partial node
             partial = self.query.nodes[pos].partial
             if partial is not None:
                 if pos + 1 < trange.end:
                     penalty = rank.penalty + partial.penalty \
-                              + PENALTY_WORDCHANGE[self.query.nodes[pos + 1].btype]
-                    heapq.heappush(todo, (neglen - 1, pos + 1,
+                              + self.query.nodes[pos + 1].word_break_penalty
+                    heapq.heappush(todo, (-(pos + 1), pos + 1,
                                           dbf.RankedTokens(penalty, rank.tokens)))
                 else:
                     ranks.append(dbf.RankedTokens(rank.penalty + partial.penalty,
@@ -313,9 +317,11 @@
             for tlist in self.query.nodes[pos].starting:
                 if tlist.ttype == qmod.TOKEN_WORD:
                     if tlist.end < trange.end:
-                        chgpenalty = PENALTY_WORDCHANGE[self.query.nodes[tlist.end].btype]
+                        chgpenalty = self.query.nodes[tlist.end].word_break_penalty \
+                                     + self.query.get_in_word_penalty(
+                                           qmod.TokenRange(pos, tlist.end))
                         for t in tlist.tokens:
-                            heapq.heappush(todo, (neglen - 1, tlist.end,
+                            heapq.heappush(todo, (-tlist.end, tlist.end,
                                                   rank.with_token(t, chgpenalty)))
                     elif tlist.end == trange.end:
                         ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)
@@ -324,6 +330,8 @@
                 # Too many variants, bail out and only add
                 # Worst-case Fallback: sum of penalty of partials
                 default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
+                default += sum(n.word_break_penalty
+                               for n in self.query.nodes[trange.start + 1:trange.end])
                 ranks.append(dbf.RankedTokens(rank.penalty + default, []))
                 # Bail out of outer loop
                 break
@@ -346,6 +354,7 @@
             if not tokens:
                 return None
             sdata.set_strings('countries', tokens)
+            sdata.penalty += self.query.get_in_word_penalty(assignment.country)
         elif self.details.countries:
             sdata.countries = dbf.WeightedStrings(self.details.countries,
                                                   [0.0] * len(self.details.countries))
@@ -353,28 +362,23 @@
             sdata.set_strings('housenumbers',
                               self.query.get_tokens(assignment.housenumber,
                                                     qmod.TOKEN_HOUSENUMBER))
+            sdata.penalty += self.query.get_in_word_penalty(assignment.housenumber)
         if assignment.postcode:
             sdata.set_strings('postcodes',
                               self.query.get_tokens(assignment.postcode,
                                                     qmod.TOKEN_POSTCODE))
+            sdata.penalty += self.query.get_in_word_penalty(assignment.postcode)
         if assignment.qualifier:
             tokens = self.get_qualifier_tokens(assignment.qualifier)
             if not tokens:
                 return None
             sdata.set_qualifiers(tokens)
+            sdata.penalty += self.query.get_in_word_penalty(assignment.qualifier)
         elif self.details.categories:
             sdata.qualifiers = dbf.WeightedCategories(self.details.categories,
                                                       [0.0] * len(self.details.categories))
 
         if assignment.address:
-            if not assignment.name and assignment.housenumber:
-                # housenumber search: the first item needs to be handled like
-                # a name in ranking or penalties are not comparable with
-                # normal searches.
-                sdata.set_ranking([self.get_name_ranking(assignment.address[0],
-                                                         db_field='nameaddress_vector')]
-                                  + [self.get_addr_ranking(r) for r in assignment.address[1:]])
-            else:
-                sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
+            sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
         else:
             sdata.rankings = []
@@ -421,14 +425,3 @@
                 return dbf.WeightedCategories(list(tokens.keys()), list(tokens.values()))
 
         return None
-
-
-PENALTY_WORDCHANGE = {
-    qmod.BREAK_START: 0.0,
-    qmod.BREAK_END: 0.0,
-    qmod.BREAK_PHRASE: 0.0,
-    qmod.BREAK_SOFT_PHRASE: 0.0,
-    qmod.BREAK_WORD: 0.1,
-    qmod.BREAK_PART: 0.2,
-    qmod.BREAK_TOKEN: 0.4
-}
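
The reworked fallback ranking is easiest to see in isolation. The sketch below only mirrors the two-line `default` computation added above; the `Node` stand-in and all numbers are made up for illustration and are not part of the SearchBuilder API.

```python
from dataclasses import dataclass
from typing import List


@dataclass
class Node:
    """ Stand-in for QueryNode: only the signed break penalty matters here. """
    penalty: float

    @property
    def word_break_penalty(self) -> float:
        # cost of ending a word at this node (negative penalties cost nothing)
        return max(0, self.penalty)


def fallback_default(partial_penalties: List[float], nodes: List[Node],
                     start: int, end: int) -> float:
    """ Worst-case penalty for a token range: penalties of the partial
        tokens, a flat 0.2, plus the word-break penalty of every inner node.
    """
    default = sum(partial_penalties) + 0.2
    default += sum(n.word_break_penalty for n in nodes[start + 1:end])
    return default


# hypothetical three-term range covering nodes 0..3 with two word breaks inside
nodes = [Node(-0.5), Node(0.1), Node(0.1), Node(-0.5)]
print(round(fallback_default([0.3, 0.05, 0.3], nodes, start=0, end=3), 2))  # 1.05
```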

View File

@@ -83,7 +83,7 @@ class ForwardGeocoder:
         min_ranking = searches[0].penalty + 2.0
         prev_penalty = 0.0
         for i, search in enumerate(searches):
-            if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 20):
+            if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 15):
                 break
             log().table_dump(f"{i + 1}. Search", _dump_searches([search], query))
         log().var_dump('Params', self.params)

View File

@@ -37,14 +37,14 @@ DB_TO_TOKEN_TYPE = {
     'C': qmod.TOKEN_COUNTRY
 }
 
-PENALTY_IN_TOKEN_BREAK = {
-    qmod.BREAK_START: 0.5,
-    qmod.BREAK_END: 0.5,
-    qmod.BREAK_PHRASE: 0.5,
-    qmod.BREAK_SOFT_PHRASE: 0.5,
+PENALTY_BREAK = {
+    qmod.BREAK_START: -0.5,
+    qmod.BREAK_END: -0.5,
+    qmod.BREAK_PHRASE: -0.5,
+    qmod.BREAK_SOFT_PHRASE: -0.5,
     qmod.BREAK_WORD: 0.1,
-    qmod.BREAK_PART: 0.0,
-    qmod.BREAK_TOKEN: 0.0
+    qmod.BREAK_PART: 0.2,
+    qmod.BREAK_TOKEN: 0.4
 }
@@ -78,13 +78,13 @@ class ICUToken(qmod.Token):
         self.penalty += (distance/len(self.lookup_word))
 
     @staticmethod
-    def from_db_row(row: SaRow, base_penalty: float = 0.0) -> 'ICUToken':
+    def from_db_row(row: SaRow) -> 'ICUToken':
         """ Create a ICUToken from the row of the word table.
         """
         count = 1 if row.info is None else row.info.get('count', 1)
         addr_count = 1 if row.info is None else row.info.get('addr_count', 1)
 
-        penalty = base_penalty
+        penalty = 0.0
         if row.type == 'w':
             penalty += 0.3
         elif row.type == 'W':
@@ -174,11 +174,14 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         self.split_query(query)
         log().var_dump('Transliterated query', lambda: query.get_transliterated_query())
 
-        words = query.extract_words(base_penalty=PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD])
+        words = query.extract_words()
 
         for row in await self.lookup_in_db(list(words.keys())):
             for trange in words[row.word_token]:
-                token = ICUToken.from_db_row(row, trange.penalty or 0.0)
+                # Create a new token for each position because the token
+                # penalty can vary depending on the position in the query.
+                # (See rerank_tokens() below.)
+                token = ICUToken.from_db_row(row)
                 if row.type == 'S':
                     if row.info['op'] in ('in', 'near'):
                         if trange.start == 0:
@@ -200,6 +203,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                                                  lookup_word=pc, word_token=term,
                                                  info=None))
 
         self.rerank_tokens(query)
+        self.compute_break_penalties(query)
 
         log().table_dump('Word tokens', _dump_word_tokens(query))
@@ -229,13 +233,10 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
                 if trans:
                     for term in trans.split(' '):
                         if term:
-                            query.add_node(qmod.BREAK_TOKEN, phrase.ptype,
-                                           PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN],
-                                           term, word)
-                    query.nodes[-1].adjust_break(breakchar,
-                                                 PENALTY_IN_TOKEN_BREAK[breakchar])
+                            query.add_node(qmod.BREAK_TOKEN, phrase.ptype, term, word)
+                    query.nodes[-1].btype = breakchar
 
-        query.nodes[-1].adjust_break(qmod.BREAK_END, PENALTY_IN_TOKEN_BREAK[qmod.BREAK_END])
+        query.nodes[-1].btype = qmod.BREAK_END
@@ -300,6 +301,12 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
         for token in tokens:
             cast(ICUToken, token).rematch(norm)
 
+    def compute_break_penalties(self, query: qmod.QueryStruct) -> None:
+        """ Set the break penalties for the nodes in the query.
+        """
+        for node in query.nodes:
+            node.penalty = PENALTY_BREAK[node.btype]
+
 
 def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
     yield ['type', 'from', 'to', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
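
A note on reading the new PENALTY_BREAK values: the sign encodes a preference. Together with the `word_break_penalty`/`word_continuation_penalty` properties added to QueryNode further down, a positive value is the cost of ending a word at that break, while a negative value is the cost of continuing a word across it. A minimal sketch, where the string keys merely stand in for the `qmod.BREAK_*` constants:

```python
# stand-in keys for qmod.BREAK_START, BREAK_END, BREAK_PHRASE, ...
PENALTY_BREAK = {
    'start': -0.5, 'end': -0.5, 'phrase': -0.5, 'soft_phrase': -0.5,
    'word': 0.1, 'part': 0.2, 'token': 0.4,
}

for btype, penalty in PENALTY_BREAK.items():
    word_break = max(0, penalty)      # cost of ending a word at this break
    continuation = max(0, -penalty)   # cost of a word spanning this break
    print(f'{btype:12} break={word_break:.1f}  continuation={continuation:.1f}')

# start/end/phrase/soft_phrase: ending a word is free, spanning it costs 0.5
# word/part/token: spanning is free, ending a word costs 0.1/0.2/0.4
```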

View File

@@ -134,7 +134,6 @@ class TokenRange:
     """
     start: int
     end: int
-    penalty: Optional[float] = None
 
     def __lt__(self, other: 'TokenRange') -> bool:
         return self.end <= other.start
@@ -191,7 +190,9 @@ class QueryNode:
     ptype: PhraseType
 
     penalty: float
-    """ Penalty for the break at this node.
+    """ Penalty for having a word break at this position. The penalty
+        may be negative, when a word break is more likely than continuing
+        the word after the node.
     """
     term_lookup: str
     """ Transliterated term ending at this node.
@@ -212,6 +213,19 @@
         types of tokens spanning over the gap.
     """
 
+    @property
+    def word_break_penalty(self) -> float:
+        """ Penalty to apply when a words ends at this node.
+        """
+        return max(0, self.penalty)
+
+    @property
+    def word_continuation_penalty(self) -> float:
+        """ Penalty to apply when a word continues over this node
+            (i.e. is a multi-term word).
+        """
+        return max(0, -self.penalty)
+
     def name_address_ratio(self) -> float:
         """ Return the propability that the partial token belonging to
             this node forms part of a name (as opposed of part of the address).
@@ -221,12 +235,6 @@
         return self.partial.count / (self.partial.count + self.partial.addr_count)
 
-    def adjust_break(self, btype: BreakType, penalty: float) -> None:
-        """ Change the break type and penalty for this node.
-        """
-        self.btype = btype
-        self.penalty = penalty
-
     def has_tokens(self, end: int, *ttypes: TokenType) -> bool:
         """ Check if there are tokens of the given types ending at the
             given node.
@@ -286,13 +294,12 @@
         return len(self.nodes) - 1
 
     def add_node(self, btype: BreakType, ptype: PhraseType,
-                 break_penalty: float = 0.0,
                  term_lookup: str = '', term_normalized: str = '') -> None:
         """ Append a new break node with the given break type.
             The phrase type denotes the type for any tokens starting
             at the node.
         """
-        self.nodes.append(QueryNode(btype, ptype, break_penalty, term_lookup, term_normalized))
+        self.nodes.append(QueryNode(btype, ptype, 0.0, term_lookup, term_normalized))
 
     def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
         """ Add a token to the query. 'start' and 'end' are the indexes of the
@@ -324,7 +331,7 @@
             of each node.
         """
         n = len(self.nodes) - 1
-        if n == 1 or n >= 50:
+        if n <= 1 or n >= 50:
             self.dir_penalty = 0
         elif n == 2:
             self.dir_penalty = (self.nodes[1].name_address_ratio()
@@ -344,6 +351,13 @@
             assert ttype != TOKEN_PARTIAL
         return self.nodes[trange.start].get_tokens(trange.end, ttype) or []
 
+    def get_in_word_penalty(self, trange: TokenRange) -> float:
+        """ Gets the sum of penalties for all token transitions
+            within the given range.
+        """
+        return sum(n.word_continuation_penalty
+                   for n in self.nodes[trange.start + 1:trange.end])
+
     def iter_partials(self, trange: TokenRange) -> Iterator[Token]:
         """ Iterate over the partial tokens between the given nodes.
             Missing partials are ignored.
@@ -386,17 +400,14 @@
         """
         return ''.join(''.join((n.term_lookup, n.btype)) for n in self.nodes)
 
-    def extract_words(self, base_penalty: float = 0.0,
-                      start: int = 0,
+    def extract_words(self, start: int = 0,
                       endpos: Optional[int] = None) -> Dict[str, List[TokenRange]]:
         """ Add all combinations of words that can be formed from the terms
             between the given start and endnode. The terms are joined with
             spaces for each break. Words can never go across a BREAK_PHRASE.
 
             The functions returns a dictionary of possible words with their
-            position within the query and a penalty. The penalty is computed
-            from the base_penalty plus the penalty for each node the word
-            crosses.
+            position within the query.
         """
         if endpos is None:
             endpos = len(self.nodes)
@@ -405,16 +416,13 @@
         for first, first_node in enumerate(self.nodes[start + 1:endpos], start):
             word = first_node.term_lookup
-            penalty = base_penalty
-            words[word].append(TokenRange(first, first + 1, penalty=penalty))
+            words[word].append(TokenRange(first, first + 1))
             if first_node.btype != BREAK_PHRASE:
-                penalty += first_node.penalty
                 max_last = min(first + 20, endpos)
                 for last, last_node in enumerate(self.nodes[first + 2:max_last], first + 2):
                     word = ' '.join((word, last_node.term_lookup))
-                    words[word].append(TokenRange(first, last, penalty=penalty))
+                    words[word].append(TokenRange(first, last))
                     if last_node.btype == BREAK_PHRASE:
                         break
-                    penalty += last_node.penalty
 
         return words
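
For illustration, here is a self-contained approximation of the slimmed-down `extract_words()`: it now records only word positions (no penalties), joins consecutive terms with spaces, never extends past a phrase break and caps multi-term words at roughly 20 terms. It works on plain `(term, break type)` pairs instead of QueryNode objects and is not the real implementation.

```python
from collections import defaultdict
from typing import Dict, List, Tuple

BREAK_PHRASE = ','   # stand-in for qmod.BREAK_PHRASE


def extract_words(nodes: List[Tuple[str, str]]) -> Dict[str, List[Tuple[int, int]]]:
    """ Return every candidate word with its (start, end) node positions.
        nodes[i] is the (term, break type) of the term ending at node i + 1.
    """
    words: Dict[str, List[Tuple[int, int]]] = defaultdict(list)
    for first, (term, btype) in enumerate(nodes):
        word = term
        words[word].append((first, first + 1))
        if btype == BREAK_PHRASE:
            continue                      # never extend across a phrase break
        for last in range(first + 1, min(first + 20, len(nodes))):
            next_term, next_btype = nodes[last]
            word = ' '.join((word, next_term))
            words[word].append((first, last + 1))
            if next_btype == BREAK_PHRASE:
                break
    return words


# 'rue de' is offered as a multi-term word, but nothing extends past the
# phrase break after 'de', so 'de paris' is not.
print(dict(extract_words([('rue', ' '), ('de', ','), ('paris', '>')])))
```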

View File

@@ -23,16 +23,6 @@ class TypedRange:
     trange: qmod.TokenRange
 
 
-PENALTY_TOKENCHANGE = {
-    qmod.BREAK_START: 0.0,
-    qmod.BREAK_END: 0.0,
-    qmod.BREAK_PHRASE: 0.0,
-    qmod.BREAK_SOFT_PHRASE: 0.0,
-    qmod.BREAK_WORD: 0.1,
-    qmod.BREAK_PART: 0.2,
-    qmod.BREAK_TOKEN: 0.4
-}
-
 TypedRangeSeq = List[TypedRange]
@@ -192,7 +182,7 @@ class _TokenSequence:
         return None
 
     def advance(self, ttype: qmod.TokenType, end_pos: int,
-                btype: qmod.BreakType) -> Optional['_TokenSequence']:
+                force_break: bool, break_penalty: float) -> Optional['_TokenSequence']:
         """ Return a new token sequence state with the given token type
             extended.
         """
@@ -205,7 +195,7 @@
             new_penalty = 0.0
         else:
             last = self.seq[-1]
-            if btype != qmod.BREAK_PHRASE and last.ttype == ttype:
+            if not force_break and last.ttype == ttype:
                 # extend the existing range
                 newseq = self.seq[:-1] + [TypedRange(ttype, last.trange.replace_end(end_pos))]
                 new_penalty = 0.0
@@ -213,7 +203,7 @@
                 # start a new range
                 newseq = list(self.seq) + [TypedRange(ttype,
                                                       qmod.TokenRange(last.trange.end, end_pos))]
-                new_penalty = PENALTY_TOKENCHANGE[btype]
+                new_penalty = break_penalty
 
         return _TokenSequence(newseq, newdir, self.penalty + new_penalty)
@@ -317,7 +307,7 @@
                 name, addr = first.split(i)
                 log().comment(f'split first word = name ({i - first.start})')
                 yield dataclasses.replace(base, name=name, address=[addr] + base.address[1:],
-                                          penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
+                                          penalty=penalty + query.nodes[i].word_break_penalty)
 
     def _get_assignments_address_backward(self, base: TokenAssignment,
                                           query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
@@ -362,7 +352,7 @@
                 addr, name = last.split(i)
                 log().comment(f'split last word = name ({i - last.start})')
                 yield dataclasses.replace(base, name=name, address=base.address[:-1] + [addr],
-                                          penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
+                                          penalty=penalty + query.nodes[i].word_break_penalty)
 
     def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
         """ Yield possible assignments for the current sequence.
@@ -422,12 +412,15 @@ def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment
         for tlist in node.starting:
             yield from _append_state_to_todo(
                 query, todo,
-                state.advance(tlist.ttype, tlist.end, node.btype))
+                state.advance(tlist.ttype, tlist.end,
+                              True, node.word_break_penalty))
 
         if node.partial is not None:
             yield from _append_state_to_todo(
                 query, todo,
-                state.advance(qmod.TOKEN_PARTIAL, state.end_pos + 1, node.btype))
+                state.advance(qmod.TOKEN_PARTIAL, state.end_pos + 1,
+                              node.btype == qmod.BREAK_PHRASE,
+                              node.word_break_penalty))
 
 
 def _append_state_to_todo(query: qmod.QueryStruct, todo: List[_TokenSequence],
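
The core rule of the reworked `advance()` condenses to a few lines: when the token type is unchanged and no break is forced, the current range is extended for free; otherwise a new range is opened and the break penalty of the node is paid. The tuple-based state below is a simplification for illustration and omits the direction handling and validity checks of `_TokenSequence`.

```python
from typing import List, Tuple

# state: list of (token type, start, end) ranges plus the accumulated penalty
SeqState = Tuple[List[Tuple[str, int, int]], float]


def advance(state: SeqState, ttype: str, end_pos: int,
            force_break: bool, break_penalty: float) -> SeqState:
    ranges, penalty = state
    if not ranges:
        return [(ttype, 0, end_pos)], penalty
    last_ttype, last_start, last_end = ranges[-1]
    if not force_break and last_ttype == ttype:
        # extend the existing range, no extra penalty
        return ranges[:-1] + [(ttype, last_start, end_pos)], penalty
    # start a new range and pay the word-break penalty of the node
    return ranges + [(ttype, last_end, end_pos)], penalty + break_penalty


state: SeqState = ([], 0.0)
state = advance(state, 'partial', 1, False, 0.0)
state = advance(state, 'partial', 2, False, 0.1)   # same type, no forced break: free
state = advance(state, 'postcode', 3, True, 0.1)   # new range: +0.1
print(state)   # ([('partial', 0, 2), ('postcode', 2, 3)], 0.1)
```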

View File

@@ -21,6 +21,7 @@ Feature: Simple Tests
             | %#$@*&l;der#$! |
             | 234.23.14.5 |
             | aussenstelle universitat lichtenstein wachterhaus aussenstelle universitat lichtenstein wachterhaus aussenstelle universitat lichtenstein wachterhaus aussenstelle universitat lichtenstein wachterhaus |
+            | . |
 
     Scenario: Empty XML search
         When sending v1/search with format xml

View File

@@ -68,7 +68,7 @@ def mk_query(inp):
     phrase_split = re.split(r"([ ,:'-])", inp)
 
     for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue='>'):
-        query.add_node(breakchar, PHRASE_ANY, 0.1, word, word)
+        query.add_node(breakchar, PHRASE_ANY, word, word)
 
     return query
@@ -153,9 +153,9 @@ def test_postcode_inside_postcode_phrase(pc_config):
 
     query = QueryStruct([])
     query.nodes[-1].ptype = PHRASE_STREET
-    query.add_node(',', PHRASE_STREET, 0.1, '12345', '12345')
-    query.add_node(',', PHRASE_POSTCODE, 0.1, 'xz', 'xz')
-    query.add_node('>', PHRASE_POSTCODE, 0.1, '4444', '4444')
+    query.add_node(',', PHRASE_STREET, '12345', '12345')
+    query.add_node(',', PHRASE_POSTCODE, 'xz', 'xz')
+    query.add_node('>', PHRASE_POSTCODE, '4444', '4444')
 
     assert parser.parse(query) == {(2, 3, '4444')}
@@ -165,7 +165,7 @@ def test_partial_postcode_in_postcode_phrase(pc_config):
 
     query = QueryStruct([])
     query.nodes[-1].ptype = PHRASE_POSTCODE
-    query.add_node(' ', PHRASE_POSTCODE, 0.1, '2224', '2224')
-    query.add_node('>', PHRASE_POSTCODE, 0.1, '12345', '12345')
+    query.add_node(' ', PHRASE_POSTCODE, '2224', '2224')
+    query.add_node('>', PHRASE_POSTCODE, '12345', '12345')
 
     assert not parser.parse(query)

View File

@@ -51,15 +51,15 @@ def test_token_range_unimplemented_ops():
 
 def test_query_extract_words():
     q = nq.QueryStruct([])
-    q.add_node(nq.BREAK_WORD, nq.PHRASE_ANY, 0.1, '12', '')
-    q.add_node(nq.BREAK_TOKEN, nq.PHRASE_ANY, 0.0, 'ab', '')
-    q.add_node(nq.BREAK_PHRASE, nq.PHRASE_ANY, 0.0, '12', '')
-    q.add_node(nq.BREAK_END, nq.PHRASE_ANY, 0.5, 'hallo', '')
+    q.add_node(nq.BREAK_WORD, nq.PHRASE_ANY, '12', '')
+    q.add_node(nq.BREAK_TOKEN, nq.PHRASE_ANY, 'ab', '')
+    q.add_node(nq.BREAK_PHRASE, nq.PHRASE_ANY, '12', '')
+    q.add_node(nq.BREAK_END, nq.PHRASE_ANY, 'hallo', '')
 
-    words = q.extract_words(base_penalty=1.0)
+    words = q.extract_words()
 
     assert set(words.keys()) \
         == {'12', 'ab', 'hallo', '12 ab', 'ab 12', '12 ab 12'}
 
-    assert sorted(words['12']) == [nq.TokenRange(0, 1, 1.0), nq.TokenRange(2, 3, 1.0)]
-    assert words['12 ab'] == [nq.TokenRange(0, 2, 1.1)]
-    assert words['hallo'] == [nq.TokenRange(3, 4, 1.0)]
+    assert sorted(words['12']) == [nq.TokenRange(0, 1), nq.TokenRange(2, 3)]
+    assert words['12 ab'] == [nq.TokenRange(0, 2)]
+    assert words['hallo'] == [nq.TokenRange(3, 4)]

View File

@@ -12,8 +12,8 @@ import pytest
 
 from nominatim_api.search.query import QueryStruct, Phrase, TokenRange, Token
 import nominatim_api.search.query as qmod
 from nominatim_api.search.token_assignment import (yield_token_assignments,
-                                                   TokenAssignment,
-                                                   PENALTY_TOKENCHANGE)
+                                                   TokenAssignment)
+from nominatim_api.search.icu_tokenizer import PENALTY_BREAK
 
 
 class MyToken(Token):
@@ -28,6 +28,7 @@ def make_query(*args):
 
     for btype, ptype, _ in args[1:]:
         q.add_node(btype, ptype)
+        q.nodes[-1].penalty = PENALTY_BREAK[btype]
     q.add_node(qmod.BREAK_END, qmod.PHRASE_ANY)
 
     for start, t in enumerate(args):
@@ -94,7 +95,7 @@ def test_multiple_simple_words(btype):
                    (btype, qmod.PHRASE_ANY, [(2, qmod.TOKEN_PARTIAL)]),
                    (btype, qmod.PHRASE_ANY, [(3, qmod.TOKEN_PARTIAL)]))
 
-    penalty = PENALTY_TOKENCHANGE[btype]
+    penalty = PENALTY_BREAK[btype]
 
     check_assignments(yield_token_assignments(q),
                       TokenAssignment(name=TokenRange(0, 3)),