diff --git a/src/nominatim_api/search/geocoder.py b/src/nominatim_api/search/geocoder.py index 8901529f..5fefe5ea 100644 --- a/src/nominatim_api/search/geocoder.py +++ b/src/nominatim_api/search/geocoder.py @@ -2,7 +2,7 @@ # # This file is part of Nominatim. (https://nominatim.org) # -# Copyright (C) 2024 by the Nominatim developer community. +# Copyright (C) 2025 by the Nominatim developer community. # For a full list of authors see the git log. """ Public interface to the search code. @@ -50,6 +50,9 @@ class ForwardGeocoder: self.query_analyzer = await make_query_analyzer(self.conn) query = await self.query_analyzer.analyze_query(phrases) + query.compute_direction_penalty() + log().var_dump('Query direction penalty', + lambda: f"[{'LR' if query.dir_penalty < 0 else 'RL'}] {query.dir_penalty}") searches: List[AbstractSearch] = [] if query.num_token_slots() > 0: diff --git a/src/nominatim_api/search/query.py b/src/nominatim_api/search/query.py index 652c3986..4d8022b6 100644 --- a/src/nominatim_api/search/query.py +++ b/src/nominatim_api/search/query.py @@ -13,6 +13,10 @@ from collections import defaultdict import dataclasses +LINFAC = [i * (sum(si * si for si in range(i)) - (i - 1) * i * (i - 1) / 4) + for i in range(50)] + + BreakType = str """ Type of break between tokens. """ @@ -201,6 +205,15 @@ class QueryNode: types of tokens spanning over the gap. """ + def name_address_ratio(self) -> float: + """ Return the propability that the partial token belonging to + this node forms part of a name (as opposed of part of the address). + """ + if self.partial is None: + return 0.5 + + return self.partial.count / (self.partial.count + self.partial.addr_count) + def adjust_break(self, btype: BreakType, penalty: float) -> None: """ Change the break type and penalty for this node. """ @@ -242,12 +255,20 @@ class QueryStruct: need to be direct neighbours. Thus the query is represented as a directed acyclic graph. + A query also has a direction penalty 'dir_penalty'. This describes + the likelyhood if the query should be read from left-to-right or + vice versa. A negative 'dir_penalty' should be read as a penalty on + right-to-left reading, while a positive value represents a penalty + for left-to-right reading. The default value is 0, which is equivalent + to having no information about the reading. + When created, a query contains a single node: the start of the query. Further nodes can be added by appending to 'nodes'. """ def __init__(self, source: List[Phrase]) -> None: self.source = source + self.dir_penalty = 0.0 self.nodes: List[QueryNode] = \ [QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY, 0.0, '', '')] @@ -291,6 +312,21 @@ class QueryStruct: else: tlist.append(token) + def compute_direction_penalty(self) -> None: + """ Recompute the direction probability from the partial tokens + of each node. + """ + n = len(self.nodes) - 1 + if n == 1 or n >= 50: + self.dir_penalty = 0 + elif n == 2: + self.dir_penalty = (self.nodes[1].name_address_ratio() + - self.nodes[0].name_address_ratio()) / 3 + else: + ratios = [n.name_address_ratio() for n in self.nodes[:-1]] + self.dir_penalty = (n * sum(i * r for i, r in enumerate(ratios)) + - sum(ratios) * n * (n - 1) / 2) / LINFAC[n] + def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]: """ Get the list of tokens of a given type, spanning the given nodes. The nodes must exist. If no tokens exist, an diff --git a/src/nominatim_api/search/token_assignment.py b/src/nominatim_api/search/token_assignment.py index a7a2e8d5..a0df7d03 100644 --- a/src/nominatim_api/search/token_assignment.py +++ b/src/nominatim_api/search/token_assignment.py @@ -286,8 +286,12 @@ class _TokenSequence: log().var_dump('skip forward', (base.postcode, first)) return + penalty = self.penalty + if self.direction == 1 and query.dir_penalty > 0: + penalty += query.dir_penalty + log().comment('first word = name') - yield dataclasses.replace(base, penalty=self.penalty, + yield dataclasses.replace(base, penalty=penalty, name=first, address=base.address[1:]) # To paraphrase: @@ -300,14 +304,15 @@ class _TokenSequence: or (query.nodes[first.start].ptype != qmod.PHRASE_ANY): return - penalty = self.penalty - # Penalty for: # * , , , ... # * queries that are comma-separated if (base.housenumber and base.housenumber > first) or len(query.source) > 1: penalty += 0.25 + if self.direction == 0 and query.dir_penalty > 0: + penalty += query.dir_penalty + for i in range(first.start + 1, first.end): name, addr = first.split(i) log().comment(f'split first word = name ({i - first.start})') @@ -326,9 +331,13 @@ class _TokenSequence: log().var_dump('skip backward', (base.postcode, last)) return + penalty = self.penalty + if self.direction == -1 and query.dir_penalty < 0: + penalty -= query.dir_penalty + if self.direction == -1 or len(base.address) > 1 or base.postcode: log().comment('last word = name') - yield dataclasses.replace(base, penalty=self.penalty, + yield dataclasses.replace(base, penalty=penalty, name=last, address=base.address[:-1]) # To paraphrase: @@ -341,12 +350,14 @@ class _TokenSequence: or (query.nodes[last.start].ptype != qmod.PHRASE_ANY): return - penalty = self.penalty if base.housenumber and base.housenumber < last: penalty += 0.4 if len(query.source) > 1: penalty += 0.25 + if self.direction == 0 and query.dir_penalty < 0: + penalty -= query.dir_penalty + for i in range(last.start + 1, last.end): addr, name = last.split(i) log().comment(f'split last word = name ({i - last.start})') @@ -379,11 +390,11 @@ class _TokenSequence: if base.postcode and base.postcode.start == 0: self.penalty += 0.1 - # Right-to-left reading of the address + # Left-to-right reading of the address if self.direction != -1: yield from self._get_assignments_address_forward(base, query) - # Left-to-right reading of the address + # Right-to-left reading of the address if self.direction != 1: yield from self._get_assignments_address_backward(base, query)