add direction penalties

Direction penalties are estimated by getting the name to address
ratio usage for each partial term in the query and computing the
linear regression of that ratio over the entire phrase. Or to put
it in ither words: we try to determine if the terms at the beginning
or the end of the query are more likely to constitute a name.

Direction penalties are currently used only in classic name queries.
This commit is contained in:
Sarah Hoffmann
2025-04-11 20:41:06 +02:00
parent 2ef0e20a3f
commit 06e39e42d8
3 changed files with 58 additions and 8 deletions

View File

@@ -2,7 +2,7 @@
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Public interface to the search code.
@@ -50,6 +50,9 @@ class ForwardGeocoder:
self.query_analyzer = await make_query_analyzer(self.conn)
query = await self.query_analyzer.analyze_query(phrases)
query.compute_direction_penalty()
log().var_dump('Query direction penalty',
lambda: f"[{'LR' if query.dir_penalty < 0 else 'RL'}] {query.dir_penalty}")
searches: List[AbstractSearch] = []
if query.num_token_slots() > 0:

View File

@@ -13,6 +13,10 @@ from collections import defaultdict
import dataclasses
LINFAC = [i * (sum(si * si for si in range(i)) - (i - 1) * i * (i - 1) / 4)
for i in range(50)]
BreakType = str
""" Type of break between tokens.
"""
@@ -201,6 +205,15 @@ class QueryNode:
types of tokens spanning over the gap.
"""
def name_address_ratio(self) -> float:
""" Return the propability that the partial token belonging to
this node forms part of a name (as opposed of part of the address).
"""
if self.partial is None:
return 0.5
return self.partial.count / (self.partial.count + self.partial.addr_count)
def adjust_break(self, btype: BreakType, penalty: float) -> None:
""" Change the break type and penalty for this node.
"""
@@ -242,12 +255,20 @@ class QueryStruct:
need to be direct neighbours. Thus the query is represented as a
directed acyclic graph.
A query also has a direction penalty 'dir_penalty'. This describes
the likelyhood if the query should be read from left-to-right or
vice versa. A negative 'dir_penalty' should be read as a penalty on
right-to-left reading, while a positive value represents a penalty
for left-to-right reading. The default value is 0, which is equivalent
to having no information about the reading.
When created, a query contains a single node: the start of the
query. Further nodes can be added by appending to 'nodes'.
"""
def __init__(self, source: List[Phrase]) -> None:
self.source = source
self.dir_penalty = 0.0
self.nodes: List[QueryNode] = \
[QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY,
0.0, '', '')]
@@ -291,6 +312,21 @@ class QueryStruct:
else:
tlist.append(token)
def compute_direction_penalty(self) -> None:
""" Recompute the direction probability from the partial tokens
of each node.
"""
n = len(self.nodes) - 1
if n == 1 or n >= 50:
self.dir_penalty = 0
elif n == 2:
self.dir_penalty = (self.nodes[1].name_address_ratio()
- self.nodes[0].name_address_ratio()) / 3
else:
ratios = [n.name_address_ratio() for n in self.nodes[:-1]]
self.dir_penalty = (n * sum(i * r for i, r in enumerate(ratios))
- sum(ratios) * n * (n - 1) / 2) / LINFAC[n]
def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]:
""" Get the list of tokens of a given type, spanning the given
nodes. The nodes must exist. If no tokens exist, an

View File

@@ -286,8 +286,12 @@ class _TokenSequence:
log().var_dump('skip forward', (base.postcode, first))
return
penalty = self.penalty
if self.direction == 1 and query.dir_penalty > 0:
penalty += query.dir_penalty
log().comment('first word = name')
yield dataclasses.replace(base, penalty=self.penalty,
yield dataclasses.replace(base, penalty=penalty,
name=first, address=base.address[1:])
# To paraphrase:
@@ -300,14 +304,15 @@ class _TokenSequence:
or (query.nodes[first.start].ptype != qmod.PHRASE_ANY):
return
penalty = self.penalty
# Penalty for:
# * <name>, <street>, <housenumber> , ...
# * queries that are comma-separated
if (base.housenumber and base.housenumber > first) or len(query.source) > 1:
penalty += 0.25
if self.direction == 0 and query.dir_penalty > 0:
penalty += query.dir_penalty
for i in range(first.start + 1, first.end):
name, addr = first.split(i)
log().comment(f'split first word = name ({i - first.start})')
@@ -326,9 +331,13 @@ class _TokenSequence:
log().var_dump('skip backward', (base.postcode, last))
return
penalty = self.penalty
if self.direction == -1 and query.dir_penalty < 0:
penalty -= query.dir_penalty
if self.direction == -1 or len(base.address) > 1 or base.postcode:
log().comment('last word = name')
yield dataclasses.replace(base, penalty=self.penalty,
yield dataclasses.replace(base, penalty=penalty,
name=last, address=base.address[:-1])
# To paraphrase:
@@ -341,12 +350,14 @@ class _TokenSequence:
or (query.nodes[last.start].ptype != qmod.PHRASE_ANY):
return
penalty = self.penalty
if base.housenumber and base.housenumber < last:
penalty += 0.4
if len(query.source) > 1:
penalty += 0.25
if self.direction == 0 and query.dir_penalty < 0:
penalty -= query.dir_penalty
for i in range(last.start + 1, last.end):
addr, name = last.split(i)
log().comment(f'split last word = name ({i - last.start})')
@@ -379,11 +390,11 @@ class _TokenSequence:
if base.postcode and base.postcode.start == 0:
self.penalty += 0.1
# Right-to-left reading of the address
# Left-to-right reading of the address
if self.direction != -1:
yield from self._get_assignments_address_forward(base, query)
# Left-to-right reading of the address
# Right-to-left reading of the address
if self.direction != 1:
yield from self._get_assignments_address_backward(base, query)