mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
add direction penalties
Direction penalties are estimated by getting the name-to-address ratio usage for each partial term in the query and computing the linear regression of that ratio over the entire phrase. Or to put it in other words: we try to determine if the terms at the beginning or the end of the query are more likely to constitute a name. Direction penalties are currently used only in classic name queries.
This commit is contained in:
@@ -2,7 +2,7 @@
|
|||||||
#
|
#
|
||||||
# This file is part of Nominatim. (https://nominatim.org)
|
# This file is part of Nominatim. (https://nominatim.org)
|
||||||
#
|
#
|
||||||
# Copyright (C) 2024 by the Nominatim developer community.
|
# Copyright (C) 2025 by the Nominatim developer community.
|
||||||
# For a full list of authors see the git log.
|
# For a full list of authors see the git log.
|
||||||
"""
|
"""
|
||||||
Public interface to the search code.
|
Public interface to the search code.
|
||||||
@@ -50,6 +50,9 @@ class ForwardGeocoder:
|
|||||||
self.query_analyzer = await make_query_analyzer(self.conn)
|
self.query_analyzer = await make_query_analyzer(self.conn)
|
||||||
|
|
||||||
query = await self.query_analyzer.analyze_query(phrases)
|
query = await self.query_analyzer.analyze_query(phrases)
|
||||||
|
query.compute_direction_penalty()
|
||||||
|
log().var_dump('Query direction penalty',
|
||||||
|
lambda: f"[{'LR' if query.dir_penalty < 0 else 'RL'}] {query.dir_penalty}")
|
||||||
|
|
||||||
searches: List[AbstractSearch] = []
|
searches: List[AbstractSearch] = []
|
||||||
if query.num_token_slots() > 0:
|
if query.num_token_slots() > 0:
|
||||||
|
|||||||
@@ -13,6 +13,10 @@ from collections import defaultdict
|
|||||||
import dataclasses
|
import dataclasses
|
||||||
|
|
||||||
|
|
||||||
|
LINFAC = [i * (sum(si * si for si in range(i)) - (i - 1) * i * (i - 1) / 4)
|
||||||
|
for i in range(50)]
|
||||||
|
|
||||||
|
|
||||||
BreakType = str
|
BreakType = str
|
||||||
""" Type of break between tokens.
|
""" Type of break between tokens.
|
||||||
"""
|
"""
|
||||||
@@ -201,6 +205,15 @@ class QueryNode:
|
|||||||
types of tokens spanning over the gap.
|
types of tokens spanning over the gap.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def name_address_ratio(self) -> float:
|
||||||
|
""" Return the propability that the partial token belonging to
|
||||||
|
this node forms part of a name (as opposed of part of the address).
|
||||||
|
"""
|
||||||
|
if self.partial is None:
|
||||||
|
return 0.5
|
||||||
|
|
||||||
|
return self.partial.count / (self.partial.count + self.partial.addr_count)
|
||||||
|
|
||||||
def adjust_break(self, btype: BreakType, penalty: float) -> None:
|
def adjust_break(self, btype: BreakType, penalty: float) -> None:
|
||||||
""" Change the break type and penalty for this node.
|
""" Change the break type and penalty for this node.
|
||||||
"""
|
"""
|
||||||
@@ -242,12 +255,20 @@ class QueryStruct:
|
|||||||
need to be direct neighbours. Thus the query is represented as a
|
need to be direct neighbours. Thus the query is represented as a
|
||||||
directed acyclic graph.
|
directed acyclic graph.
|
||||||
|
|
||||||
|
A query also has a direction penalty 'dir_penalty'. This describes
|
||||||
|
the likelyhood if the query should be read from left-to-right or
|
||||||
|
vice versa. A negative 'dir_penalty' should be read as a penalty on
|
||||||
|
right-to-left reading, while a positive value represents a penalty
|
||||||
|
for left-to-right reading. The default value is 0, which is equivalent
|
||||||
|
to having no information about the reading.
|
||||||
|
|
||||||
When created, a query contains a single node: the start of the
|
When created, a query contains a single node: the start of the
|
||||||
query. Further nodes can be added by appending to 'nodes'.
|
query. Further nodes can be added by appending to 'nodes'.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, source: List[Phrase]) -> None:
|
def __init__(self, source: List[Phrase]) -> None:
|
||||||
self.source = source
|
self.source = source
|
||||||
|
self.dir_penalty = 0.0
|
||||||
self.nodes: List[QueryNode] = \
|
self.nodes: List[QueryNode] = \
|
||||||
[QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY,
|
[QueryNode(BREAK_START, source[0].ptype if source else PHRASE_ANY,
|
||||||
0.0, '', '')]
|
0.0, '', '')]
|
||||||
@@ -291,6 +312,21 @@ class QueryStruct:
|
|||||||
else:
|
else:
|
||||||
tlist.append(token)
|
tlist.append(token)
|
||||||
|
|
||||||
|
def compute_direction_penalty(self) -> None:
|
||||||
|
""" Recompute the direction probability from the partial tokens
|
||||||
|
of each node.
|
||||||
|
"""
|
||||||
|
n = len(self.nodes) - 1
|
||||||
|
if n == 1 or n >= 50:
|
||||||
|
self.dir_penalty = 0
|
||||||
|
elif n == 2:
|
||||||
|
self.dir_penalty = (self.nodes[1].name_address_ratio()
|
||||||
|
- self.nodes[0].name_address_ratio()) / 3
|
||||||
|
else:
|
||||||
|
ratios = [n.name_address_ratio() for n in self.nodes[:-1]]
|
||||||
|
self.dir_penalty = (n * sum(i * r for i, r in enumerate(ratios))
|
||||||
|
- sum(ratios) * n * (n - 1) / 2) / LINFAC[n]
|
||||||
|
|
||||||
def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]:
|
def get_tokens(self, trange: TokenRange, ttype: TokenType) -> List[Token]:
|
||||||
""" Get the list of tokens of a given type, spanning the given
|
""" Get the list of tokens of a given type, spanning the given
|
||||||
nodes. The nodes must exist. If no tokens exist, an
|
nodes. The nodes must exist. If no tokens exist, an
|
||||||
|
|||||||
@@ -286,8 +286,12 @@ class _TokenSequence:
|
|||||||
log().var_dump('skip forward', (base.postcode, first))
|
log().var_dump('skip forward', (base.postcode, first))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
penalty = self.penalty
|
||||||
|
if self.direction == 1 and query.dir_penalty > 0:
|
||||||
|
penalty += query.dir_penalty
|
||||||
|
|
||||||
log().comment('first word = name')
|
log().comment('first word = name')
|
||||||
yield dataclasses.replace(base, penalty=self.penalty,
|
yield dataclasses.replace(base, penalty=penalty,
|
||||||
name=first, address=base.address[1:])
|
name=first, address=base.address[1:])
|
||||||
|
|
||||||
# To paraphrase:
|
# To paraphrase:
|
||||||
@@ -300,14 +304,15 @@ class _TokenSequence:
|
|||||||
or (query.nodes[first.start].ptype != qmod.PHRASE_ANY):
|
or (query.nodes[first.start].ptype != qmod.PHRASE_ANY):
|
||||||
return
|
return
|
||||||
|
|
||||||
penalty = self.penalty
|
|
||||||
|
|
||||||
# Penalty for:
|
# Penalty for:
|
||||||
# * <name>, <street>, <housenumber> , ...
|
# * <name>, <street>, <housenumber> , ...
|
||||||
# * queries that are comma-separated
|
# * queries that are comma-separated
|
||||||
if (base.housenumber and base.housenumber > first) or len(query.source) > 1:
|
if (base.housenumber and base.housenumber > first) or len(query.source) > 1:
|
||||||
penalty += 0.25
|
penalty += 0.25
|
||||||
|
|
||||||
|
if self.direction == 0 and query.dir_penalty > 0:
|
||||||
|
penalty += query.dir_penalty
|
||||||
|
|
||||||
for i in range(first.start + 1, first.end):
|
for i in range(first.start + 1, first.end):
|
||||||
name, addr = first.split(i)
|
name, addr = first.split(i)
|
||||||
log().comment(f'split first word = name ({i - first.start})')
|
log().comment(f'split first word = name ({i - first.start})')
|
||||||
@@ -326,9 +331,13 @@ class _TokenSequence:
|
|||||||
log().var_dump('skip backward', (base.postcode, last))
|
log().var_dump('skip backward', (base.postcode, last))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
penalty = self.penalty
|
||||||
|
if self.direction == -1 and query.dir_penalty < 0:
|
||||||
|
penalty -= query.dir_penalty
|
||||||
|
|
||||||
if self.direction == -1 or len(base.address) > 1 or base.postcode:
|
if self.direction == -1 or len(base.address) > 1 or base.postcode:
|
||||||
log().comment('last word = name')
|
log().comment('last word = name')
|
||||||
yield dataclasses.replace(base, penalty=self.penalty,
|
yield dataclasses.replace(base, penalty=penalty,
|
||||||
name=last, address=base.address[:-1])
|
name=last, address=base.address[:-1])
|
||||||
|
|
||||||
# To paraphrase:
|
# To paraphrase:
|
||||||
@@ -341,12 +350,14 @@ class _TokenSequence:
|
|||||||
or (query.nodes[last.start].ptype != qmod.PHRASE_ANY):
|
or (query.nodes[last.start].ptype != qmod.PHRASE_ANY):
|
||||||
return
|
return
|
||||||
|
|
||||||
penalty = self.penalty
|
|
||||||
if base.housenumber and base.housenumber < last:
|
if base.housenumber and base.housenumber < last:
|
||||||
penalty += 0.4
|
penalty += 0.4
|
||||||
if len(query.source) > 1:
|
if len(query.source) > 1:
|
||||||
penalty += 0.25
|
penalty += 0.25
|
||||||
|
|
||||||
|
if self.direction == 0 and query.dir_penalty < 0:
|
||||||
|
penalty -= query.dir_penalty
|
||||||
|
|
||||||
for i in range(last.start + 1, last.end):
|
for i in range(last.start + 1, last.end):
|
||||||
addr, name = last.split(i)
|
addr, name = last.split(i)
|
||||||
log().comment(f'split last word = name ({i - last.start})')
|
log().comment(f'split last word = name ({i - last.start})')
|
||||||
@@ -379,11 +390,11 @@ class _TokenSequence:
|
|||||||
if base.postcode and base.postcode.start == 0:
|
if base.postcode and base.postcode.start == 0:
|
||||||
self.penalty += 0.1
|
self.penalty += 0.1
|
||||||
|
|
||||||
# Right-to-left reading of the address
|
# Left-to-right reading of the address
|
||||||
if self.direction != -1:
|
if self.direction != -1:
|
||||||
yield from self._get_assignments_address_forward(base, query)
|
yield from self._get_assignments_address_forward(base, query)
|
||||||
|
|
||||||
# Left-to-right reading of the address
|
# Right-to-left reading of the address
|
||||||
if self.direction != 1:
|
if self.direction != 1:
|
||||||
yield from self._get_assignments_address_backward(base, query)
|
yield from self._get_assignments_address_backward(base, query)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user