add direction penalties

Direction penalties are estimated by getting the name-to-address
usage ratio for each partial term in the query and computing the
linear regression of that ratio over the entire phrase. Or to put
it in other words: we try to determine if the terms at the beginning
or the end of the query are more likely to constitute a name.

Direction penalties are currently used only in classic name queries.
This commit is contained in:
Sarah Hoffmann
2025-04-11 20:41:06 +02:00
parent 2ef0e20a3f
commit 06e39e42d8
3 changed files with 58 additions and 8 deletions

View File

@@ -286,8 +286,12 @@ class _TokenSequence:
log().var_dump('skip forward', (base.postcode, first))
return
penalty = self.penalty
if self.direction == 1 and query.dir_penalty > 0:
penalty += query.dir_penalty
log().comment('first word = name')
yield dataclasses.replace(base, penalty=self.penalty,
yield dataclasses.replace(base, penalty=penalty,
name=first, address=base.address[1:])
# To paraphrase:
@@ -300,14 +304,15 @@ class _TokenSequence:
or (query.nodes[first.start].ptype != qmod.PHRASE_ANY):
return
penalty = self.penalty
# Penalty for:
# * <name>, <street>, <housenumber> , ...
# * queries that are comma-separated
if (base.housenumber and base.housenumber > first) or len(query.source) > 1:
penalty += 0.25
if self.direction == 0 and query.dir_penalty > 0:
penalty += query.dir_penalty
for i in range(first.start + 1, first.end):
name, addr = first.split(i)
log().comment(f'split first word = name ({i - first.start})')
@@ -326,9 +331,13 @@ class _TokenSequence:
log().var_dump('skip backward', (base.postcode, last))
return
penalty = self.penalty
if self.direction == -1 and query.dir_penalty < 0:
penalty -= query.dir_penalty
if self.direction == -1 or len(base.address) > 1 or base.postcode:
log().comment('last word = name')
yield dataclasses.replace(base, penalty=self.penalty,
yield dataclasses.replace(base, penalty=penalty,
name=last, address=base.address[:-1])
# To paraphrase:
@@ -341,12 +350,14 @@ class _TokenSequence:
or (query.nodes[last.start].ptype != qmod.PHRASE_ANY):
return
penalty = self.penalty
if base.housenumber and base.housenumber < last:
penalty += 0.4
if len(query.source) > 1:
penalty += 0.25
if self.direction == 0 and query.dir_penalty < 0:
penalty -= query.dir_penalty
for i in range(last.start + 1, last.end):
addr, name = last.split(i)
log().comment(f'split last word = name ({i - last.start})')
@@ -379,11 +390,11 @@ class _TokenSequence:
if base.postcode and base.postcode.start == 0:
self.penalty += 0.1
# Right-to-left reading of the address
# Left-to-right reading of the address
if self.direction != -1:
yield from self._get_assignments_address_forward(base, query)
# Left-to-right reading of the address
# Right-to-left reading of the address
if self.direction != 1:
yield from self._get_assignments_address_backward(base, query)