Mirror of https://github.com/osm-search/Nominatim.git (synced 2026-02-14 01:47:57 +00:00)

Compare commits: 15 commits, 1aeb8a262c ... 621d8e785b
Commits in this comparison:

- 621d8e785b
- 830307484b
- 5d6967a1d0
- 26903aec0b
- c39183e3a5
- 21ef3be433
- 99562a197e
- fe30663b21
- 73ee17af95
- b9252cc348
- 71025f3f43
- e4b671f8b1
- 7ebd121abc
- 4634ad0720
- 4a9253a0a9
@@ -641,24 +641,6 @@ See also [NOMINATIM_DEFAULT_LANGUAGE](#nominatim_default_language).
 
 ### Logging Settings
 
-#### NOMINATIM_LOG_DB
-
-| Summary | |
-| -------------- | --------------------------------------------------- |
-| **Description:** | Log requests into the database |
-| **Format:** | boolean |
-| **Default:** | no |
-| **After Changes:** | run `nominatim refresh --website` |
-
-Enable logging requests into a database table with this setting. The logs
-can be found in the table `new_query_log`.
-
-When using this logging method, it is advisable to set up a job that
-regularly clears out old logging information. Nominatim will not do that
-on its own.
-
-Can be used as the same time as NOMINATIM_LOG_FILE.
-
 #### NOMINATIM_LOG_FILE
 
 | Summary | |

@@ -682,8 +664,6 @@ given in seconds and includes the entire time the query was queued and executed
 in the frontend.
 type contains the name of the endpoint used.
 
-Can be used as the same time as NOMINATIM_LOG_DB.
-
 #### NOMINATIM_DEBUG_SQL
 
 | Summary | |
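The documentation removed above recommended clearing `new_query_log` regularly, since Nominatim never prunes that table on its own. A minimal sketch of such a cleanup job follows; it is not part of Nominatim, it assumes the log table keeps a `starttime` timestamp column, and it connects to the local `nominatim` database, so adjust both to the installation at hand.

```python
"""Prune old rows from Nominatim's request log table.

Sketch only: assumes `new_query_log` has a `starttime` timestamp column and
that the database is reachable as the local 'nominatim' database.
Run it from cron, e.g. once per day.
"""
import psycopg2

KEEP_DAYS = 14  # how much logging history to retain

with psycopg2.connect(dbname='nominatim') as conn:
    with conn.cursor() as cur:
        # Delete everything older than the retention window.
        cur.execute(
            "DELETE FROM new_query_log WHERE starttime < now() - %s * interval '1 day'",
            (KEEP_DAYS,))
        print(f"removed {cur.rowcount} old log entries")
```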
@@ -5,7 +5,6 @@
 # Database connection string.
 # Add host, port, user etc through additional semicolon-separated attributes.
 # e.g. ;host=...;port=...;user=...;password=...
-# Changing this variable requires to run 'nominatim refresh --website'.
 NOMINATIM_DATABASE_DSN="pgsql:dbname=nominatim"
 
 # Database web user.

@@ -36,11 +35,11 @@ NOMINATIM_TOKENIZER_CONFIG=
 
 # Search in the Tiger house number data for the US.
 # Note: The tables must already exist or queries will throw errors.
-# Changing this value requires to run ./utils/setup --create-functions --setup-website.
+# Changing this value requires to run ./utils/setup --create-functions.
 NOMINATIM_USE_US_TIGER_DATA=no
 
 # Search in the auxiliary housenumber table.
-# Changing this value requires to run ./utils/setup --create-functions --setup-website.
+# Changing this value requires to run ./utils/setup --create-functions.
 NOMINATIM_USE_AUX_LOCATION_DATA=no
 
 # Proxy settings

@@ -143,8 +142,7 @@ NOMINATIM_REPLICATION_RECHECK_INTERVAL=60
 
 ### API settings
 #
-# The following settings configure the API responses. You must rerun
-# 'nominatim refresh --website' after changing any of them.
+# The following settings configure the API responses.
 
 # Send permissive CORS access headers.
 # When enabled, send CORS headers to allow access to everybody.

@@ -202,13 +200,7 @@ NOMINATIM_OUTPUT_NAMES=name:XX,name,brand,official_name:XX,short_name:XX,officia
 ### Log settings
 #
 # The following options allow to enable logging of API requests.
-# You must rerun 'nominatim refresh --website' after changing any of them.
 #
-# Enable logging of requests into the DB.
-# The request will be logged into the new_query_log table.
-# You should set up a cron job that regularly clears out this table.
-NOMINATIM_LOG_DB=no
-
 # Enable logging of requests into a file.
 # To enable logging set this setting to the file to log to.
 NOMINATIM_LOG_FILE=
@@ -282,10 +282,14 @@ class SearchBuilder:
         """ Create a ranking expression for a name term in the given range.
         """
         name_fulls = self.query.get_tokens(trange, qmod.TOKEN_WORD)
-        ranks = [dbf.RankedTokens(t.penalty, [t.token]) for t in name_fulls]
+        full_word_penalty = self.query.get_in_word_penalty(trange)
+        ranks = [dbf.RankedTokens(t.penalty + full_word_penalty, [t.token])
+                 for t in name_fulls]
         ranks.sort(key=lambda r: r.penalty)
         # Fallback, sum of penalty for partials
         default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
+        default += sum(n.word_break_penalty
+                       for n in self.query.nodes[trange.start + 1:trange.end])
         return dbf.FieldRanking(db_field, default, ranks)
 
     def get_addr_ranking(self, trange: qmod.TokenRange) -> dbf.FieldRanking:

@@ -297,14 +301,14 @@ class SearchBuilder:
         ranks: List[dbf.RankedTokens] = []
 
         while todo:
-            neglen, pos, rank = heapq.heappop(todo)
+            _, pos, rank = heapq.heappop(todo)
             # partial node
             partial = self.query.nodes[pos].partial
             if partial is not None:
                 if pos + 1 < trange.end:
                     penalty = rank.penalty + partial.penalty \
-                              + PENALTY_WORDCHANGE[self.query.nodes[pos + 1].btype]
-                    heapq.heappush(todo, (neglen - 1, pos + 1,
+                              + self.query.nodes[pos + 1].word_break_penalty
+                    heapq.heappush(todo, (-(pos + 1), pos + 1,
                                           dbf.RankedTokens(penalty, rank.tokens)))
                 else:
                     ranks.append(dbf.RankedTokens(rank.penalty + partial.penalty,

@@ -313,9 +317,11 @@ class SearchBuilder:
             for tlist in self.query.nodes[pos].starting:
                 if tlist.ttype == qmod.TOKEN_WORD:
                     if tlist.end < trange.end:
-                        chgpenalty = PENALTY_WORDCHANGE[self.query.nodes[tlist.end].btype]
+                        chgpenalty = self.query.nodes[tlist.end].word_break_penalty \
+                                     + self.query.get_in_word_penalty(
+                                           qmod.TokenRange(pos, tlist.end))
                         for t in tlist.tokens:
-                            heapq.heappush(todo, (neglen - 1, tlist.end,
+                            heapq.heappush(todo, (-tlist.end, tlist.end,
                                                   rank.with_token(t, chgpenalty)))
                     elif tlist.end == trange.end:
                         ranks.extend(rank.with_token(t, 0.0) for t in tlist.tokens)

@@ -324,6 +330,8 @@ class SearchBuilder:
                 # Too many variants, bail out and only add
                 # Worst-case Fallback: sum of penalty of partials
                 default = sum(t.penalty for t in self.query.iter_partials(trange)) + 0.2
+                default += sum(n.word_break_penalty
+                               for n in self.query.nodes[trange.start + 1:trange.end])
                 ranks.append(dbf.RankedTokens(rank.penalty + default, []))
                 # Bail out of outer loop
                 break

@@ -346,6 +354,7 @@ class SearchBuilder:
             if not tokens:
                 return None
             sdata.set_strings('countries', tokens)
+            sdata.penalty += self.query.get_in_word_penalty(assignment.country)
         elif self.details.countries:
             sdata.countries = dbf.WeightedStrings(self.details.countries,
                                                   [0.0] * len(self.details.countries))

@@ -353,29 +362,24 @@ class SearchBuilder:
             sdata.set_strings('housenumbers',
                               self.query.get_tokens(assignment.housenumber,
                                                     qmod.TOKEN_HOUSENUMBER))
+            sdata.penalty += self.query.get_in_word_penalty(assignment.housenumber)
         if assignment.postcode:
             sdata.set_strings('postcodes',
                               self.query.get_tokens(assignment.postcode,
                                                     qmod.TOKEN_POSTCODE))
+            sdata.penalty += self.query.get_in_word_penalty(assignment.postcode)
         if assignment.qualifier:
             tokens = self.get_qualifier_tokens(assignment.qualifier)
             if not tokens:
                 return None
             sdata.set_qualifiers(tokens)
+            sdata.penalty += self.query.get_in_word_penalty(assignment.qualifier)
         elif self.details.categories:
             sdata.qualifiers = dbf.WeightedCategories(self.details.categories,
                                                       [0.0] * len(self.details.categories))
 
         if assignment.address:
-            if not assignment.name and assignment.housenumber:
-                # housenumber search: the first item needs to be handled like
-                # a name in ranking or penalties are not comparable with
-                # normal searches.
-                sdata.set_ranking([self.get_name_ranking(assignment.address[0],
-                                                         db_field='nameaddress_vector')]
-                                  + [self.get_addr_ranking(r) for r in assignment.address[1:]])
-            else:
-                sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
+            sdata.set_ranking([self.get_addr_ranking(r) for r in assignment.address])
         else:
             sdata.rankings = []
 

@@ -421,14 +425,3 @@ class SearchBuilder:
             return dbf.WeightedCategories(list(tokens.keys()), list(tokens.values()))
 
         return None
-
-
-PENALTY_WORDCHANGE = {
-    qmod.BREAK_START: 0.0,
-    qmod.BREAK_END: 0.0,
-    qmod.BREAK_PHRASE: 0.0,
-    qmod.BREAK_SOFT_PHRASE: 0.0,
-    qmod.BREAK_WORD: 0.1,
-    qmod.BREAK_PART: 0.2,
-    qmod.BREAK_TOKEN: 0.4
-}
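The ranking changes above replace the static PENALTY_WORDCHANGE table with per-node break penalties: a full-word match is charged the in-word penalty for every transition it spans, while the partial-token fallback is charged a word-break penalty for every interior node. The following self-contained toy calculation (made-up numbers, a stand-in `Node` class rather than the Nominatim types) walks through both paths.

```python
from dataclasses import dataclass

@dataclass
class Node:
    penalty: float                     # break penalty as stored on QueryNode

    @property
    def word_break_penalty(self) -> float:
        return max(0, self.penalty)    # cost of ending a word at this node

    @property
    def word_continuation_penalty(self) -> float:
        return max(0, -self.penalty)   # cost of a word running across this node

# Hypothetical four-term search phrase with three interior transitions:
# two ordinary word breaks (0.1) and one soft phrase break (-0.5).
interior = [Node(0.1), Node(-0.5), Node(0.1)]

# Full-word match spanning all four terms: the token's own penalty plus the
# in-word penalty, i.e. the continuation cost of every crossed node.
token_penalty = 0.3
print(round(token_penalty + sum(n.word_continuation_penalty for n in interior), 2))  # 0.8

# Partial-token fallback: summed partial penalties, the flat 0.2, plus a
# word-break cost for every interior node the partials break at.
partials = [0.1, 0.1, 0.2, 0.1]
default = sum(partials) + 0.2 + sum(n.word_break_penalty for n in interior)
print(round(default, 2))                                                              # 0.9
```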
@@ -83,7 +83,7 @@ class ForwardGeocoder:
         min_ranking = searches[0].penalty + 2.0
         prev_penalty = 0.0
         for i, search in enumerate(searches):
-            if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 20):
+            if search.penalty > prev_penalty and (search.penalty > min_ranking or i > 15):
                 break
             log().table_dump(f"{i + 1}. Search", _dump_searches([search], query))
             log().var_dump('Params', self.params)
@@ -37,14 +37,14 @@ DB_TO_TOKEN_TYPE = {
     'C': qmod.TOKEN_COUNTRY
 }
 
-PENALTY_IN_TOKEN_BREAK = {
-    qmod.BREAK_START: 0.5,
-    qmod.BREAK_END: 0.5,
-    qmod.BREAK_PHRASE: 0.5,
-    qmod.BREAK_SOFT_PHRASE: 0.5,
+PENALTY_BREAK = {
+    qmod.BREAK_START: -0.5,
+    qmod.BREAK_END: -0.5,
+    qmod.BREAK_PHRASE: -0.5,
+    qmod.BREAK_SOFT_PHRASE: -0.5,
     qmod.BREAK_WORD: 0.1,
-    qmod.BREAK_PART: 0.0,
-    qmod.BREAK_TOKEN: 0.0
+    qmod.BREAK_PART: 0.2,
+    qmod.BREAK_TOKEN: 0.4
 }
 
 

@@ -78,13 +78,13 @@ class ICUToken(qmod.Token):
         self.penalty += (distance/len(self.lookup_word))
 
     @staticmethod
-    def from_db_row(row: SaRow, base_penalty: float = 0.0) -> 'ICUToken':
+    def from_db_row(row: SaRow) -> 'ICUToken':
         """ Create a ICUToken from the row of the word table.
         """
         count = 1 if row.info is None else row.info.get('count', 1)
         addr_count = 1 if row.info is None else row.info.get('addr_count', 1)
 
-        penalty = base_penalty
+        penalty = 0.0
         if row.type == 'w':
             penalty += 0.3
         elif row.type == 'W':

@@ -174,11 +174,14 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
 
         self.split_query(query)
         log().var_dump('Transliterated query', lambda: query.get_transliterated_query())
-        words = query.extract_words(base_penalty=PENALTY_IN_TOKEN_BREAK[qmod.BREAK_WORD])
+        words = query.extract_words()
 
         for row in await self.lookup_in_db(list(words.keys())):
             for trange in words[row.word_token]:
-                token = ICUToken.from_db_row(row, trange.penalty or 0.0)
+                # Create a new token for each position because the token
+                # penalty can vary depending on the position in the query.
+                # (See rerank_tokens() below.)
+                token = ICUToken.from_db_row(row)
                 if row.type == 'S':
                     if row.info['op'] in ('in', 'near'):
                         if trange.start == 0:

@@ -200,6 +203,7 @@
                                              lookup_word=pc, word_token=term,
                                              info=None))
         self.rerank_tokens(query)
+        self.compute_break_penalties(query)
 
         log().table_dump('Word tokens', _dump_word_tokens(query))
 

@@ -229,13 +233,10 @@
                 if trans:
                     for term in trans.split(' '):
                         if term:
-                            query.add_node(qmod.BREAK_TOKEN, phrase.ptype,
-                                           PENALTY_IN_TOKEN_BREAK[qmod.BREAK_TOKEN],
-                                           term, word)
-                    query.nodes[-1].adjust_break(breakchar,
-                                                 PENALTY_IN_TOKEN_BREAK[breakchar])
+                            query.add_node(qmod.BREAK_TOKEN, phrase.ptype, term, word)
+                    query.nodes[-1].btype = breakchar
 
-        query.nodes[-1].adjust_break(qmod.BREAK_END, PENALTY_IN_TOKEN_BREAK[qmod.BREAK_END])
+        query.nodes[-1].btype = qmod.BREAK_END
 
     async def lookup_in_db(self, words: List[str]) -> 'sa.Result[Any]':
         """ Return the token information from the database for the

@@ -300,6 +301,12 @@
             for token in tokens:
                 cast(ICUToken, token).rematch(norm)
 
+    def compute_break_penalties(self, query: qmod.QueryStruct) -> None:
+        """ Set the break penalties for the nodes in the query.
+        """
+        for node in query.nodes:
+            node.penalty = PENALTY_BREAK[node.btype]
+
 
 def _dump_word_tokens(query: qmod.QueryStruct) -> Iterator[List[Any]]:
     yield ['type', 'from', 'to', 'token', 'word_token', 'lookup_word', 'penalty', 'count', 'info']
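The renamed PENALTY_BREAK table also flips its sign convention: strong breaks (start, end, phrase) now carry negative values, meaning the cost is charged for continuing a word across them rather than for breaking there. A tiny sketch, using plain strings instead of the qmod constants, spells out which side each value penalizes; `max(0, p)` and `max(0, -p)` are exactly what the QueryNode properties in the following diff return.

```python
# Values copied from the new PENALTY_BREAK above; plain strings stand in for
# the qmod.BREAK_* constants.
PENALTY_BREAK = {
    'BREAK_START': -0.5,
    'BREAK_END': -0.5,
    'BREAK_PHRASE': -0.5,
    'BREAK_SOFT_PHRASE': -0.5,
    'BREAK_WORD': 0.1,
    'BREAK_PART': 0.2,
    'BREAK_TOKEN': 0.4,
}

for btype, p in PENALTY_BREAK.items():
    # positive value: ending a word here is penalized;
    # negative value: letting a word run across this break is penalized.
    print(f"{btype:18} break costs {max(0, p):.1f}, continuing a word costs {max(0, -p):.1f}")
```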
@@ -134,7 +134,6 @@ class TokenRange:
     """
     start: int
     end: int
-    penalty: Optional[float] = None
 
     def __lt__(self, other: 'TokenRange') -> bool:
         return self.end <= other.start

@@ -191,7 +190,9 @@ class QueryNode:
     ptype: PhraseType
 
     penalty: float
-    """ Penalty for the break at this node.
+    """ Penalty for having a word break at this position. The penalty
+        may be negative, when a word break is more likely than continuing
+        the word after the node.
     """
     term_lookup: str
     """ Transliterated term ending at this node.

@@ -212,6 +213,19 @@
         types of tokens spanning over the gap.
     """
 
+    @property
+    def word_break_penalty(self) -> float:
+        """ Penalty to apply when a words ends at this node.
+        """
+        return max(0, self.penalty)
+
+    @property
+    def word_continuation_penalty(self) -> float:
+        """ Penalty to apply when a word continues over this node
+            (i.e. is a multi-term word).
+        """
+        return max(0, -self.penalty)
+
     def name_address_ratio(self) -> float:
         """ Return the propability that the partial token belonging to
             this node forms part of a name (as opposed of part of the address).

@@ -221,12 +235,6 @@
 
         return self.partial.count / (self.partial.count + self.partial.addr_count)
 
-    def adjust_break(self, btype: BreakType, penalty: float) -> None:
-        """ Change the break type and penalty for this node.
-        """
-        self.btype = btype
-        self.penalty = penalty
-
     def has_tokens(self, end: int, *ttypes: TokenType) -> bool:
         """ Check if there are tokens of the given types ending at the
             given node.

@@ -286,13 +294,12 @@ class QueryStruct:
         return len(self.nodes) - 1
 
     def add_node(self, btype: BreakType, ptype: PhraseType,
-                 break_penalty: float = 0.0,
                  term_lookup: str = '', term_normalized: str = '') -> None:
         """ Append a new break node with the given break type.
             The phrase type denotes the type for any tokens starting
             at the node.
         """
-        self.nodes.append(QueryNode(btype, ptype, break_penalty, term_lookup, term_normalized))
+        self.nodes.append(QueryNode(btype, ptype, 0.0, term_lookup, term_normalized))
 
     def add_token(self, trange: TokenRange, ttype: TokenType, token: Token) -> None:
         """ Add a token to the query. 'start' and 'end' are the indexes of the

@@ -324,7 +331,7 @@
             of each node.
         """
         n = len(self.nodes) - 1
-        if n == 1 or n >= 50:
+        if n <= 1 or n >= 50:
             self.dir_penalty = 0
         elif n == 2:
             self.dir_penalty = (self.nodes[1].name_address_ratio()

@@ -344,6 +351,13 @@
         assert ttype != TOKEN_PARTIAL
         return self.nodes[trange.start].get_tokens(trange.end, ttype) or []
 
+    def get_in_word_penalty(self, trange: TokenRange) -> float:
+        """ Gets the sum of penalties for all token transitions
+            within the given range.
+        """
+        return sum(n.word_continuation_penalty
+                   for n in self.nodes[trange.start + 1:trange.end])
+
     def iter_partials(self, trange: TokenRange) -> Iterator[Token]:
         """ Iterate over the partial tokens between the given nodes.
             Missing partials are ignored.

@@ -386,17 +400,14 @@
         """
         return ''.join(''.join((n.term_lookup, n.btype)) for n in self.nodes)
 
-    def extract_words(self, base_penalty: float = 0.0,
-                      start: int = 0,
+    def extract_words(self, start: int = 0,
                       endpos: Optional[int] = None) -> Dict[str, List[TokenRange]]:
         """ Add all combinations of words that can be formed from the terms
             between the given start and endnode. The terms are joined with
             spaces for each break. Words can never go across a BREAK_PHRASE.
 
             The functions returns a dictionary of possible words with their
-            position within the query and a penalty. The penalty is computed
-            from the base_penalty plus the penalty for each node the word
-            crosses.
+            position within the query.
         """
         if endpos is None:
             endpos = len(self.nodes)

@@ -405,16 +416,13 @@
 
         for first, first_node in enumerate(self.nodes[start + 1:endpos], start):
             word = first_node.term_lookup
-            penalty = base_penalty
-            words[word].append(TokenRange(first, first + 1, penalty=penalty))
+            words[word].append(TokenRange(first, first + 1))
             if first_node.btype != BREAK_PHRASE:
-                penalty += first_node.penalty
                 max_last = min(first + 20, endpos)
                 for last, last_node in enumerate(self.nodes[first + 2:max_last], first + 2):
                     word = ' '.join((word, last_node.term_lookup))
-                    words[word].append(TokenRange(first, last, penalty=penalty))
+                    words[word].append(TokenRange(first, last))
                     if last_node.btype == BREAK_PHRASE:
                         break
-                    penalty += last_node.penalty
 
         return words
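With the penalty bookkeeping removed from `extract_words()`, the method only enumerates term combinations: terms are joined with spaces and a word never extends across a BREAK_PHRASE, though it may end at one (the real method also caps how many terms may be joined). A standalone model of that enumeration, using plain term lists instead of query nodes, produces exactly the word set asserted in the test further down.

```python
from collections import defaultdict

def extract_words(terms, phrase_break_after):
    """ Enumerate the words that can be formed from consecutive terms.
        terms: transliterated terms in query order.
        phrase_break_after: indexes of terms followed by a BREAK_PHRASE.
        Returns a dict mapping each word to its (start, end) positions.
    """
    words = defaultdict(list)
    for first, term in enumerate(terms):
        word = term
        words[word].append((first, first + 1))
        if first in phrase_break_after:
            continue                      # cannot join across the phrase break
        for last in range(first + 1, len(terms)):
            word = ' '.join((word, terms[last]))
            words[word].append((first, last + 1))
            if last in phrase_break_after:
                break                     # word may end at, but not pass, the break
    return dict(words)

# Same terms as in the test below: '12' and 'ab' belong to one phrase,
# the second '12' ends it, 'hallo' starts a new one.
print(extract_words(['12', 'ab', '12', 'hallo'], phrase_break_after={2}))
# {'12': [(0, 1), (2, 3)], '12 ab': [(0, 2)], '12 ab 12': [(0, 3)],
#  'ab': [(1, 2)], 'ab 12': [(1, 3)], 'hallo': [(3, 4)]}
```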
@@ -23,16 +23,6 @@ class TypedRange:
     trange: qmod.TokenRange
 
 
-PENALTY_TOKENCHANGE = {
-    qmod.BREAK_START: 0.0,
-    qmod.BREAK_END: 0.0,
-    qmod.BREAK_PHRASE: 0.0,
-    qmod.BREAK_SOFT_PHRASE: 0.0,
-    qmod.BREAK_WORD: 0.1,
-    qmod.BREAK_PART: 0.2,
-    qmod.BREAK_TOKEN: 0.4
-}
-
 TypedRangeSeq = List[TypedRange]
 
 

@@ -192,7 +182,7 @@ class _TokenSequence:
         return None
 
     def advance(self, ttype: qmod.TokenType, end_pos: int,
-                btype: qmod.BreakType) -> Optional['_TokenSequence']:
+                force_break: bool, break_penalty: float) -> Optional['_TokenSequence']:
         """ Return a new token sequence state with the given token type
            extended.
         """

@@ -205,7 +195,7 @@
             new_penalty = 0.0
         else:
             last = self.seq[-1]
-            if btype != qmod.BREAK_PHRASE and last.ttype == ttype:
+            if not force_break and last.ttype == ttype:
                 # extend the existing range
                 newseq = self.seq[:-1] + [TypedRange(ttype, last.trange.replace_end(end_pos))]
                 new_penalty = 0.0

@@ -213,7 +203,7 @@
                 # start a new range
                 newseq = list(self.seq) + [TypedRange(ttype,
                                                       qmod.TokenRange(last.trange.end, end_pos))]
-                new_penalty = PENALTY_TOKENCHANGE[btype]
+                new_penalty = break_penalty
 
         return _TokenSequence(newseq, newdir, self.penalty + new_penalty)
 

@@ -317,7 +307,7 @@
                 name, addr = first.split(i)
                 log().comment(f'split first word = name ({i - first.start})')
                 yield dataclasses.replace(base, name=name, address=[addr] + base.address[1:],
-                                          penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
+                                          penalty=penalty + query.nodes[i].word_break_penalty)
 
     def _get_assignments_address_backward(self, base: TokenAssignment,
                                           query: qmod.QueryStruct) -> Iterator[TokenAssignment]:

@@ -362,7 +352,7 @@
                 addr, name = last.split(i)
                 log().comment(f'split last word = name ({i - last.start})')
                 yield dataclasses.replace(base, name=name, address=base.address[:-1] + [addr],
-                                          penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
+                                          penalty=penalty + query.nodes[i].word_break_penalty)
 
     def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
         """ Yield possible assignments for the current sequence.

@@ -422,12 +412,15 @@ def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment
         for tlist in node.starting:
             yield from _append_state_to_todo(
                 query, todo,
-                state.advance(tlist.ttype, tlist.end, node.btype))
+                state.advance(tlist.ttype, tlist.end,
+                              True, node.word_break_penalty))
 
         if node.partial is not None:
             yield from _append_state_to_todo(
                 query, todo,
-                state.advance(qmod.TOKEN_PARTIAL, state.end_pos + 1, node.btype))
+                state.advance(qmod.TOKEN_PARTIAL, state.end_pos + 1,
+                              node.btype == qmod.BREAK_PHRASE,
+                              node.word_break_penalty))
 
 
 def _append_state_to_todo(query: qmod.QueryStruct, todo: List[_TokenSequence],
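In the reworked `advance()`, a sequence only pays a break penalty when a new typed range has to start, that is, when the token type changes or a forced break (a phrase boundary) separates the tokens; extending a range of the same type is free. The toy model below mirrors that accumulation rule with hypothetical token types and penalties; it is not the _TokenSequence class itself.

```python
def assignment_penalty(tokens):
    """ tokens: list of (token_type, force_break_before, break_penalty_before).
        The first token and same-type continuations are free; a new range
        pays the break penalty of the node in front of it.
    """
    penalty = 0.0
    prev_type = None
    for ttype, force_break, break_penalty in tokens:
        if prev_type is not None and (force_break or ttype != prev_type):
            penalty += break_penalty
        prev_type = ttype
    return penalty

# 'main st 12' with ordinary word breaks (word_break_penalty 0.1):
# the switch from PARTIAL to HOUSENUMBER starts a new range and costs 0.1.
print(assignment_penalty([('PARTIAL', False, 0.0),
                          ('PARTIAL', False, 0.1),
                          ('HOUSENUMBER', False, 0.1)]))   # 0.1

# Same query with a phrase break before the house number: the break is forced,
# but a phrase break has word_break_penalty 0.0, so the switch is free.
print(assignment_penalty([('PARTIAL', False, 0.0),
                          ('PARTIAL', False, 0.1),
                          ('HOUSENUMBER', True, 0.0)]))    # 0.0
```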
@@ -21,6 +21,7 @@ Feature: Simple Tests
             | %#$@*&l;der#$! |
             | 234.23.14.5 |
             | aussenstelle universitat lichtenstein wachterhaus aussenstelle universitat lichtenstein wachterhaus aussenstelle universitat lichtenstein wachterhaus aussenstelle universitat lichtenstein wachterhaus |
+            | . |
 
     Scenario: Empty XML search
         When sending v1/search with format xml
@@ -68,7 +68,7 @@ def mk_query(inp):
     phrase_split = re.split(r"([ ,:'-])", inp)
 
     for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue='>'):
-        query.add_node(breakchar, PHRASE_ANY, 0.1, word, word)
+        query.add_node(breakchar, PHRASE_ANY, word, word)
 
     return query
 

@@ -153,9 +153,9 @@ def test_postcode_inside_postcode_phrase(pc_config):
 
     query = QueryStruct([])
     query.nodes[-1].ptype = PHRASE_STREET
-    query.add_node(',', PHRASE_STREET, 0.1, '12345', '12345')
-    query.add_node(',', PHRASE_POSTCODE, 0.1, 'xz', 'xz')
-    query.add_node('>', PHRASE_POSTCODE, 0.1, '4444', '4444')
+    query.add_node(',', PHRASE_STREET, '12345', '12345')
+    query.add_node(',', PHRASE_POSTCODE, 'xz', 'xz')
+    query.add_node('>', PHRASE_POSTCODE, '4444', '4444')
 
     assert parser.parse(query) == {(2, 3, '4444')}
 

@@ -165,7 +165,7 @@ def test_partial_postcode_in_postcode_phrase(pc_config):
 
     query = QueryStruct([])
     query.nodes[-1].ptype = PHRASE_POSTCODE
-    query.add_node(' ', PHRASE_POSTCODE, 0.1, '2224', '2224')
-    query.add_node('>', PHRASE_POSTCODE, 0.1, '12345', '12345')
+    query.add_node(' ', PHRASE_POSTCODE, '2224', '2224')
+    query.add_node('>', PHRASE_POSTCODE, '12345', '12345')
 
     assert not parser.parse(query)
@@ -51,15 +51,15 @@ def test_token_range_unimplemented_ops():
 
 def test_query_extract_words():
     q = nq.QueryStruct([])
-    q.add_node(nq.BREAK_WORD, nq.PHRASE_ANY, 0.1, '12', '')
-    q.add_node(nq.BREAK_TOKEN, nq.PHRASE_ANY, 0.0, 'ab', '')
-    q.add_node(nq.BREAK_PHRASE, nq.PHRASE_ANY, 0.0, '12', '')
-    q.add_node(nq.BREAK_END, nq.PHRASE_ANY, 0.5, 'hallo', '')
+    q.add_node(nq.BREAK_WORD, nq.PHRASE_ANY, '12', '')
+    q.add_node(nq.BREAK_TOKEN, nq.PHRASE_ANY, 'ab', '')
+    q.add_node(nq.BREAK_PHRASE, nq.PHRASE_ANY, '12', '')
+    q.add_node(nq.BREAK_END, nq.PHRASE_ANY, 'hallo', '')
 
-    words = q.extract_words(base_penalty=1.0)
+    words = q.extract_words()
 
     assert set(words.keys()) \
         == {'12', 'ab', 'hallo', '12 ab', 'ab 12', '12 ab 12'}
-    assert sorted(words['12']) == [nq.TokenRange(0, 1, 1.0), nq.TokenRange(2, 3, 1.0)]
-    assert words['12 ab'] == [nq.TokenRange(0, 2, 1.1)]
-    assert words['hallo'] == [nq.TokenRange(3, 4, 1.0)]
+    assert sorted(words['12']) == [nq.TokenRange(0, 1), nq.TokenRange(2, 3)]
+    assert words['12 ab'] == [nq.TokenRange(0, 2)]
+    assert words['hallo'] == [nq.TokenRange(3, 4)]
@@ -12,8 +12,8 @@ import pytest
 from nominatim_api.search.query import QueryStruct, Phrase, TokenRange, Token
 import nominatim_api.search.query as qmod
 from nominatim_api.search.token_assignment import (yield_token_assignments,
-                                                   TokenAssignment,
-                                                   PENALTY_TOKENCHANGE)
+                                                   TokenAssignment)
+from nominatim_api.search.icu_tokenizer import PENALTY_BREAK
 
 
 class MyToken(Token):

@@ -28,6 +28,7 @@ def make_query(*args):
 
     for btype, ptype, _ in args[1:]:
        q.add_node(btype, ptype)
+       q.nodes[-1].penalty = PENALTY_BREAK[btype]
     q.add_node(qmod.BREAK_END, qmod.PHRASE_ANY)
 
     for start, t in enumerate(args):

@@ -94,7 +95,7 @@ def test_multiple_simple_words(btype):
                   (btype, qmod.PHRASE_ANY, [(2, qmod.TOKEN_PARTIAL)]),
                   (btype, qmod.PHRASE_ANY, [(3, qmod.TOKEN_PARTIAL)]))
 
-    penalty = PENALTY_TOKENCHANGE[btype]
+    penalty = PENALTY_BREAK[btype]
 
     check_assignments(yield_token_assignments(q),
                       TokenAssignment(name=TokenRange(0, 3)),