mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-16 15:47:58 +00:00
add address counts to tokens
This commit is contained in:
@@ -97,6 +97,7 @@ class ICUToken(qmod.Token):
|
|||||||
""" Create a ICUToken from the row of the word table.
|
""" Create a ICUToken from the row of the word table.
|
||||||
"""
|
"""
|
||||||
count = 1 if row.info is None else row.info.get('count', 1)
|
count = 1 if row.info is None else row.info.get('count', 1)
|
||||||
|
addr_count = 1 if row.info is None else row.info.get('addr_count', 1)
|
||||||
|
|
||||||
penalty = 0.0
|
penalty = 0.0
|
||||||
if row.type == 'w':
|
if row.type == 'w':
|
||||||
@@ -123,7 +124,8 @@ class ICUToken(qmod.Token):
|
|||||||
|
|
||||||
return ICUToken(penalty=penalty, token=row.word_id, count=count,
|
return ICUToken(penalty=penalty, token=row.word_id, count=count,
|
||||||
lookup_word=lookup_word, is_indexed=True,
|
lookup_word=lookup_word, is_indexed=True,
|
||||||
word_token=row.word_token, info=row.info)
|
word_token=row.word_token, info=row.info,
|
||||||
|
addr_count=addr_count)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -257,7 +259,7 @@ class ICUQueryAnalyzer(AbstractQueryAnalyzer):
|
|||||||
if len(part.token) <= 4 and part[0].isdigit()\
|
if len(part.token) <= 4 and part[0].isdigit()\
|
||||||
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
|
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
|
||||||
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
|
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
|
||||||
ICUToken(0.5, 0, 1, part.token, True, part.token, None))
|
ICUToken(0.5, 0, 1, 1, part.token, True, part.token, None))
|
||||||
|
|
||||||
|
|
||||||
def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
|
def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
|
||||||
|
|||||||
@@ -210,6 +210,7 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
|
|||||||
|
|
||||||
return LegacyToken(penalty=penalty, token=row.word_id,
|
return LegacyToken(penalty=penalty, token=row.word_id,
|
||||||
count=row.search_name_count or 1,
|
count=row.search_name_count or 1,
|
||||||
|
addr_count=1, # not supported
|
||||||
lookup_word=lookup_word,
|
lookup_word=lookup_word,
|
||||||
word_token=row.word_token.strip(),
|
word_token=row.word_token.strip(),
|
||||||
category=(rowclass, row.type) if rowclass is not None else None,
|
category=(rowclass, row.type) if rowclass is not None else None,
|
||||||
@@ -226,7 +227,7 @@ class LegacyQueryAnalyzer(AbstractQueryAnalyzer):
|
|||||||
if len(part) <= 4 and part.isdigit()\
|
if len(part) <= 4 and part.isdigit()\
|
||||||
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
|
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
|
||||||
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
|
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
|
||||||
LegacyToken(penalty=0.5, token=0, count=1,
|
LegacyToken(penalty=0.5, token=0, count=1, addr_count=1,
|
||||||
lookup_word=part, word_token=part,
|
lookup_word=part, word_token=part,
|
||||||
category=None, country=None,
|
category=None, country=None,
|
||||||
operator=None, is_indexed=True))
|
operator=None, is_indexed=True))
|
||||||
|
|||||||
@@ -99,10 +99,10 @@ class Token(ABC):
|
|||||||
penalty: float
|
penalty: float
|
||||||
token: int
|
token: int
|
||||||
count: int
|
count: int
|
||||||
|
addr_count: int
|
||||||
lookup_word: str
|
lookup_word: str
|
||||||
is_indexed: bool
|
is_indexed: bool
|
||||||
|
|
||||||
addr_count: int = 1
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def get_category(self) -> Tuple[str, str]:
|
def get_category(self) -> Tuple[str, str]:
|
||||||
|
|||||||
@@ -201,7 +201,7 @@ class AbstractTokenizer(ABC):
|
|||||||
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def update_statistics(self, config: Configuration) -> None:
|
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
|
||||||
""" Recompute any tokenizer statistics necessary for efficient lookup.
|
""" Recompute any tokenizer statistics necessary for efficient lookup.
|
||||||
This function is meant to be called from time to time by the user
|
This function is meant to be called from time to time by the user
|
||||||
to improve performance. However, the tokenizer must not depend on
|
to improve performance. However, the tokenizer must not depend on
|
||||||
|
|||||||
@@ -210,7 +210,7 @@ class LegacyTokenizer(AbstractTokenizer):
|
|||||||
self._save_config(conn, config)
|
self._save_config(conn, config)
|
||||||
|
|
||||||
|
|
||||||
def update_statistics(self, _: Configuration) -> None:
|
def update_statistics(self, config: Configuration, threads: int = 1) -> None:
|
||||||
""" Recompute the frequency of full words.
|
""" Recompute the frequency of full words.
|
||||||
"""
|
"""
|
||||||
with connect(self.dsn) as conn:
|
with connect(self.dsn) as conn:
|
||||||
|
|||||||
@@ -18,7 +18,8 @@ class MyToken(query.Token):
|
|||||||
|
|
||||||
|
|
||||||
def mktoken(tid: int):
|
def mktoken(tid: int):
|
||||||
return MyToken(3.0, tid, 1, 'foo', True)
|
return MyToken(penalty=3.0, token=tid, count=1, addr_count=1,
|
||||||
|
lookup_word='foo', is_indexed=True)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('ptype,ttype', [('NONE', 'WORD'),
|
@pytest.mark.parametrize('ptype,ttype', [('NONE', 'WORD'),
|
||||||
|
|||||||
@@ -31,7 +31,9 @@ def make_query(*args):
|
|||||||
for end, ttype, tinfo in tlist:
|
for end, ttype, tinfo in tlist:
|
||||||
for tid, word in tinfo:
|
for tid, word in tinfo:
|
||||||
q.add_token(TokenRange(start, end), ttype,
|
q.add_token(TokenRange(start, end), ttype,
|
||||||
MyToken(0.5 if ttype == TokenType.PARTIAL else 0.0, tid, 1, word, True))
|
MyToken(penalty=0.5 if ttype == TokenType.PARTIAL else 0.0,
|
||||||
|
token=tid, count=1, addr_count=1,
|
||||||
|
lookup_word=word, is_indexed=True))
|
||||||
|
|
||||||
|
|
||||||
return q
|
return q
|
||||||
@@ -395,14 +397,14 @@ def make_counted_searches(name_part, name_full, address_part, address_full,
|
|||||||
q.add_node(BreakType.END, PhraseType.NONE)
|
q.add_node(BreakType.END, PhraseType.NONE)
|
||||||
|
|
||||||
q.add_token(TokenRange(0, 1), TokenType.PARTIAL,
|
q.add_token(TokenRange(0, 1), TokenType.PARTIAL,
|
||||||
MyToken(0.5, 1, name_part, 'name_part', True))
|
MyToken(0.5, 1, name_part, 1, 'name_part', True))
|
||||||
q.add_token(TokenRange(0, 1), TokenType.WORD,
|
q.add_token(TokenRange(0, 1), TokenType.WORD,
|
||||||
MyToken(0, 101, name_full, 'name_full', True))
|
MyToken(0, 101, name_full, 1, 'name_full', True))
|
||||||
for i in range(num_address_parts):
|
for i in range(num_address_parts):
|
||||||
q.add_token(TokenRange(i + 1, i + 2), TokenType.PARTIAL,
|
q.add_token(TokenRange(i + 1, i + 2), TokenType.PARTIAL,
|
||||||
MyToken(0.5, 2, address_part, 'address_part', True))
|
MyToken(0.5, 2, address_part, 1, 'address_part', True))
|
||||||
q.add_token(TokenRange(i + 1, i + 2), TokenType.WORD,
|
q.add_token(TokenRange(i + 1, i + 2), TokenType.WORD,
|
||||||
MyToken(0, 102, address_full, 'address_full', True))
|
MyToken(0, 102, address_full, 1, 'address_full', True))
|
||||||
|
|
||||||
builder = SearchBuilder(q, SearchDetails())
|
builder = SearchBuilder(q, SearchDetails())
|
||||||
|
|
||||||
|
|||||||
@@ -19,7 +19,8 @@ class MyToken(Token):
|
|||||||
|
|
||||||
def make_query(*args):
|
def make_query(*args):
|
||||||
q = QueryStruct([Phrase(args[0][1], '')])
|
q = QueryStruct([Phrase(args[0][1], '')])
|
||||||
dummy = MyToken(3.0, 45, 1, 'foo', True)
|
dummy = MyToken(penalty=3.0, token=45, count=1, addr_count=1,
|
||||||
|
lookup_word='foo', is_indexed=True)
|
||||||
|
|
||||||
for btype, ptype, _ in args[1:]:
|
for btype, ptype, _ in args[1:]:
|
||||||
q.add_node(btype, ptype)
|
q.add_node(btype, ptype)
|
||||||
|
|||||||
@@ -32,16 +32,16 @@ class DummyTokenizer:
|
|||||||
self.update_statistics_called = False
|
self.update_statistics_called = False
|
||||||
self.update_word_tokens_called = False
|
self.update_word_tokens_called = False
|
||||||
|
|
||||||
def update_sql_functions(self, *args):
|
def update_sql_functions(self, *args, **kwargs):
|
||||||
self.update_sql_functions_called = True
|
self.update_sql_functions_called = True
|
||||||
|
|
||||||
def finalize_import(self, *args):
|
def finalize_import(self, *args, **kwargs):
|
||||||
self.finalize_import_called = True
|
self.finalize_import_called = True
|
||||||
|
|
||||||
def update_statistics(self, *args):
|
def update_statistics(self, *args, **kwargs):
|
||||||
self.update_statistics_called = True
|
self.update_statistics_called = True
|
||||||
|
|
||||||
def update_word_tokens(self, *args):
|
def update_word_tokens(self, *args, **kwargs):
|
||||||
self.update_word_tokens_called = True
|
self.update_word_tokens_called = True
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -227,16 +227,20 @@ def test_update_statistics_reverse_only(word_table, tokenizer_factory, test_conf
|
|||||||
def test_update_statistics(word_table, table_factory, temp_db_cursor,
|
def test_update_statistics(word_table, table_factory, temp_db_cursor,
|
||||||
tokenizer_factory, test_config):
|
tokenizer_factory, test_config):
|
||||||
word_table.add_full_word(1000, 'hello')
|
word_table.add_full_word(1000, 'hello')
|
||||||
|
word_table.add_full_word(1001, 'bye')
|
||||||
table_factory('search_name',
|
table_factory('search_name',
|
||||||
'place_id BIGINT, name_vector INT[]',
|
'place_id BIGINT, name_vector INT[], nameaddress_vector INT[]',
|
||||||
[(12, [1000])])
|
[(12, [1000], [1001])])
|
||||||
tok = tokenizer_factory()
|
tok = tokenizer_factory()
|
||||||
|
|
||||||
tok.update_statistics(test_config)
|
tok.update_statistics(test_config)
|
||||||
|
|
||||||
assert temp_db_cursor.scalar("""SELECT count(*) FROM word
|
assert temp_db_cursor.scalar("""SELECT count(*) FROM word
|
||||||
WHERE type = 'W' and
|
WHERE type = 'W' and word_id = 1000 and
|
||||||
(info->>'count')::int > 0""") > 0
|
(info->>'count')::int > 0""") == 1
|
||||||
|
assert temp_db_cursor.scalar("""SELECT count(*) FROM word
|
||||||
|
WHERE type = 'W' and word_id = 1001 and
|
||||||
|
(info->>'addr_count')::int > 0""") == 1
|
||||||
|
|
||||||
|
|
||||||
def test_normalize_postcode(analyzer):
|
def test_normalize_postcode(analyzer):
|
||||||
|
|||||||
Reference in New Issue
Block a user