add full tokens to addresses

This is now needed to weigh results.
This commit is contained in:
Sarah Hoffmann
2024-04-19 17:52:22 +02:00
parent d2bf986eae
commit 8f3845660f
2 changed files with 10 additions and 38 deletions

View File

@@ -712,10 +712,11 @@ class ICUNameAnalyzer(AbstractAnalyzer):
             token_info.add_street(self._retrieve_full_tokens(item.name))
         elif item.kind == 'place':
             if not item.suffix:
-                token_info.add_place(self._compute_partial_tokens(item.name))
+                token_info.add_place(itertools.chain(*self._compute_name_tokens([item])))
         elif not item.kind.startswith('_') and not item.suffix and \
              item.kind not in ('country', 'full', 'inclusion'):
-            token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
+            token_info.add_address_term(item.kind,
+                                        itertools.chain(*self._compute_name_tokens([item])))
     def _compute_housenumber_token(self, hnr: PlaceName) -> Tuple[Optional[int], Optional[str]]:
@@ -756,36 +757,6 @@ class ICUNameAnalyzer(AbstractAnalyzer):
         return result
-    def _compute_partial_tokens(self, name: str) -> List[int]:
-        """ Normalize the given term, split it into partial words and return
-            then token list for them.
-        """
-        assert self.conn is not None
-        norm_name = self._search_normalized(name)
-        tokens = []
-        need_lookup = []
-        for partial in norm_name.split():
-            token = self._cache.partials.get(partial)
-            if token:
-                tokens.append(token)
-            else:
-                need_lookup.append(partial)
-        if need_lookup:
-            with self.conn.cursor() as cur:
-                cur.execute("""SELECT word, getorcreate_partial_word(word)
-                               FROM unnest(%s) word""",
-                            (need_lookup, ))
-                for partial, token in cur:
-                    assert token is not None
-                    tokens.append(token)
-                    self._cache.partials[partial] = token
-        return tokens
     def _retrieve_full_tokens(self, name: str) -> List[int]:
         """ Get the full name token for the given name, if it exists.
             The name is only retrieved for the standard analyser.
@@ -957,8 +928,9 @@ class _TokenInfo:
     def add_address_term(self, key: str, partials: Iterable[int]) -> None:
         """ Add additional address terms.
         """
-        if partials:
-            self.address_tokens[key] = self._mk_array(partials)
+        array = self._mk_array(partials)
+        if len(array) > 2:
+            self.address_tokens[key] = array
     def set_postcode(self, postcode: Optional[str]) -> None:
         """ Set the postcode to the given one.

View File

@@ -554,7 +554,7 @@ class TestPlaceAddress:
     def test_process_place_place(self):
         info = self.process_address(place='Honu Lulu')

-        assert eval(info['place']) == self.name_token_set('HONU', 'LULU')
+        assert eval(info['place']) == self.name_token_set('HONU', 'LULU', '#HONU LULU')

     def test_process_place_place_extra(self):
@@ -574,8 +574,8 @@ class TestPlaceAddress:
                                     suburb='Zwickau', street='Hauptstr',
                                     full='right behind the church')

-        city = self.name_token_set('ZWICKAU')
-        state = self.name_token_set('SACHSEN')
+        city = self.name_token_set('ZWICKAU', '#ZWICKAU')
+        state = self.name_token_set('SACHSEN', '#SACHSEN')

         result = {k: eval(v) for k,v in info['addr'].items()}
@@ -587,7 +587,7 @@ class TestPlaceAddress:
         result = {k: eval(v) for k,v in info['addr'].items()}

-        assert result == {'city': self.name_token_set('Bruxelles')}
+        assert result == {'city': self.name_token_set('Bruxelles', '#Bruxelles')}

     def test_process_place_address_terms_empty(self):