Do not clean up house numbers in reverse-only mode (skip the cleanup when the search_name table does not exist)

This commit is contained in:
Sarah Hoffmann
2022-01-20 20:21:13 +01:00
parent d8b7a51ab6
commit 3ce123ab69

View File

@@ -116,20 +116,22 @@ class LegacyICUTokenizer(AbstractTokenizer):
""" Remove unused house numbers. """ Remove unused house numbers.
""" """
with connect(self.dsn) as conn: with connect(self.dsn) as conn:
if not conn.table_exists('search_name'):
return
with conn.cursor(name="hnr_counter") as cur: with conn.cursor(name="hnr_counter") as cur:
cur.execute("""SELECT word_id, word_token FROM word cur.execute("""SELECT word_id, word_token FROM word
WHERE type = 'H' WHERE type = 'H'
AND NOT EXISTS(SELECT * FROM search_name AND NOT EXISTS(SELECT * FROM search_name
WHERE ARRAY[word.word_id] && name_vector) WHERE ARRAY[word.word_id] && name_vector)
AND (char_length(word_token) > 6 AND (char_length(word_token) > 6
OR word_token not similar to '\d+') OR word_token not similar to '\\d+')
""") """)
candidates = {token: wid for wid, token in cur} candidates = {token: wid for wid, token in cur}
with conn.cursor(name="hnr_counter") as cur: with conn.cursor(name="hnr_counter") as cur:
cur.execute("""SELECT housenumber FROM placex cur.execute("""SELECT housenumber FROM placex
WHERE housenumber is not null WHERE housenumber is not null
AND (char_length(housenumber) > 6 AND (char_length(housenumber) > 6
OR housenumber not similar to '\d+') OR housenumber not similar to '\\d+')
""") """)
for row in cur: for row in cur:
for hnr in row[0].split(';'): for hnr in row[0].split(';'):
@@ -146,9 +148,9 @@ class LegacyICUTokenizer(AbstractTokenizer):
def update_word_tokens(self): def update_word_tokens(self):
""" Remove unused tokens. """ Remove unused tokens.
""" """
LOG.warn("Cleaning up housenumber tokens.") LOG.warning("Cleaning up housenumber tokens.")
self._cleanup_housenumbers() self._cleanup_housenumbers()
LOG.warn("Tokenizer house-keeping done.") LOG.warning("Tokenizer house-keeping done.")
def name_analyzer(self): def name_analyzer(self):