"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import functools
import io
import itertools
import json
import logging
import re
from textwrap import dedent
from pathlib import Path

from icu import Transliterator
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.sql_preprocessor import SQLPreprocessor

# Names of the database properties under which the tokenizer configuration
# is looked up.
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TRANSLITERATION = "tokenizer_transliteration"
DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    # NOTE(review): init_new_db() calls _install_php(), _save_config() and
    # _init_db_tables(). Their definitions are truncated in the reviewed
    # source and are therefore not reproduced in this block; restore them
    # from the project history.

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        # Filled in by init_new_db() or init_from_project().
        self.normalization = None
        self.transliteration = None
        self.abbreviations = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.

            The rules are read from the file named by
            `config.TOKENIZER_CONFIG` or, when that is empty, from
            'legacy_icu_tokenizer.json' in the configuration directory.
            When `init_db` is false, the SQL functions and database tables
            are left untouched.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'

        # JSON files are UTF-8; do not depend on the platform's default
        # encoding when reading rules that contain non-ASCII characters.
        rules = json.loads(cfgfile.read_text(encoding='utf-8'))
        self.transliteration = ';'.join(rules['normalization']) + ';'
        self.abbreviations = rules["abbreviations"]
        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.

            Loads normalization, transliteration and abbreviation settings
            from the database properties. A setting that is not available
            is left as None so that check_database() can report it.
        """
        with connect(self.dsn) as conn:
            self.normalization = get_property(conn, DBCFG_NORMALIZATION)
            self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
            abbreviations = get_property(conn, DBCFG_ABBREVIATIONS)

        # json.loads() raises a TypeError on None, which would turn a
        # missing property into a crash instead of a reportable state.
        if abbreviations is None:
            self.abbreviations = None
        else:
            self.abbreviations = json.loads(abbreviations)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.

            Returns an error message when part of the configuration is
            missing and None otherwise.
        """
        self.init_from_project()

        if self.normalization is None\
           or self.transliteration is None\
           or self.abbreviations is None:
            return "Configuration for tokenizer 'legacy_icu' are missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        norm = Transliterator.createFromRules("normalizer", self.normalization)
        trans = Transliterator.createFromRules("trans", self.transliteration)
        return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
""" abbr_inverse = list(zip(*self.abbreviations)) php_file = self.data_dir / "tokenizer.php" php_file.write_text(dedent("""\ 1 or ',' in hnrs[0] or ';' in hnrs[0]: # split numbers if necessary simple_list = [] for hnr in hnrs: simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr))) if len(simple_list) > 1: hnrs = list(set(simple_list)) else: hnrs = simple_list return hnrs class _TokenInfo: """ Collect token information to be sent back to the database. """ def __init__(self, cache): self.cache = cache self.data = {} @staticmethod def _mk_array(tokens): return '{%s}' % ','.join((str(s) for s in tokens)) def add_names(self, conn, names): """ Adds token information for the normalised names. """ # Start with all partial names terms = set((part for ns in names for part in ns.split())) # Add partials for the full terms (TO BE REMOVED) terms.update((n for n in names)) # Add the full names terms.update((' ' + n for n in names)) self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms)) def add_housenumbers(self, conn, hnrs): """ Extract housenumber information from a list of normalised housenumbers. """ self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs)) self.data['hnr'] = ';'.join(hnrs) def add_street(self, conn, street): """ Add addr:street match terms. """ if not street: return term = ' ' + street tid = self.cache.names.get(term) if tid is None: with conn.cursor() as cur: cur.execute("""SELECT word_id FROM word WHERE word_token = %s and class is null and type is null""", (term, )) if cur.rowcount > 0: tid = cur.fetchone()[0] self.cache.names[term] = tid if tid is not None: self.data['street'] = '{%d}' % tid def add_place(self, conn, place): """ Add addr:place search and match terms. 
""" if not place: return partial_ids = self.cache.get_term_tokens(conn, place.split()) tid = self.cache.get_term_tokens(conn, [' ' + place]) self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid)) self.data['place_match'] = '{%s}' % tid[0] def add_address_terms(self, conn, terms): """ Add additional address terms. """ tokens = {} for key, value in terms: if not value: continue partial_ids = self.cache.get_term_tokens(conn, value.split()) term = ' ' + value tid = self.cache.names.get(term) if tid is None: with conn.cursor() as cur: cur.execute("""SELECT word_id FROM word WHERE word_token = %s and class is null and type is null""", (term, )) if cur.rowcount > 0: tid = cur.fetchone()[0] self.cache.names[term] = tid tokens[key] = [self._mk_array(partial_ids), '{%s}' % ('' if tid is None else str(tid))] if tokens: self.data['addr'] = tokens class _TokenCache: """ Cache for token information to avoid repeated database queries. This cache is not thread-safe and needs to be instantiated per analyzer. """ def __init__(self): self.names = {} self.postcodes = set() self.housenumbers = {} def get_term_tokens(self, conn, terms): """ Get token ids for a list of terms, looking them up in the database if necessary. """ tokens = [] askdb = [] for term in terms: token = self.names.get(term) if token is None: askdb.append(term) elif token != 0: tokens.append(token) if askdb: with conn.cursor() as cur: cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term", (askdb, )) for term, tid in cur: self.names[term] = tid if tid != 0: tokens.append(tid) return tokens def get_hnr_tokens(self, conn, terms): """ Get token ids for a list of housenumbers, looking them up in the database if necessary. 
""" tokens = [] askdb = [] for term in terms: token = self.housenumbers.get(term) if token is None: askdb.append(term) else: tokens.append(token) if askdb: with conn.cursor() as cur: cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr", (askdb, )) for term, tid in cur: self.housenumbers[term] = tid tokens.append(tid) return tokens