move abbreviation computation into import phase

This adds precomputation of abbreviated terms for names during import
and removes the abbreviation of terms at query time. Basic import works
but still needs thorough testing as well as speed improvements during
import.

Adds a new dependency on the Python library datrie.
This commit is contained in:
Sarah Hoffmann
2021-05-28 22:06:13 +02:00
parent 6ba00e6aee
commit 8413075249
10 changed files with 665 additions and 206 deletions

View File

@@ -18,11 +18,11 @@ import psycopg2.extras
from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TRANSLITERATION = "tokenizer_transliteration"
DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
LOG = logging.getLogger()
@@ -41,9 +41,9 @@ class LegacyICUTokenizer:
def __init__(self, dsn, data_dir):
self.dsn = dsn
self.data_dir = data_dir
self.normalization = None
self.transliteration = None
self.abbreviations = None
self.naming_rules = None
self.term_normalization = None
self.max_word_frequency = None
def init_new_db(self, config, init_db=True):
@@ -55,14 +55,14 @@ class LegacyICUTokenizer:
if config.TOKENIZER_CONFIG:
cfgfile = Path(config.TOKENIZER_CONFIG)
else:
cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
rules = json.loads(cfgfile.read_text())
self._load_transliteration(rules['normalization'], cfgfile.parent)
self.abbreviations = rules["abbreviations"]
self.normalization = config.TERM_NORMALIZATION
loader = ICURuleLoader(cfgfile)
self.naming_rules = ICUNameProcessorRules(loader=loader)
self.term_normalization = config.TERM_NORMALIZATION
self.max_word_frequency = config.MAX_WORD_FREQUENCY
self._install_php(config)
self._install_php(config.lib_dir.php)
self._save_config(config)
if init_db:
@@ -70,19 +70,13 @@ class LegacyICUTokenizer:
self._init_db_tables(config)
def _load_transliteration(self, rules, cfg_path):
if isinstance(rules, str):
self.transliteration = (cfg_path / rules).read_text().replace('\n', ' ')
else:
self.transliteration = ';'.join(rules) + ';'
def init_from_project(self):
""" Initialise the tokenizer from the project directory.
"""
with connect(self.dsn) as conn:
self.normalization = get_property(conn, DBCFG_NORMALIZATION)
self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
self.naming_rules = ICUNameProcessorRules(conn=conn)
self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
def finalize_import(self, config):
@@ -132,26 +126,20 @@ class LegacyICUTokenizer:
Analyzers are not thread-safe. You need to instantiate one per thread.
"""
norm = Transliterator.createFromRules("normalizer", self.normalization)
trans = Transliterator.createFromRules("trans", self.transliteration)
return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
def _install_php(self, config):
def _install_php(self, phpdir):
""" Install the php script for the tokenizer.
"""
abbr_inverse = list(zip(*self.abbreviations))
php_file = self.data_dir / "tokenizer.php"
php_file.write_text(dedent("""\
<?php
@define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
@define('CONST_Term_Normalization_Rules', "{0.normalization}");
@define('CONST_Transliteration', "{0.transliteration}");
@define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
""".format(self, config,
"','".join(abbr_inverse[0]),
"','".join(abbr_inverse[1]))))
@define('CONST_Max_Word_Frequency', {0.max_word_frequency});
@define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
@define('CONST_Transliteration', "{0.naming_rules.search_rules}");
require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
""".format(self, phpdir)))
def _save_config(self, config):
@@ -159,10 +147,10 @@ class LegacyICUTokenizer:
database as database properties.
"""
with connect(self.dsn) as conn:
set_property(conn, DBCFG_NORMALIZATION, self.normalization)
self.naming_rules.save_rules(conn)
set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
def _init_db_tables(self, config):
@@ -178,15 +166,14 @@ class LegacyICUTokenizer:
# get partial words and their frequencies
words = Counter()
with self.name_analyzer() as analyzer:
with conn.cursor(name="words") as cur:
cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
name_proc = ICUNameProcessor(self.naming_rules)
with conn.cursor(name="words") as cur:
cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
for name, cnt in cur:
term = analyzer.make_standard_word(name)
if term:
for word in term.split():
words[word] += cnt
for name, cnt in cur:
for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
for term in word.split():
words[term] += cnt
# copy them back into the word table
copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
@@ -208,12 +195,10 @@ class LegacyICUNameAnalyzer:
normalization.
"""
def __init__(self, dsn, normalizer, transliterator, abbreviations):
def __init__(self, dsn, name_proc):
self.conn = connect(dsn).connection
self.conn.autocommit = True
self.normalizer = normalizer
self.transliterator = transliterator
self.abbreviations = abbreviations
self.name_processor = name_proc
self._cache = _TokenCache()
@@ -248,9 +233,9 @@ class LegacyICUNameAnalyzer:
tokens = {}
for word in words:
if word.startswith('#'):
tokens[word] = ' ' + self.make_standard_word(word[1:])
tokens[word] = ' ' + self.name_processor.get_normalized(word[1:])
else:
tokens[word] = self.make_standard_word(word)
tokens[word] = self.name_processor.get_normalized(word)
with conn.cursor() as cur:
cur.execute("""SELECT word_token, word_id
@@ -263,12 +248,6 @@ class LegacyICUNameAnalyzer:
return [(k, v, ids[v]) for k, v in tokens.items()]
def normalize(self, phrase):
""" Normalize the given phrase, i.e. remove all properties that
are irrelevant for search.
"""
return self.normalizer.transliterate(phrase)
@staticmethod
def normalize_postcode(postcode):
""" Convert the postcode to a standardized form.
@@ -279,27 +258,12 @@ class LegacyICUNameAnalyzer:
return postcode.strip().upper()
@functools.lru_cache(maxsize=1024)
def make_standard_word(self, name):
""" Create the normalised version of the input.
"""
norm = ' ' + self.transliterator.transliterate(name) + ' '
for full, abbr in self.abbreviations:
if full in norm:
norm = norm.replace(full, abbr)
return norm.strip()
def _make_standard_hnr(self, hnr):
""" Create a normalised version of a housenumber.
This function takes minor shortcuts on transliteration.
"""
if hnr.isdigit():
return hnr
return self.transliterator.transliterate(hnr)
return self.name_processor.get_search_normalized(hnr)
def update_postcodes_from_db(self):
""" Update postcode tokens in the word table from the location_postcode
@@ -325,7 +289,7 @@ class LegacyICUNameAnalyzer:
else:
copystr.write(postcode)
copystr.write('\t ')
copystr.write(self.transliterator.transliterate(postcode))
copystr.write(self.name_processor.get_search_normalized(postcode))
copystr.write('\tplace\tpostcode\t0\n')
if to_delete:
@@ -344,7 +308,7 @@ class LegacyICUNameAnalyzer:
def update_special_phrases(self, phrases, should_replace):
""" Replace the search index for special phrases with the new phrases.
"""
norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
norm_phrases = set(((self.name_processor.get_search_normalized(p[0]), p[1], p[2], p[3])
for p in phrases))
with self.conn.cursor() as cur:
@@ -362,7 +326,7 @@ class LegacyICUNameAnalyzer:
if to_add:
copystr = io.StringIO()
for word, cls, typ, oper in to_add:
term = self.make_standard_word(word)
term = self.name_processor.get_search_normalized(word)
if term:
copystr.write(word)
copystr.write('\t ')
@@ -395,15 +359,11 @@ class LegacyICUNameAnalyzer:
def add_country_names(self, country_code, names):
""" Add names for the given country to the search index.
"""
full_names = set((self.make_standard_word(n) for n in names))
full_names.discard('')
self._add_normalized_country_names(country_code, full_names)
word_tokens = set()
for name in self._compute_full_names(names):
if name:
word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
def _add_normalized_country_names(self, country_code, names):
""" Add names for the given country to the search index.
"""
word_tokens = set((' ' + name for name in names))
with self.conn.cursor() as cur:
# Get existing names
cur.execute("SELECT word_token FROM word WHERE country_code = %s",
@@ -429,14 +389,13 @@ class LegacyICUNameAnalyzer:
names = place.get('name')
if names:
full_names = self._compute_full_names(names)
fulls, partials = self._compute_name_tokens(names)
token_info.add_names(self.conn, full_names)
token_info.add_names(fulls, partials)
country_feature = place.get('country_feature')
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
self._add_normalized_country_names(country_feature.lower(),
full_names)
self.add_country_names(country_feature.lower(), names)
address = place.get('address')
@@ -449,38 +408,60 @@ class LegacyICUNameAnalyzer:
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
hnrs.append(value)
elif key == 'street':
token_info.add_street(self.conn, self.make_standard_word(value))
token_info.add_street(*self._compute_name_tokens({'name': value}))
elif key == 'place':
token_info.add_place(self.conn, self.make_standard_word(value))
token_info.add_place(*self._compute_name_tokens({'name': value}))
elif not key.startswith('_') and \
key not in ('country', 'full'):
addr_terms.append((key, self.make_standard_word(value)))
addr_terms.append((key, *self._compute_name_tokens({'name': value})))
if hnrs:
hnrs = self._split_housenumbers(hnrs)
token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
if addr_terms:
token_info.add_address_terms(self.conn, addr_terms)
token_info.add_address_terms(addr_terms)
return token_info.data
def _compute_name_tokens(self, names):
""" Computes the full name and partial name tokens for the given
dictionary of names.

The names are first expanded with _compute_full_names(). Each
resulting name is normalized and then resolved to its tokens, using
the analyzer's token cache where possible and the database otherwise.

Returns a tuple (full_tokens, partial_tokens) of two sets.
"""
full_names = self._compute_full_names(names)
full_tokens = set()
partial_tokens = set()
for name in full_names:
norm_name = self.name_processor.get_normalized(name)
# The cache is keyed by the normalized name and holds a
# (full, partial) pair; (None, None) signals a cache miss.
full, part = self._cache.names.get(norm_name, (None, None))
if full is None:
# Cache miss: compute the variants of the normalized name and hand
# both to the database function.
# NOTE(review): presumably getorcreate_full_word() returns one row
# consisting of the full-word token id and a list of partial token
# ids - confirm against the SQL definition.
variants = self.name_processor.get_variants_ascii(norm_name)
with self.conn.cursor() as cur:
cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
(norm_name, variants))
full, part = cur.fetchone()
# Remember the result so the same normalized name is not sent to the
# database again by this analyzer.
self._cache.names[norm_name] = (full, part)
full_tokens.add(full)
# 'part' must be an iterable of tokens; each element is added
# individually to the result set.
partial_tokens.update(part)
return full_tokens, partial_tokens
def _compute_full_names(self, names):
""" Return the set of all full name word ids to be used with the
given dictionary of names.
"""
full_names = set()
for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
word = self.make_standard_word(name)
if word:
full_names.add(word)
full_names.add(name.strip())
brace_split = name.split('(', 2)
if len(brace_split) > 1:
word = self.make_standard_word(brace_split[0])
if word:
full_names.add(word)
brace_idx = name.find('(')
if brace_idx >= 0:
full_names.add(name[:brace_idx].strip())
return full_names
@@ -492,7 +473,7 @@ class LegacyICUNameAnalyzer:
postcode = self.normalize_postcode(postcode)
if postcode not in self._cache.postcodes:
term = self.make_standard_word(postcode)
term = self.name_processor.get_search_normalized(postcode)
if not term:
return
@@ -508,6 +489,7 @@ class LegacyICUNameAnalyzer:
""", (' ' + term, postcode))
self._cache.postcodes.add(postcode)
@staticmethod
def _split_housenumbers(hnrs):
if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
@@ -530,7 +512,7 @@ class _TokenInfo:
""" Collect token information to be sent back to the database.
"""
def __init__(self, cache):
self.cache = cache
self._cache = cache
self.data = {}
@staticmethod
@@ -538,86 +520,44 @@ class _TokenInfo:
return '{%s}' % ','.join((str(s) for s in tokens))
def add_names(self, conn, names):
def add_names(self, fulls, partials):
""" Adds token information for the normalised names.
"""
# Start with all partial names
terms = set((part for ns in names for part in ns.split()))
# Add the full names
terms.update((' ' + n for n in names))
self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
def add_housenumbers(self, conn, hnrs):
""" Extract housenumber information from a list of normalised
housenumbers.
"""
self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
self.data['hnr'] = ';'.join(hnrs)
def add_street(self, conn, street):
def add_street(self, fulls, partials):
""" Add addr:street match terms.
"""
if not street:
return
term = ' ' + street
tid = self.cache.names.get(term)
if tid is None:
with conn.cursor() as cur:
cur.execute("""SELECT word_id FROM word
WHERE word_token = %s
and class is null and type is null""",
(term, ))
if cur.rowcount > 0:
tid = cur.fetchone()[0]
self.cache.names[term] = tid
if tid is not None:
self.data['street'] = '{%d}' % tid
if fulls:
self.data['street'] = self._mk_array(fulls)
def add_place(self, conn, place):
def add_place(self, fulls, partials):
""" Add addr:place search and match terms.
"""
if not place:
return
partial_ids = self.cache.get_term_tokens(conn, place.split())
tid = self.cache.get_term_tokens(conn, [' ' + place])
self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
self.data['place_match'] = '{%s}' % tid[0]
if fulls:
self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
self.data['place_match'] = self._mk_array(fulls)
def add_address_terms(self, conn, terms):
def add_address_terms(self, terms):
""" Add additional address terms.
"""
tokens = {}
for key, value in terms:
if not value:
continue
partial_ids = self.cache.get_term_tokens(conn, value.split())
term = ' ' + value
tid = self.cache.names.get(term)
if tid is None:
with conn.cursor() as cur:
cur.execute("""SELECT word_id FROM word
WHERE word_token = %s
and class is null and type is null""",
(term, ))
if cur.rowcount > 0:
tid = cur.fetchone()[0]
self.cache.names[term] = tid
tokens[key] = [self._mk_array(partial_ids),
'{%s}' % ('' if tid is None else str(tid))]
for key, fulls, partials in terms:
if fulls:
tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
self._mk_array(fulls)]
if tokens:
self.data['addr'] = tokens
@@ -635,32 +575,6 @@ class _TokenCache:
self.housenumbers = {}
def get_term_tokens(self, conn, terms):
""" Get token ids for a list of terms, looking them up in the database
if necessary.
"""
tokens = []
askdb = []
for term in terms:
token = self.names.get(term)
if token is None:
askdb.append(term)
elif token != 0:
tokens.append(token)
if askdb:
with conn.cursor() as cur:
cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
(askdb, ))
for term, tid in cur:
self.names[term] = tid
if tid != 0:
tokens.append(tid)
return tokens
def get_hnr_tokens(self, conn, terms):
""" Get token ids for a list of housenumbers, looking them up in the
database if necessary.