forked from hans/Nominatim
move abbreviation computation into import phase
This adds precomputation of abbreviated terms for names at import time and removes the abbreviation of terms in the query. Basic import works, but it still needs thorough testing as well as speed improvements during import. This introduces a new dependency on the Python library datrie.
This commit is contained in:
111
nominatim/tokenizer/icu_name_processor.py
Normal file
111
nominatim/tokenizer/icu_name_processor.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""
|
||||
Processor for names that are imported into the database based on the
|
||||
ICU library.
|
||||
"""
|
||||
import json
|
||||
import itertools
|
||||
|
||||
from icu import Transliterator
|
||||
import datrie
|
||||
|
||||
from nominatim.db.properties import set_property, get_property
|
||||
|
||||
# Names of the database properties under which ICUNameProcessorRules
# stores its rule sets (see save_rules() and the constructor).

# ICU rules used to normalise a name.
DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
# ICU rules used to transliterate a name variant.
DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
# JSON-encoded list of replacement pairs.
DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
# ICU rules applied to terms at search time.
DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
|
||||
|
||||
|
||||
class ICUNameProcessorRules:
    """ Data object that saves the rules needed for the name processor.

        The rules can either be initialised through an ICURuleLoader or
        be loaded from a database when a connection is given.
    """

    def __init__(self, loader=None, conn=None):
        """ Load the rules from `loader` or, when no loader is given,
            from the property table reachable through `conn`.

            Raises a ValueError when neither of the two is supplied.
        """
        if loader is not None:
            self.norm_rules = loader.get_normalization_rules()
            self.trans_rules = loader.get_transliteration_rules()
            self.replacements = loader.get_replacement_pairs()
            self.search_rules = loader.get_search_rules()
        elif conn is not None:
            self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
            self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
            # The replacement pairs are stored JSON-encoded, see save_rules().
            self.replacements = json.loads(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
            self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
        else:
            # An explicit exception instead of `assert`: asserts are removed
            # when Python runs with -O, which would leave the object
            # half-initialised.
            raise ValueError("Parameter loader or conn required.")

        # Compute the set of characters used in the replacement list.
        # We need this later when computing the tree.
        chars = set()
        for full, repl in self.replacements:
            chars.update(full)
            for word in repl:
                chars.update(word)
        # Sorted, so that the same rules always yield the same string.
        self.replacement_charset = ''.join(sorted(chars))


    def save_rules(self, conn):
        """ Save the rules in the property table of the given database.
            The rules can be loaded again by handing in a connection into
            the constructor of the class.
        """
        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
        set_property(conn, DBCFG_IMPORT_REPLACEMENTS, json.dumps(self.replacements))
        set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
|
||||
|
||||
|
||||
class ICUNameProcessor:
    """ Applies the ICU rules and the replacement list of a rules object
        (such as ICUNameProcessorRules) to names.
    """

    def __init__(self, rules):
        """ Compile the transliterators from the rule strings in `rules`
            and load the replacement pairs into a prefix tree.
        """
        self.normalizer = Transliterator.createFromRules("icu_normalization",
                                                         rules.norm_rules)
        self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
                                                       rules.trans_rules)
        self.search = Transliterator.createFromRules("icu_search",
                                                     rules.search_rules)

        # The trie maps each sequence to replace to its list of replacements.
        self.replacements = datrie.Trie(rules.replacement_charset)
        for full, repl in rules.replacements:
            self.replacements[full] = repl


    def get_normalized(self, name):
        """ Normalize the given name, i.e. remove all elements not relevant
            for search.
        """
        return self.normalizer.transliterate(name)


    def get_variants_ascii(self, norm_name):
        """ Compute the spelling variants for the given normalized name
            and transliterate the result.

            The name is scanned from left to right. Wherever an entry of
            the replacement list matches (longest match first), every
            variant collected so far is combined with every replacement of
            that entry. Returns the list of transliterated variants with
            surrounding whitespace removed.
        """
        # Surround the name with spaces so that replacement keys which
        # start or end with a space match at the boundaries of the name.
        baseform = ' ' + norm_name + ' '
        variants = ['']

        startpos = 0    # start of the part not yet copied into the variants
        pos = 0         # current scan position
        while pos < len(baseform):
            full, repl = self.replacements.longest_prefix_item(baseform[pos:],
                                                               (None, None))
            if full is None:
                pos += 1
                continue

            done = baseform[startpos:pos]

            # The space that ends a match is also the space in front of the
            # next word. When the replacements end in a space as well, leave
            # that space in the input instead of consuming it. Otherwise a
            # directly following key like ' street ' can never match.
            # (len(full) > 1 guarantees that the scan still advances.)
            share_space = (len(full) > 1 and full.endswith(' ')
                           and all(r.endswith(' ') for r in repl))
            if share_space:
                repl = [r[:-1] for r in repl]

            variants = [v + done + r for v, r in itertools.product(variants, repl)]
            startpos = pos + len(full)
            if share_space:
                startpos -= 1
            pos = startpos

        if startpos == 0:
            # Nothing was replaced, the name itself is the only variant.
            return [self.to_ascii.transliterate(norm_name)]

        return [self.to_ascii.transliterate(v + baseform[startpos:pos]).strip() for v in variants]


    def get_search_normalized(self, name):
        """ Return the normalized version of the name (including transliteration)
            to be applied at search time.
        """
        return self.search.transliterate(name)
|
||||
161
nominatim/tokenizer/icu_rule_loader.py
Normal file
161
nominatim/tokenizer/icu_rule_loader.py
Normal file
@@ -0,0 +1,161 @@
|
||||
"""
|
||||
Helper class to create ICU rules from a configuration file.
|
||||
"""
|
||||
import io
|
||||
import yaml
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
import itertools
|
||||
|
||||
from icu import Transliterator
|
||||
|
||||
from nominatim.errors import UsageError
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
|
||||
class ICURuleLoader:
    """ Compiler for ICU rules from a tokenizer configuration file.
    """

    def __init__(self, configfile):
        """ Load the rules from the given configuration file (a path
            object). Only files with the suffix '.yaml' are supported,
            anything else raises a UsageError.
        """
        self.configfile = configfile

        if configfile.suffix == '.yaml':
            self._load_from_yaml()
        else:
            raise UsageError("Unknown format of tokenizer configuration.")


    def get_search_rules(self):
        """ Returns the ICU rules to be used during search.
            The rules combine normalization, compound decomposition (including
            abbreviated compounds) and transliteration.
        """
        # First apply the normalization rules.
        rules = io.StringIO()
        rules.write(self.normalization_rules)

        # For all compound suffixes: add them in their full and any abbreviated form.
        suffixes = set()
        for suffix in self.compound_suffixes:
            suffixes.add(suffix)
            suffixes.update(self.abbreviations.get(suffix, []))

        # Longest suffixes first. Ties are broken alphabetically so that the
        # generated rule string does not depend on set iteration order.
        for suffix in sorted(suffixes, key=lambda s: (-len(s), s)):
            rules.write("'{0} ' > ' {0} ';".format(suffix))

        # Finally add transliteration.
        rules.write(self.transliteration_rules)
        return rules.getvalue()


    def get_normalization_rules(self):
        """ Return rules for normalisation of a term.
        """
        return self.normalization_rules


    def get_transliteration_rules(self):
        """ Return the rules for converting a string into its ascii representation.
        """
        return self.transliteration_rules


    def get_replacement_pairs(self):
        """ Returns the list of possible compound decompositions with
            application of abbreviations included.
            The result is a list of pairs: the first item is the sequence to
            replace, the second is a list of replacements.
        """
        synonyms = defaultdict(set)

        for full, abbr in self.abbreviations.items():
            key = ' ' + full + ' '
            # Entries in the abbreviation list always apply to full words:
            synonyms[key].update((' ' + a + ' ' for a in abbr))
            # Replacements are optional, so add a noop
            synonyms[key].add(key)

        # Entries in the compound list expand to themselves and to
        # abbreviations.
        for suffix in self.compound_suffixes:
            keyset = synonyms[suffix + ' ']
            keyset.add(' ' + suffix + ' ')
            keyset.update((' ' + a + ' ' for a in self.abbreviations.get(suffix, [])))
            # The terms the entries are shortened to need to be decompounded as well.
            for abbr in self.abbreviations.get(suffix, []):
                synonyms[abbr + ' '].add(' ' + abbr + ' ')

        # Sort the resulting list by descending length (longer matches are
        # preferred). Keys of equal length and the replacements themselves
        # are put in alphabetical order to make the result reproducible.
        sorted_keys = sorted(synonyms, key=lambda k: (-len(k), k))

        return [(k, sorted(synonyms[k])) for k in sorted_keys]


    def _load_from_yaml(self):
        """ Read the configuration file and set up the normalization and
            transliteration rules, the compound suffixes and the
            abbreviations from it.
        """
        # safe_load() only builds plain Python objects. A bare yaml.load()
        # without an explicit Loader is rejected by PyYAML 6 and newer.
        rules = yaml.safe_load(self.configfile.read_text())

        self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
        self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
        self._parse_compound_suffix_list(self._get_section(rules, 'compound_suffixes'))
        self._parse_abbreviation_list(self._get_section(rules, 'abbreviations'))


    def _get_section(self, rules, section):
        """ Get the section named 'section' from the rules. If the section does
            not exist, raise a usage error with a meaningful message.
        """
        # An empty or malformed file does not parse into a mapping at all.
        # Report that the same way as a missing section.
        if not isinstance(rules, dict) or section not in rules:
            LOG.fatal("Section '%s' not found in tokenizer config '%s'.",
                      section, str(self.configfile))
            raise UsageError("Syntax error in tokenizer configuration file.")

        return rules[section]


    def _cfg_to_icu_rules(self, rules, section):
        """ Load an ICU ruleset from the given section. If the section is a
            simple string, it is interpreted as a file name and the rules are
            loaded verbatim from the given file. The filename is expected to be
            relative to the tokenizer rule file. If the section is a list then
            each line is assumed to be a rule. All rules are concatenated and returned.
        """
        content = self._get_section(rules, section)

        if isinstance(content, str):
            return (self.configfile.parent / content).read_text().replace('\n', ' ')

        return ';'.join(content) + ';'


    def _parse_compound_suffix_list(self, rules):
        """ Set up the set of compound suffixes from the given list of
            suffixes. An empty or missing list results in an empty set.
        """
        if not rules:
            self.compound_suffixes = set()
            return

        norm = Transliterator.createFromRules("rule_loader_normalization",
                                              self.normalization_rules)

        # Make sure all suffixes are in their normalised form.
        self.compound_suffixes = set((norm.transliterate(s) for s in rules))


    def _parse_abbreviation_list(self, rules):
        """ Set up the abbreviation table from the given list of rules.

            Each rule has the form 'full[, full...] => abbr[, abbr...]'.
            Every full term is mapped to every abbreviation of its rule.
            A rule without exactly one '=>' raises a UsageError.
        """
        self.abbreviations = defaultdict(list)

        if not rules:
            return

        norm = Transliterator.createFromRules("rule_loader_normalization",
                                              self.normalization_rules)

        for rule in rules:
            parts = rule.split('=>')
            if len(parts) != 2:
                LOG.fatal("Syntax error in abbreviation section, line: %s", rule)
                raise UsageError("Syntax error in tokenizer configuration file.")

            # Make sure all terms match the normalised version.
            fullterms = (norm.transliterate(t.strip()) for t in parts[0].split(','))
            abbrterms = (norm.transliterate(t.strip()) for t in parts[1].split(','))

            for full, abbr in itertools.product(fullterms, abbrterms):
                self.abbreviations[full].append(abbr)
|
||||
@@ -18,11 +18,11 @@ import psycopg2.extras
|
||||
from nominatim.db.connection import connect
|
||||
from nominatim.db.properties import set_property, get_property
|
||||
from nominatim.db.sql_preprocessor import SQLPreprocessor
|
||||
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
|
||||
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
|
||||
|
||||
DBCFG_NORMALIZATION = "tokenizer_normalization"
|
||||
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
|
||||
DBCFG_TRANSLITERATION = "tokenizer_transliteration"
|
||||
DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"
|
||||
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
|
||||
|
||||
LOG = logging.getLogger()
|
||||
|
||||
@@ -41,9 +41,9 @@ class LegacyICUTokenizer:
|
||||
def __init__(self, dsn, data_dir):
|
||||
self.dsn = dsn
|
||||
self.data_dir = data_dir
|
||||
self.normalization = None
|
||||
self.transliteration = None
|
||||
self.abbreviations = None
|
||||
self.naming_rules = None
|
||||
self.term_normalization = None
|
||||
self.max_word_frequency = None
|
||||
|
||||
|
||||
def init_new_db(self, config, init_db=True):
|
||||
@@ -55,14 +55,14 @@ class LegacyICUTokenizer:
|
||||
if config.TOKENIZER_CONFIG:
|
||||
cfgfile = Path(config.TOKENIZER_CONFIG)
|
||||
else:
|
||||
cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
|
||||
cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
|
||||
|
||||
rules = json.loads(cfgfile.read_text())
|
||||
self._load_transliteration(rules['normalization'], cfgfile.parent)
|
||||
self.abbreviations = rules["abbreviations"]
|
||||
self.normalization = config.TERM_NORMALIZATION
|
||||
loader = ICURuleLoader(cfgfile)
|
||||
self.naming_rules = ICUNameProcessorRules(loader=loader)
|
||||
self.term_normalization = config.TERM_NORMALIZATION
|
||||
self.max_word_frequency = config.MAX_WORD_FREQUENCY
|
||||
|
||||
self._install_php(config)
|
||||
self._install_php(config.lib_dir.php)
|
||||
self._save_config(config)
|
||||
|
||||
if init_db:
|
||||
@@ -70,19 +70,13 @@ class LegacyICUTokenizer:
|
||||
self._init_db_tables(config)
|
||||
|
||||
|
||||
def _load_transliteration(self, rules, cfg_path):
|
||||
if isinstance(rules, str):
|
||||
self.transliteration = (cfg_path / rules).read_text().replace('\n', ' ')
|
||||
else:
|
||||
self.transliteration = ';'.join(rules) + ';'
|
||||
|
||||
def init_from_project(self):
|
||||
""" Initialise the tokenizer from the project directory.
|
||||
"""
|
||||
with connect(self.dsn) as conn:
|
||||
self.normalization = get_property(conn, DBCFG_NORMALIZATION)
|
||||
self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
|
||||
self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
|
||||
self.naming_rules = ICUNameProcessorRules(conn=conn)
|
||||
self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
|
||||
self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
|
||||
|
||||
|
||||
def finalize_import(self, config):
|
||||
@@ -132,26 +126,20 @@ class LegacyICUTokenizer:
|
||||
|
||||
Analyzers are not thread-safe. You need to instantiate one per thread.
|
||||
"""
|
||||
norm = Transliterator.createFromRules("normalizer", self.normalization)
|
||||
trans = Transliterator.createFromRules("trans", self.transliteration)
|
||||
return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
|
||||
return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
|
||||
|
||||
|
||||
def _install_php(self, config):
|
||||
def _install_php(self, phpdir):
|
||||
""" Install the php script for the tokenizer.
|
||||
"""
|
||||
abbr_inverse = list(zip(*self.abbreviations))
|
||||
php_file = self.data_dir / "tokenizer.php"
|
||||
php_file.write_text(dedent("""\
|
||||
<?php
|
||||
@define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
|
||||
@define('CONST_Term_Normalization_Rules', "{0.normalization}");
|
||||
@define('CONST_Transliteration', "{0.transliteration}");
|
||||
@define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
|
||||
require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
|
||||
""".format(self, config,
|
||||
"','".join(abbr_inverse[0]),
|
||||
"','".join(abbr_inverse[1]))))
|
||||
@define('CONST_Max_Word_Frequency', {0.max_word_frequency});
|
||||
@define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
|
||||
@define('CONST_Transliteration', "{0.naming_rules.search_rules}");
|
||||
require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
|
||||
""".format(self, phpdir)))
|
||||
|
||||
|
||||
def _save_config(self, config):
|
||||
@@ -159,10 +147,10 @@ class LegacyICUTokenizer:
|
||||
database as database properties.
|
||||
"""
|
||||
with connect(self.dsn) as conn:
|
||||
set_property(conn, DBCFG_NORMALIZATION, self.normalization)
|
||||
self.naming_rules.save_rules(conn)
|
||||
|
||||
set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
|
||||
set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
|
||||
set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
|
||||
set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
|
||||
|
||||
|
||||
def _init_db_tables(self, config):
|
||||
@@ -178,15 +166,14 @@ class LegacyICUTokenizer:
|
||||
|
||||
# get partial words and their frequencies
|
||||
words = Counter()
|
||||
with self.name_analyzer() as analyzer:
|
||||
with conn.cursor(name="words") as cur:
|
||||
cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
|
||||
name_proc = ICUNameProcessor(self.naming_rules)
|
||||
with conn.cursor(name="words") as cur:
|
||||
cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
|
||||
|
||||
for name, cnt in cur:
|
||||
term = analyzer.make_standard_word(name)
|
||||
if term:
|
||||
for word in term.split():
|
||||
words[word] += cnt
|
||||
for name, cnt in cur:
|
||||
for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
|
||||
for term in word.split():
|
||||
words[term] += cnt
|
||||
|
||||
# copy them back into the word table
|
||||
copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
|
||||
@@ -208,12 +195,10 @@ class LegacyICUNameAnalyzer:
|
||||
normalization.
|
||||
"""
|
||||
|
||||
def __init__(self, dsn, normalizer, transliterator, abbreviations):
|
||||
def __init__(self, dsn, name_proc):
|
||||
self.conn = connect(dsn).connection
|
||||
self.conn.autocommit = True
|
||||
self.normalizer = normalizer
|
||||
self.transliterator = transliterator
|
||||
self.abbreviations = abbreviations
|
||||
self.name_processor = name_proc
|
||||
|
||||
self._cache = _TokenCache()
|
||||
|
||||
@@ -248,9 +233,9 @@ class LegacyICUNameAnalyzer:
|
||||
tokens = {}
|
||||
for word in words:
|
||||
if word.startswith('#'):
|
||||
tokens[word] = ' ' + self.make_standard_word(word[1:])
|
||||
tokens[word] = ' ' + self.name_processor.get_normalized(word[1:])
|
||||
else:
|
||||
tokens[word] = self.make_standard_word(word)
|
||||
tokens[word] = self.name_processor.get_normalized(word)
|
||||
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""SELECT word_token, word_id
|
||||
@@ -263,12 +248,6 @@ class LegacyICUNameAnalyzer:
|
||||
return [(k, v, ids[v]) for k, v in tokens.items()]
|
||||
|
||||
|
||||
def normalize(self, phrase):
|
||||
""" Normalize the given phrase, i.e. remove all properties that
|
||||
are irrelevant for search.
|
||||
"""
|
||||
return self.normalizer.transliterate(phrase)
|
||||
|
||||
@staticmethod
|
||||
def normalize_postcode(postcode):
|
||||
""" Convert the postcode to a standardized form.
|
||||
@@ -279,27 +258,12 @@ class LegacyICUNameAnalyzer:
|
||||
return postcode.strip().upper()
|
||||
|
||||
|
||||
@functools.lru_cache(maxsize=1024)
|
||||
def make_standard_word(self, name):
|
||||
""" Create the normalised version of the input.
|
||||
"""
|
||||
norm = ' ' + self.transliterator.transliterate(name) + ' '
|
||||
for full, abbr in self.abbreviations:
|
||||
if full in norm:
|
||||
norm = norm.replace(full, abbr)
|
||||
|
||||
return norm.strip()
|
||||
|
||||
|
||||
def _make_standard_hnr(self, hnr):
|
||||
""" Create a normalised version of a housenumber.
|
||||
|
||||
This function takes minor shortcuts on transliteration.
|
||||
"""
|
||||
if hnr.isdigit():
|
||||
return hnr
|
||||
|
||||
return self.transliterator.transliterate(hnr)
|
||||
return self.name_processor.get_search_normalized(hnr)
|
||||
|
||||
def update_postcodes_from_db(self):
|
||||
""" Update postcode tokens in the word table from the location_postcode
|
||||
@@ -325,7 +289,7 @@ class LegacyICUNameAnalyzer:
|
||||
else:
|
||||
copystr.write(postcode)
|
||||
copystr.write('\t ')
|
||||
copystr.write(self.transliterator.transliterate(postcode))
|
||||
copystr.write(self.name_processor.get_search_normalized(postcode))
|
||||
copystr.write('\tplace\tpostcode\t0\n')
|
||||
|
||||
if to_delete:
|
||||
@@ -344,7 +308,7 @@ class LegacyICUNameAnalyzer:
|
||||
def update_special_phrases(self, phrases, should_replace):
|
||||
""" Replace the search index for special phrases with the new phrases.
|
||||
"""
|
||||
norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
|
||||
norm_phrases = set(((self.name_processor.get_search_normalized(p[0]), p[1], p[2], p[3])
|
||||
for p in phrases))
|
||||
|
||||
with self.conn.cursor() as cur:
|
||||
@@ -362,7 +326,7 @@ class LegacyICUNameAnalyzer:
|
||||
if to_add:
|
||||
copystr = io.StringIO()
|
||||
for word, cls, typ, oper in to_add:
|
||||
term = self.make_standard_word(word)
|
||||
term = self.name_processor.get_search_normalized(word)
|
||||
if term:
|
||||
copystr.write(word)
|
||||
copystr.write('\t ')
|
||||
@@ -395,15 +359,11 @@ class LegacyICUNameAnalyzer:
|
||||
def add_country_names(self, country_code, names):
|
||||
""" Add names for the given country to the search index.
|
||||
"""
|
||||
full_names = set((self.make_standard_word(n) for n in names))
|
||||
full_names.discard('')
|
||||
self._add_normalized_country_names(country_code, full_names)
|
||||
word_tokens = set()
|
||||
for name in self._compute_full_names(names):
|
||||
if name:
|
||||
word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
|
||||
|
||||
|
||||
def _add_normalized_country_names(self, country_code, names):
|
||||
""" Add names for the given country to the search index.
|
||||
"""
|
||||
word_tokens = set((' ' + name for name in names))
|
||||
with self.conn.cursor() as cur:
|
||||
# Get existing names
|
||||
cur.execute("SELECT word_token FROM word WHERE country_code = %s",
|
||||
@@ -429,14 +389,13 @@ class LegacyICUNameAnalyzer:
|
||||
names = place.get('name')
|
||||
|
||||
if names:
|
||||
full_names = self._compute_full_names(names)
|
||||
fulls, partials = self._compute_name_tokens(names)
|
||||
|
||||
token_info.add_names(self.conn, full_names)
|
||||
token_info.add_names(fulls, partials)
|
||||
|
||||
country_feature = place.get('country_feature')
|
||||
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
|
||||
self._add_normalized_country_names(country_feature.lower(),
|
||||
full_names)
|
||||
self.add_country_names(country_feature.lower(), names)
|
||||
|
||||
address = place.get('address')
|
||||
|
||||
@@ -449,38 +408,60 @@ class LegacyICUNameAnalyzer:
|
||||
elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
|
||||
hnrs.append(value)
|
||||
elif key == 'street':
|
||||
token_info.add_street(self.conn, self.make_standard_word(value))
|
||||
token_info.add_street(*self._compute_name_tokens({'name': value}))
|
||||
elif key == 'place':
|
||||
token_info.add_place(self.conn, self.make_standard_word(value))
|
||||
token_info.add_place(*self._compute_name_tokens({'name': value}))
|
||||
elif not key.startswith('_') and \
|
||||
key not in ('country', 'full'):
|
||||
addr_terms.append((key, self.make_standard_word(value)))
|
||||
addr_terms.append((key, *self._compute_name_tokens({'name': value})))
|
||||
|
||||
if hnrs:
|
||||
hnrs = self._split_housenumbers(hnrs)
|
||||
token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
|
||||
|
||||
if addr_terms:
|
||||
token_info.add_address_terms(self.conn, addr_terms)
|
||||
token_info.add_address_terms(addr_terms)
|
||||
|
||||
return token_info.data
|
||||
|
||||
|
||||
def _compute_name_tokens(self, names):
|
||||
""" Computes the full name and partial name tokens for the given
|
||||
dictionary of names.
|
||||
"""
|
||||
full_names = self._compute_full_names(names)
|
||||
full_tokens = set()
|
||||
partial_tokens = set()
|
||||
|
||||
for name in full_names:
|
||||
norm_name = self.name_processor.get_normalized(name)
|
||||
full, part = self._cache.names.get(norm_name, (None, None))
|
||||
if full is None:
|
||||
variants = self.name_processor.get_variants_ascii(norm_name)
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
|
||||
(norm_name, variants))
|
||||
full, part = cur.fetchone()
|
||||
|
||||
self._cache.names[norm_name] = (full, part)
|
||||
|
||||
full_tokens.add(full)
|
||||
partial_tokens.update(part)
|
||||
|
||||
return full_tokens, partial_tokens
|
||||
|
||||
|
||||
def _compute_full_names(self, names):
|
||||
""" Return the set of all full name word ids to be used with the
|
||||
given dictionary of names.
|
||||
"""
|
||||
full_names = set()
|
||||
for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
|
||||
word = self.make_standard_word(name)
|
||||
if word:
|
||||
full_names.add(word)
|
||||
full_names.add(name.strip())
|
||||
|
||||
brace_split = name.split('(', 2)
|
||||
if len(brace_split) > 1:
|
||||
word = self.make_standard_word(brace_split[0])
|
||||
if word:
|
||||
full_names.add(word)
|
||||
brace_idx = name.find('(')
|
||||
if brace_idx >= 0:
|
||||
full_names.add(name[:brace_idx].strip())
|
||||
|
||||
return full_names
|
||||
|
||||
@@ -492,7 +473,7 @@ class LegacyICUNameAnalyzer:
|
||||
postcode = self.normalize_postcode(postcode)
|
||||
|
||||
if postcode not in self._cache.postcodes:
|
||||
term = self.make_standard_word(postcode)
|
||||
term = self.name_processor.get_search_normalized(postcode)
|
||||
if not term:
|
||||
return
|
||||
|
||||
@@ -508,6 +489,7 @@ class LegacyICUNameAnalyzer:
|
||||
""", (' ' + term, postcode))
|
||||
self._cache.postcodes.add(postcode)
|
||||
|
||||
|
||||
@staticmethod
|
||||
def _split_housenumbers(hnrs):
|
||||
if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
|
||||
@@ -530,7 +512,7 @@ class _TokenInfo:
|
||||
""" Collect token information to be sent back to the database.
|
||||
"""
|
||||
def __init__(self, cache):
|
||||
self.cache = cache
|
||||
self._cache = cache
|
||||
self.data = {}
|
||||
|
||||
@staticmethod
|
||||
@@ -538,86 +520,44 @@ class _TokenInfo:
|
||||
return '{%s}' % ','.join((str(s) for s in tokens))
|
||||
|
||||
|
||||
def add_names(self, conn, names):
|
||||
def add_names(self, fulls, partials):
|
||||
""" Adds token information for the normalised names.
|
||||
"""
|
||||
# Start with all partial names
|
||||
terms = set((part for ns in names for part in ns.split()))
|
||||
# Add the full names
|
||||
terms.update((' ' + n for n in names))
|
||||
|
||||
self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
|
||||
self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
|
||||
|
||||
|
||||
def add_housenumbers(self, conn, hnrs):
|
||||
""" Extract housenumber information from a list of normalised
|
||||
housenumbers.
|
||||
"""
|
||||
self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
|
||||
self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
|
||||
self.data['hnr'] = ';'.join(hnrs)
|
||||
|
||||
|
||||
def add_street(self, conn, street):
|
||||
def add_street(self, fulls, partials):
|
||||
""" Add addr:street match terms.
|
||||
"""
|
||||
if not street:
|
||||
return
|
||||
|
||||
term = ' ' + street
|
||||
|
||||
tid = self.cache.names.get(term)
|
||||
|
||||
if tid is None:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""SELECT word_id FROM word
|
||||
WHERE word_token = %s
|
||||
and class is null and type is null""",
|
||||
(term, ))
|
||||
if cur.rowcount > 0:
|
||||
tid = cur.fetchone()[0]
|
||||
self.cache.names[term] = tid
|
||||
|
||||
if tid is not None:
|
||||
self.data['street'] = '{%d}' % tid
|
||||
if fulls:
|
||||
self.data['street'] = self._mk_array(fulls)
|
||||
|
||||
|
||||
def add_place(self, conn, place):
|
||||
def add_place(self, fulls, partials):
|
||||
""" Add addr:place search and match terms.
|
||||
"""
|
||||
if not place:
|
||||
return
|
||||
|
||||
partial_ids = self.cache.get_term_tokens(conn, place.split())
|
||||
tid = self.cache.get_term_tokens(conn, [' ' + place])
|
||||
|
||||
self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
|
||||
self.data['place_match'] = '{%s}' % tid[0]
|
||||
if fulls:
|
||||
self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
|
||||
self.data['place_match'] = self._mk_array(fulls)
|
||||
|
||||
|
||||
def add_address_terms(self, conn, terms):
|
||||
def add_address_terms(self, terms):
|
||||
""" Add additional address terms.
|
||||
"""
|
||||
tokens = {}
|
||||
|
||||
for key, value in terms:
|
||||
if not value:
|
||||
continue
|
||||
partial_ids = self.cache.get_term_tokens(conn, value.split())
|
||||
term = ' ' + value
|
||||
tid = self.cache.names.get(term)
|
||||
|
||||
if tid is None:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""SELECT word_id FROM word
|
||||
WHERE word_token = %s
|
||||
and class is null and type is null""",
|
||||
(term, ))
|
||||
if cur.rowcount > 0:
|
||||
tid = cur.fetchone()[0]
|
||||
self.cache.names[term] = tid
|
||||
|
||||
tokens[key] = [self._mk_array(partial_ids),
|
||||
'{%s}' % ('' if tid is None else str(tid))]
|
||||
for key, fulls, partials in terms:
|
||||
if fulls:
|
||||
tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
|
||||
self._mk_array(fulls)]
|
||||
|
||||
if tokens:
|
||||
self.data['addr'] = tokens
|
||||
@@ -635,32 +575,6 @@ class _TokenCache:
|
||||
self.housenumbers = {}
|
||||
|
||||
|
||||
def get_term_tokens(self, conn, terms):
|
||||
""" Get token ids for a list of terms, looking them up in the database
|
||||
if necessary.
|
||||
"""
|
||||
tokens = []
|
||||
askdb = []
|
||||
|
||||
for term in terms:
|
||||
token = self.names.get(term)
|
||||
if token is None:
|
||||
askdb.append(term)
|
||||
elif token != 0:
|
||||
tokens.append(token)
|
||||
|
||||
if askdb:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
|
||||
(askdb, ))
|
||||
for term, tid in cur:
|
||||
self.names[term] = tid
|
||||
if tid != 0:
|
||||
tokens.append(tid)
|
||||
|
||||
return tokens
|
||||
|
||||
|
||||
def get_hnr_tokens(self, conn, terms):
|
||||
""" Get token ids for a list of housenumbers, looking them up in the
|
||||
database if necessary.
|
||||
|
||||
@@ -404,7 +404,7 @@ class LegacyNameAnalyzer:
|
||||
FROM unnest(%s)n) y
|
||||
WHERE NOT EXISTS(SELECT * FROM word
|
||||
WHERE word_token = lookup_token and country_code = %s))
|
||||
""", (country_code, names, country_code))
|
||||
""", (country_code, list(names.values()), country_code))
|
||||
|
||||
|
||||
def process_place(self, place):
|
||||
@@ -422,7 +422,7 @@ class LegacyNameAnalyzer:
|
||||
|
||||
country_feature = place.get('country_feature')
|
||||
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
|
||||
self.add_country_names(country_feature.lower(), list(names.values()))
|
||||
self.add_country_names(country_feature.lower(), names)
|
||||
|
||||
address = place.get('address')
|
||||
|
||||
|
||||
Reference in New Issue
Block a user