Mirror of https://github.com/osm-search/Nominatim.git, synced 2026-02-26 11:08:13 +00:00

Merge pull request #2458 from lonvia/add-tokenizer-preprocessing

Add a "sanitation" step for name and address tags before token processing
@@ -1,30 +1,33 @@
 -- Trigger functions for the placex table.
 
+-- Information returned by update preparation.
+DROP TYPE IF EXISTS prepare_update_info CASCADE;
+CREATE TYPE prepare_update_info AS (
+  name HSTORE,
+  address HSTORE,
+  rank_address SMALLINT,
+  country_code TEXT,
+  class TEXT,
+  type TEXT,
+  linked_place_id BIGINT
+);
 
 -- Retrieve the data needed by the indexer for updating the place.
---
--- Return parameters:
--- name            list of names
--- address         list of address tags, either from the object or a surrounding
---                 building
--- country_feature If the place is a country feature, this contains the
---                 country code, otherwise it is null.
-CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
-                                                 OUT name HSTORE,
-                                                 OUT address HSTORE,
-                                                 OUT country_feature VARCHAR,
-                                                 OUT linked_place_id BIGINT)
+CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex)
+  RETURNS prepare_update_info
   AS $$
 DECLARE
   location RECORD;
+  result prepare_update_info;
 BEGIN
   -- For POI nodes, check if the address should be derived from a surrounding
   -- building.
   IF p.rank_search < 30 OR p.osm_type != 'N' OR p.address is not null THEN
-    address := p.address;
+    result.address := p.address;
   ELSE
     -- The additional && condition works around the misguided query
     -- planner of postgis 3.0.
-    SELECT placex.address || hstore('_inherited', '') INTO address
+    SELECT placex.address || hstore('_inherited', '') INTO result.address
       FROM placex
      WHERE ST_Covers(geometry, p.centroid)
            and geometry && p.centroid
@@ -34,27 +37,26 @@ BEGIN
      LIMIT 1;
   END IF;
 
-  address := address - '_unlisted_place'::TEXT;
-  name := p.name;
+  result.address := result.address - '_unlisted_place'::TEXT;
+  result.name := p.name;
+  result.class := p.class;
+  result.type := p.type;
+  result.country_code := p.country_code;
+  result.rank_address := p.rank_address;
 
   -- Names of linked places need to be merged in, so search for a linkable
   -- place already here.
   SELECT * INTO location FROM find_linked_place(p);
 
   IF location.place_id is not NULL THEN
-    linked_place_id := location.place_id;
+    result.linked_place_id := location.place_id;
 
     IF NOT location.name IS NULL THEN
-      name := location.name || name;
+      result.name := location.name || result.name;
     END IF;
   END IF;
 
-  country_feature := CASE WHEN p.admin_level = 2
-                               and p.class = 'boundary' and p.type = 'administrative'
-                               and p.osm_type = 'R'
-                          THEN p.country_code
-                          ELSE null
-                     END;
+  RETURN result;
 END;
 $$
 LANGUAGE plpgsql STABLE;
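For orientation, this is how a caller now consumes the composite return value: the OUT parameters are gone, so the result row is expanded with (...).* instead. A minimal sketch, assuming an open psycopg2 connection to a Nominatim database with the function installed; the DSN and the place ids are placeholders:

    import psycopg2
    import psycopg2.extras

    # Sketch only: mirrors the query issued by AbstractPlacexRunner.get_place_details().
    conn = psycopg2.connect("dbname=nominatim")
    with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        cur.execute("""SELECT place_id, (placex_indexing_prepare(placex)).*
                       FROM placex WHERE place_id IN %s""",
                    (tuple([1, 2, 3]), ))
        for row in cur:
            print(row['place_id'], row['rank_address'], row['name'])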
nominatim/indexer/place_info.py (new file, 68 lines)
@@ -0,0 +1,68 @@
+"""
+Wrapper around place information the indexer gets from the database and hands to
+the tokenizer.
+"""
+
+import psycopg2.extras
+
+class PlaceInfo:
+    """ Data class containing all information the tokenizer gets about a
+        place it should process the names for.
+    """
+
+    def __init__(self, info):
+        self._info = info
+
+
+    def analyze(self, analyzer):
+        """ Process this place with the given tokenizer and return the
+            result in psycopg2-compatible Json.
+        """
+        return psycopg2.extras.Json(analyzer.process_place(self))
+
+
+    @property
+    def name(self):
+        """ A dictionary with the names of the place or None if the place
+            has no names.
+        """
+        return self._info.get('name')
+
+
+    @property
+    def address(self):
+        """ A dictionary with the address elements of the place
+            or None if no address information is available.
+        """
+        return self._info.get('address')
+
+
+    @property
+    def country_code(self):
+        """ The country code of the country the place is in. Guaranteed
+            to be a two-letter lower-case string or None, if no country
+            could be found.
+        """
+        return self._info.get('country_code')
+
+
+    @property
+    def rank_address(self):
+        """ The computed rank address before rank correction.
+        """
+        return self._info.get('rank_address')
+
+
+    def is_a(self, key, value):
+        """ Check if the place's primary tag corresponds to the given
+            key and value.
+        """
+        return self._info.get('class') == key and self._info.get('type') == value
+
+
+    def is_country(self):
+        """ Check if the place is a valid country boundary.
+        """
+        return self.rank_address == 4 \
+               and self.is_a('boundary', 'administrative') \
+               and self.country_code is not None
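The wrapper is a thin read-only view over the dict produced by placex_indexing_prepare. A minimal usage sketch; the field values here are invented for illustration:

    from nominatim.indexer.place_info import PlaceInfo

    # Hypothetical row contents as the indexer would receive them.
    place = PlaceInfo({'name': {'name': 'Halle (Saale)'},
                       'address': {'city': 'Halle'},
                       'country_code': 'de',
                       'rank_address': 16,
                       'class': 'place', 'type': 'city'})

    assert place.country_code == 'de'
    assert place.is_a('place', 'city')
    assert not place.is_country()   # rank_address is 16, not a country boundary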
@@ -4,14 +4,16 @@ tasks.
 """
 import functools
 
-import psycopg2.extras
 from psycopg2 import sql as pysql
 
+from nominatim.indexer.place_info import PlaceInfo
+
 # pylint: disable=C0111
 
 def _mk_valuelist(template, num):
     return pysql.SQL(',').join([pysql.SQL(template)] * num)
 
 
 class AbstractPlacexRunner:
     """ Returns SQL commands for indexing of the placex table.
     """
@@ -37,7 +39,7 @@ class AbstractPlacexRunner:
 
     @staticmethod
     def get_place_details(worker, ids):
-        worker.perform("""SELECT place_id, (placex_prepare_update(placex)).*
+        worker.perform("""SELECT place_id, (placex_indexing_prepare(placex)).*
                           FROM placex WHERE place_id IN %s""",
                        (tuple((p[0] for p in ids)), ))
 
@@ -47,7 +49,7 @@ class AbstractPlacexRunner:
         for place in places:
             for field in ('place_id', 'name', 'address', 'linked_place_id'):
                 values.append(place[field])
-            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+            values.append(PlaceInfo(place).analyze(self.analyzer))
 
         worker.perform(self._index_sql(len(places)), values)
 
@@ -141,7 +143,7 @@ class InterpolationRunner:
         values = []
         for place in places:
             values.extend((place[x] for x in ('place_id', 'address')))
-            values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
+            values.append(PlaceInfo(place).analyze(self.analyzer))
 
         worker.perform(self._index_sql(len(places)), values)
@@ -6,6 +6,7 @@ from abc import ABC, abstractmethod
 from typing import List, Tuple, Dict, Any
 
 from nominatim.config import Configuration
+from nominatim.indexer.place_info import PlaceInfo
 
 # pylint: disable=unnecessary-pass
 
@@ -105,20 +106,13 @@ class AbstractAnalyzer(ABC):
 
 
     @abstractmethod
-    def process_place(self, place: Dict) -> Any:
+    def process_place(self, place: PlaceInfo) -> Any:
         """ Extract tokens for the given place and compute the
             information to be handed to the PL/pgSQL processor for building
             the search index.
 
             Arguments:
-                place: Dictionary with the information about the place. Currently
-                       the following fields may be present:
-
-                       - *name* is a dictionary of names for the place together
-                         with the designation of the name.
-                       - *address* is a dictionary of address terms.
-                       - *country_feature* is set to a country code when the
-                         place describes a country.
+                place: Place information retrived from the database.
 
             Returns:
                 A JSON-serialisable structure that will be handed into
@@ -142,7 +136,7 @@ class AbstractTokenizer(ABC):
             the tokenizer remains stable over updates.
 
             Arguments:
-                config: Read-only object with configuration obtions.
+                config: Read-only object with configuration options.
 
                 init_db: When set to False, then initialisation of database
                   tables should be skipped. This option is only required for
@@ -155,11 +149,14 @@ class AbstractTokenizer(ABC):
 
 
     @abstractmethod
-    def init_from_project(self) -> None:
+    def init_from_project(self, config: Configuration) -> None:
         """ Initialise the tokenizer from an existing database setup.
 
             The function should load all previously saved configuration from
             the project directory and/or the property table.
 
+            Arguments:
+                config: Read-only object with configuration options.
         """
         pass
 
@@ -172,7 +169,7 @@ class AbstractTokenizer(ABC):
             during query time.
 
             Arguments:
-                config: Read-only object with configuration obtions.
+                config: Read-only object with configuration options.
         """
         pass
 
@@ -187,13 +184,13 @@ class AbstractTokenizer(ABC):
             data structures or data itself must not be changed by this function.
 
             Arguments:
-                config: Read-only object with configuration obtions.
+                config: Read-only object with configuration options.
         """
         pass
 
 
     @abstractmethod
-    def check_database(self) -> str:
+    def check_database(self, config: Configuration) -> str:
         """ Check that the database is set up correctly and ready for being
             queried.
 
@@ -202,6 +199,9 @@ class AbstractTokenizer(ABC):
               description of the issue as well as hints for the user on
               how to resolve the issue.
 
+            Arguments:
+                config: Read-only object with configuration options.
+
             Return `None`, if no issue was found.
         """
         pass
@@ -85,6 +85,6 @@ def get_tokenizer_for_db(config):
     tokenizer_module = _import_tokenizer(name)
 
     tokenizer = tokenizer_module.create(config.get_libpq_dsn(), basedir)
-    tokenizer.init_from_project()
+    tokenizer.init_from_project(config)
 
     return tokenizer
@@ -8,67 +8,25 @@ import itertools
 from icu import Transliterator
 import datrie
 
-from nominatim.db.properties import set_property, get_property
-from nominatim.tokenizer import icu_variants as variants
-
-DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
-DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
-DBCFG_IMPORT_REPLACEMENTS = "tokenizer_import_replacements"
-DBCFG_SEARCH_STD_RULES = "tokenizer_search_standardization"
-
-
-class ICUNameProcessorRules:
-    """ Data object that saves the rules needed for the name processor.
-
-        The rules can either be initialised through an ICURuleLoader or
-        be loaded from a database when a connection is given.
-    """
-    def __init__(self, loader=None, conn=None):
-        if loader is not None:
-            self.norm_rules = loader.get_normalization_rules()
-            self.trans_rules = loader.get_transliteration_rules()
-            self.replacements = loader.get_replacement_pairs()
-            self.search_rules = loader.get_search_rules()
-        elif conn is not None:
-            self.norm_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
-            self.trans_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
-            self.replacements = \
-                variants.unpickle_variant_set(get_property(conn, DBCFG_IMPORT_REPLACEMENTS))
-            self.search_rules = get_property(conn, DBCFG_SEARCH_STD_RULES)
-        else:
-            assert False, "Parameter loader or conn required."
-
-
-    def save_rules(self, conn):
-        """ Save the rules in the property table of the given database.
-            the rules can be loaded again by handing in a connection into
-            the constructor of the class.
-        """
-        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.norm_rules)
-        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.trans_rules)
-        set_property(conn, DBCFG_IMPORT_REPLACEMENTS,
-                     variants.pickle_variant_set(self.replacements))
-        set_property(conn, DBCFG_SEARCH_STD_RULES, self.search_rules)
-
 
 class ICUNameProcessor:
     """ Collects the different transformation rules for normalisation of names
-        and provides the functions to aply the transformations.
+        and provides the functions to apply the transformations.
     """
 
-    def __init__(self, rules):
+    def __init__(self, norm_rules, trans_rules, replacements):
         self.normalizer = Transliterator.createFromRules("icu_normalization",
-                                                         rules.norm_rules)
+                                                         norm_rules)
         self.to_ascii = Transliterator.createFromRules("icu_to_ascii",
-                                                       rules.trans_rules +
+                                                       trans_rules +
                                                        ";[:Space:]+ > ' '")
         self.search = Transliterator.createFromRules("icu_search",
-                                                     rules.search_rules)
+                                                     norm_rules + trans_rules)
 
         # Intermediate reorder by source. Also compute required character set.
         immediate = defaultdict(list)
         chars = set()
-        for variant in rules.replacements:
+        for variant in replacements:
             if variant.source[-1] == ' ' and variant.replacement[-1] == ' ':
                 replstr = variant.replacement[:-1]
             else:
@@ -2,17 +2,26 @@
 Helper class to create ICU rules from a configuration file.
 """
 import io
+import json
 import logging
 import itertools
 import re
 
 from icu import Transliterator
 
+from nominatim.db.properties import set_property, get_property
 from nominatim.errors import UsageError
+from nominatim.tokenizer.icu_name_processor import ICUNameProcessor
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
 import nominatim.tokenizer.icu_variants as variants
 
 LOG = logging.getLogger()
 
+DBCFG_IMPORT_NORM_RULES = "tokenizer_import_normalisation"
+DBCFG_IMPORT_TRANS_RULES = "tokenizer_import_transliteration"
+DBCFG_IMPORT_ANALYSIS_RULES = "tokenizer_import_analysis_rules"
+
+
 def _flatten_config_list(content):
     if not content:
         return []
@@ -46,12 +55,52 @@ class ICURuleLoader:
     """ Compiler for ICU rules from a tokenizer configuration file.
     """
 
-    def __init__(self, rules):
+    def __init__(self, config):
+        rules = config.load_sub_configuration('icu_tokenizer.yaml',
+                                              config='TOKENIZER_CONFIG')
+
         self.variants = set()
 
         self.normalization_rules = self._cfg_to_icu_rules(rules, 'normalization')
         self.transliteration_rules = self._cfg_to_icu_rules(rules, 'transliteration')
-        self._parse_variant_list(self._get_section(rules, 'variants'))
+        self.analysis_rules = self._get_section(rules, 'variants')
+        self._parse_variant_list()
+
+        # Load optional sanitizer rule set.
+        self.sanitizer_rules = rules.get('sanitizers', [])
+
+
+    def load_config_from_db(self, conn):
+        """ Get previously saved parts of the configuration from the
+            database.
+        """
+        self.normalization_rules = get_property(conn, DBCFG_IMPORT_NORM_RULES)
+        self.transliteration_rules = get_property(conn, DBCFG_IMPORT_TRANS_RULES)
+        self.analysis_rules = json.loads(get_property(conn, DBCFG_IMPORT_ANALYSIS_RULES))
+        self._parse_variant_list()
+
+
+    def save_config_to_db(self, conn):
+        """ Save the part of the configuration that cannot be changed into
+            the database.
+        """
+        set_property(conn, DBCFG_IMPORT_NORM_RULES, self.normalization_rules)
+        set_property(conn, DBCFG_IMPORT_TRANS_RULES, self.transliteration_rules)
+        set_property(conn, DBCFG_IMPORT_ANALYSIS_RULES, json.dumps(self.analysis_rules))
+
+
+    def make_sanitizer(self):
+        """ Create a place sanitizer from the configured rules.
+        """
+        return PlaceSanitizer(self.sanitizer_rules)
+
+
+    def make_token_analysis(self):
+        """ Create a token analyser from the reviouly loaded rules.
+        """
+        return ICUNameProcessor(self.normalization_rules,
+                                self.transliteration_rules,
+                                self.variants)
+
 
     def get_search_rules(self):
@@ -112,7 +161,9 @@ class ICURuleLoader:
         return ';'.join(_flatten_config_list(content)) + ';'
 
 
-    def _parse_variant_list(self, rules):
+    def _parse_variant_list(self):
+        rules = self.analysis_rules
+
         self.variants.clear()
 
         if not rules:
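With persistence moved into the loader, import and later runs share one code path. A sketch of the intended round-trip, assuming config is a Nominatim Configuration with a project directory and conn is an open database connection:

    from nominatim.tokenizer.icu_rule_loader import ICURuleLoader

    loader = ICURuleLoader(config)     # reads icu_tokenizer.yaml
    loader.save_config_to_db(conn)     # done once at import time

    loader = ICURuleLoader(config)
    loader.load_config_from_db(conn)   # done on subsequent runs
    sanitizer = loader.make_sanitizer()
    analysis = loader.make_token_analysis()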
@@ -13,8 +13,8 @@ from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
@@ -36,7 +36,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
     def __init__(self, dsn, data_dir):
         self.dsn = dsn
         self.data_dir = data_dir
-        self.naming_rules = None
+        self.loader = None
         self.term_normalization = None
 
 
@@ -46,9 +46,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
             This copies all necessary data in the project directory to make
             sure the tokenizer remains stable even over updates.
         """
-        loader = ICURuleLoader(config.load_sub_configuration('icu_tokenizer.yaml',
-                                                             config='TOKENIZER_CONFIG'))
-        self.naming_rules = ICUNameProcessorRules(loader=loader)
+        self.loader = ICURuleLoader(config)
+
         self.term_normalization = config.TERM_NORMALIZATION
 
         self._install_php(config.lib_dir.php)
@@ -59,11 +58,13 @@ class LegacyICUTokenizer(AbstractTokenizer):
         self._init_db_tables(config)
 
 
-    def init_from_project(self):
+    def init_from_project(self, config):
         """ Initialise the tokenizer from the project directory.
         """
+        self.loader = ICURuleLoader(config)
+
         with connect(self.dsn) as conn:
-            self.naming_rules = ICUNameProcessorRules(conn=conn)
+            self.loader.load_config_from_db(conn)
             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
 
 
@@ -81,12 +82,12 @@ class LegacyICUTokenizer(AbstractTokenizer):
             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
 
 
-    def check_database(self):
+    def check_database(self, config):
         """ Check that the tokenizer is set up correctly.
         """
-        self.init_from_project()
+        self.init_from_project(config)
 
-        if self.naming_rules is None:
+        if self.term_normalization is None:
             return "Configuration for tokenizer 'icu' are missing."
 
         return None
@@ -107,7 +108,8 @@ class LegacyICUTokenizer(AbstractTokenizer):
 
             Analyzers are not thread-safe. You need to instantiate one per thread.
         """
-        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
+        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
+                                     self.loader.make_token_analysis())
 
 
     def _install_php(self, phpdir):
@@ -118,7 +120,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
             <?php
             @define('CONST_Max_Word_Frequency', 10000000);
             @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
-            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 
 
@@ -127,8 +129,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
             database as database properties.
         """
         with connect(self.dsn) as conn:
-            self.naming_rules.save_rules(conn)
-
+            self.loader.save_config_to_db(conn)
             set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
 
@@ -163,7 +164,7 @@ class LegacyICUTokenizer(AbstractTokenizer):
         """ Count the partial terms from the names in the place table.
         """
         words = Counter()
-        name_proc = ICUNameProcessor(self.naming_rules)
+        name_proc = self.loader.make_token_analysis()
 
         with conn.cursor(name="words") as cur:
             cur.execute(""" SELECT v, count(*) FROM
@@ -188,10 +189,11 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         normalization.
     """
 
-    def __init__(self, dsn, name_proc):
+    def __init__(self, dsn, sanitizer, token_analysis):
         self.conn = connect(dsn).connection
         self.conn.autocommit = True
-        self.name_processor = name_proc
+        self.sanitizer = sanitizer
+        self.token_analysis = token_analysis
 
         self._cache = _TokenCache()
 
@@ -204,6 +206,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         self.conn = None
 
 
+    def _search_normalized(self, name):
+        """ Return the search token transliteration of the given name.
+        """
+        return self.token_analysis.get_search_normalized(name)
+
+
+    def _normalized(self, name):
+        """ Return the normalized version of the given name with all
+            non-relevant information removed.
+        """
+        return self.token_analysis.get_normalized(name)
+
+
     def get_word_token_info(self, words):
         """ Return token information for the given list of words.
             If a word starts with # it is assumed to be a full name
@@ -219,9 +234,9 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         partial_tokens = {}
         for word in words:
             if word.startswith('#'):
-                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
+                full_tokens[word] = self._search_normalized(word[1:])
             else:
-                partial_tokens[word] = self.name_processor.get_search_normalized(word)
+                partial_tokens[word] = self._search_normalized(word)
 
         with self.conn.cursor() as cur:
             cur.execute("""SELECT word_token, word_id
@@ -252,7 +267,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
             This function takes minor shortcuts on transliteration.
         """
-        return self.name_processor.get_search_normalized(hnr)
+        return self._search_normalized(hnr)
 
     def update_postcodes_from_db(self):
         """ Update postcode tokens in the word table from the location_postcode
@@ -275,7 +290,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
                     if postcode is None:
                         to_delete.append(word)
                     else:
-                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                        copystr.add(self._search_normalized(postcode),
                                     'P', postcode)
 
             if to_delete:
@@ -293,7 +308,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             completely replaced. Otherwise the phrases are added to the
             already existing ones.
         """
-        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
+        norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
 
         with self.conn.cursor() as cur:
@@ -323,7 +338,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         added = 0
         with CopyBuffer() as copystr:
             for word, cls, typ, oper in to_add:
-                term = self.name_processor.get_search_normalized(word)
+                term = self._search_normalized(word)
                 if term:
                     copystr.add(term, 'S', word,
                                 json.dumps({'class': cls, 'type': typ,
@@ -357,9 +372,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def add_country_names(self, country_code, names):
         """ Add names for the given country to the search index.
         """
+        # Make sure any name preprocessing for country names applies.
+        info = PlaceInfo({'name': names, 'country_code': country_code,
+                          'rank_address': 4, 'class': 'boundary',
+                          'type': 'administrative'})
+        self._add_country_full_names(country_code,
+                                     self.sanitizer.process_names(info)[0])
+
+
+    def _add_country_full_names(self, country_code, names):
+        """ Add names for the given country from an already sanitized
+            name list.
+        """
         word_tokens = set()
-        for name in self._compute_full_names(names):
-            norm_name = self.name_processor.get_search_normalized(name)
+        for name in names:
+            norm_name = self._search_normalized(name.name)
             if norm_name:
                 word_tokens.add(norm_name)
 
@@ -385,23 +412,21 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def process_place(self, place):
         """ Determine tokenizer information about the given place.
 
-            Returns a JSON-serialisable structure that will be handed into
+            Returns a JSON-serializable structure that will be handed into
             the database via the token_info field.
         """
         token_info = _TokenInfo(self._cache)
 
-        names = place.get('name')
+        names, address = self.sanitizer.process_names(place)
 
         if names:
             fulls, partials = self._compute_name_tokens(names)
 
             token_info.add_names(fulls, partials)
 
-        country_feature = place.get('country_feature')
-        if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-            self.add_country_names(country_feature.lower(), names)
+            if place.is_country():
+                self._add_country_full_names(place.country_code, names)
 
-        address = place.get('address')
         if address:
             self._process_place_address(token_info, address)
 
@@ -411,18 +436,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
     def _process_place_address(self, token_info, address):
         hnrs = []
         addr_terms = []
-        for key, value in address.items():
-            if key == 'postcode':
-                self._add_postcode(value)
-            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                hnrs.append(value)
-            elif key == 'street':
-                token_info.add_street(self._compute_partial_tokens(value))
-            elif key == 'place':
-                token_info.add_place(self._compute_partial_tokens(value))
-            elif not key.startswith('_') and \
-                 key not in ('country', 'full'):
-                addr_terms.append((key, self._compute_partial_tokens(value)))
+        for item in address:
+            if item.kind == 'postcode':
+                self._add_postcode(item.name)
+            elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(item.name)
+            elif item.kind == 'street':
+                token_info.add_street(self._compute_partial_tokens(item.name))
+            elif item.kind == 'place':
+                token_info.add_place(self._compute_partial_tokens(item.name))
+            elif not item.kind.startswith('_') and \
+                 item.kind not in ('country', 'full'):
+                addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 
         if hnrs:
             hnrs = self._split_housenumbers(hnrs)
@@ -435,7 +460,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         """ Normalize the given term, split it into partial words and return
             then token list for them.
         """
-        norm_name = self.name_processor.get_search_normalized(name)
+        norm_name = self._search_normalized(name)
 
         tokens = []
         need_lookup = []
@@ -458,19 +483,19 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
 
         return tokens
 
 
     def _compute_name_tokens(self, names):
         """ Computes the full name and partial name tokens for the given
            dictionary of names.
         """
-        full_names = self._compute_full_names(names)
         full_tokens = set()
         partial_tokens = set()
 
-        for name in full_names:
-            norm_name = self.name_processor.get_normalized(name)
+        for name in names:
+            norm_name = self._normalized(name.name)
             full, part = self._cache.names.get(norm_name, (None, None))
             if full is None:
-                variants = self.name_processor.get_variants_ascii(norm_name)
+                variants = self.token_analysis.get_variants_ascii(norm_name)
                 if not variants:
                     continue
 
@@ -487,23 +512,6 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
         return full_tokens, partial_tokens
 
 
-    @staticmethod
-    def _compute_full_names(names):
-        """ Return the set of all full name word ids to be used with the
-            given dictionary of names.
-        """
-        full_names = set()
-        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
-            if name:
-                full_names.add(name)
-
-                brace_idx = name.find('(')
-                if brace_idx >= 0:
-                    full_names.add(name[:brace_idx].strip())
-
-        return full_names
-
-
     def _add_postcode(self, postcode):
         """ Make sure the normalized postcode is present in the word table.
         """
@@ -511,7 +519,7 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
             postcode = self.normalize_postcode(postcode)
 
             if postcode not in self._cache.postcodes:
-                term = self.name_processor.get_search_normalized(postcode)
+                term = self._search_normalized(postcode)
                 if not term:
                     return
 
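Taken together, the analyzer now runs every place through the sanitizer before any tokens are computed; the old dict-splitting helper _compute_full_names becomes redundant because the split-name-list and strip-brace-terms sanitizers do that work up front. A rough sketch of the new data flow, assuming analyzer is a LegacyICUNameAnalyzer obtained from the tokenizer's name_analyzer() (private members are touched here purely for illustration):

    from nominatim.indexer.place_info import PlaceInfo

    place = PlaceInfo({'name': {'name': 'Foo;Bar'},
                       'address': {'housenumber': '3'}})

    # Step 1 inside process_place(): sanitize names and address.
    names, address = analyzer.sanitizer.process_names(place)
    # Step 2: compute tokens from the sanitized PlaceName objects.
    fulls, partials = analyzer._compute_name_tokens(names)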
@@ -2,7 +2,6 @@
 Data structures for saving variant expansions for ICU tokenizer.
 """
 from collections import namedtuple
-import json
 
 _ICU_VARIANT_PORPERTY_FIELDS = ['lang']
 
@@ -24,34 +23,3 @@ class ICUVariantProperties(namedtuple('_ICUVariantProperties', _ICU_VARIANT_PORPERTY_FIELDS)):
 
 
 ICUVariant = namedtuple('ICUVariant', ['source', 'replacement', 'properties'])
-
-
-def pickle_variant_set(variants):
-    """ Serializes an iterable of variant rules to a string.
-    """
-    # Create a list of property sets. So they don't need to be duplicated
-    properties = {}
-    pid = 1
-    for variant in variants:
-        if variant.properties not in properties:
-            properties[variant.properties] = pid
-            pid += 1
-
-    # Convert the variants into a simple list.
-    variants = [(v.source, v.replacement, properties[v.properties]) for v in variants]
-
-    # Convert everythin to json.
-    return json.dumps({'properties': {v: k._asdict() for k, v in properties.items()},
-                       'variants': variants})
-
-
-def unpickle_variant_set(variant_string):
-    """ Deserializes a variant string that was previously created with
-        pickle_variant_set() into a set of ICUVariants.
-    """
-    data = json.loads(variant_string)
-
-    properties = {int(k): ICUVariantProperties.from_rules(v)
-                  for k, v in data['properties'].items()}
-
-    return set((ICUVariant(src, repl, properties[pid]) for src, repl, pid in data['variants']))
@@ -113,7 +113,7 @@ class LegacyTokenizer(AbstractTokenizer):
         self._init_db_tables(config)
 
 
-    def init_from_project(self):
+    def init_from_project(self, _):
         """ Initialise the tokenizer from the project directory.
         """
         with connect(self.dsn) as conn:
@@ -142,7 +142,7 @@ class LegacyTokenizer(AbstractTokenizer):
                               modulepath=modulepath)
 
 
-    def check_database(self):
+    def check_database(self, _):
         """ Check that the tokenizer is set up correctly.
         """
         hint = """\
@@ -405,16 +405,15 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
         """
         token_info = _TokenInfo(self._cache)
 
-        names = place.get('name')
+        names = place.name
 
         if names:
             token_info.add_names(self.conn, names)
 
-        country_feature = place.get('country_feature')
-        if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-            self.add_country_names(country_feature.lower(), names)
+            if place.is_country():
+                self.add_country_names(place.country_code, names)
 
-        address = place.get('address')
+        address = place.address
         if address:
             self._process_place_address(token_info, address)
 
nominatim/tokenizer/place_sanitizer.py (new file, 127 lines)
@@ -0,0 +1,127 @@
+"""
+Handler for cleaning name and address tags in place information before it
+is handed to the token analysis.
+"""
+import importlib
+
+from nominatim.errors import UsageError
+
+class PlaceName:
+    """ A searchable name for a place together with properties.
+        Every name object saves the name proper and two basic properties:
+        * 'kind' describes the name of the OSM key used without any suffixes
+          (i.e. the part after the colon removed)
+        * 'suffix' contains the suffix of the OSM tag, if any. The suffix
+          is the part of the key after the first colon.
+        In addition to that, the name may have arbitrary additional attributes.
+        Which attributes are used, depends on the token analyser.
+    """
+
+    def __init__(self, name, kind, suffix):
+        self.name = name
+        self.kind = kind
+        self.suffix = suffix
+        self.attr = {}
+
+
+    def __repr__(self):
+        return f"PlaceName(name='{self.name}',kind='{self.kind}',suffix='{self.suffix}')"
+
+
+    def clone(self, name=None, kind=None, suffix=None, attr=None):
+        """ Create a deep copy of the place name, optionally with the
+            given parameters replaced. In the attribute list only the given
+            keys are updated. The list is not replaced completely.
+            In particular, the function cannot to be used to remove an
+            attribute from a place name.
+        """
+        newobj = PlaceName(name or self.name,
+                           kind or self.kind,
+                           suffix or self.suffix)
+
+        newobj.attr.update(self.attr)
+        if attr:
+            newobj.attr.update(attr)
+
+        return newobj
+
+
+    def set_attr(self, key, value):
+        """ Add the given property to the name. If the property was already
+            set, then the value is overwritten.
+        """
+        self.attr[key] = value
+
+
+    def get_attr(self, key, default=None):
+        """ Return the given property or the value of 'default' if it
+            is not set.
+        """
+        return self.attr.get(key, default)
+
+
+    def has_attr(self, key):
+        """ Check if the given attribute is set.
+        """
+        return key in self.attr
+
+
+class _ProcessInfo:
+    """ Container class for information handed into to handler functions.
+        The 'names' and 'address' members are mutable. A handler must change
+        them by either modifying the lists place or replacing the old content
+        with a new list.
+    """
+
+    def __init__(self, place):
+        self.place = place
+        self.names = self._convert_name_dict(place.name)
+        self.address = self._convert_name_dict(place.address)
+
+
+    @staticmethod
+    def _convert_name_dict(names):
+        """ Convert a dictionary of names into a list of PlaceNames.
+            The dictionary key is split into the primary part of the key
+            and the suffix (the part after an optional colon).
+        """
+        out = []
+
+        if names:
+            for key, value in names.items():
+                parts = key.split(':', 1)
+                out.append(PlaceName(value.strip(),
+                                     parts[0].strip(),
+                                     parts[1].strip() if len(parts) > 1 else None))
+
+        return out
+
+
+class PlaceSanitizer:
+    """ Controller class which applies sanitizer functions on the place
+        names and address before they are used by the token analysers.
+    """
+
+    def __init__(self, rules):
+        self.handlers = []
+
+        if rules:
+            for func in rules:
+                if 'step' not in func:
+                    raise UsageError("Sanitizer rule is missing the 'step' attribute.")
+                module_name = 'nominatim.tokenizer.sanitizers.' + func['step'].replace('-', '_')
+                handler_module = importlib.import_module(module_name)
+                self.handlers.append(handler_module.create(func))
+
+
+    def process_names(self, place):
+        """ Extract a sanitized list of names and address parts from the
+            given place. The function returns a tuple
+            (list of names, list of address names)
+        """
+        obj = _ProcessInfo(place)
+
+        for func in self.handlers:
+            func(obj)
+
+        return obj.names, obj.address
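A short end-to-end sketch of this machinery; the rule list matches the default configuration shipped with this commit and the place object is hand-built:

    from nominatim.indexer.place_info import PlaceInfo
    from nominatim.tokenizer.place_sanitizer import PlaceSanitizer

    sanitizer = PlaceSanitizer([{'step': 'split-name-list'},
                                {'step': 'strip-brace-terms'}])

    place = PlaceInfo({'name': {'name': 'Halle (Saale);Halle'}})
    names, address = sanitizer.process_names(place)

    # Three PlaceName entries: 'Halle (Saale)', 'Halle' from the split,
    # plus the brace-stripped variant 'Halle' of the first entry.
    for name in names:
        print(name)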
nominatim/tokenizer/sanitizers/__init__.py (new file, empty)
nominatim/tokenizer/sanitizers/split_name_list.py (new file, 35 lines)
@@ -0,0 +1,35 @@
+"""
+Name processor that splits name values with multiple values into their components.
+"""
+import re
+
+from nominatim.errors import UsageError
+
+def create(func):
+    """ Create a name processing function that splits name values with
+        multiple values into their components. The optional parameter
+        'delimiters' can be used to define the characters that should be used
+        for splitting. The default is ',;'.
+    """
+    delimiter_set = set(func.get('delimiters', ',;'))
+    if not delimiter_set:
+        raise UsageError("Set of delimiters in split-name-list sanitizer is empty.")
+
+    regexp = re.compile('\\s*[{}]\\s*'.format(''.join('\\' + d for d in delimiter_set)))
+
+    def _process(obj):
+        if not obj.names:
+            return
+
+        new_names = []
+        for name in obj.names:
+            split_names = regexp.split(name.name)
+            print(split_names)
+            if len(split_names) == 1:
+                new_names.append(name)
+            else:
+                new_names.extend(name.clone(name=n) for n in split_names if n)
+
+        obj.names = new_names
+
+    return _process
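The handler can be exercised in isolation; SimpleNamespace stands in for the _ProcessInfo object that PlaceSanitizer normally supplies, and the station names are made up:

    from types import SimpleNamespace
    from nominatim.tokenizer.place_sanitizer import PlaceName
    from nominatim.tokenizer.sanitizers import split_name_list

    process = split_name_list.create({'step': 'split-name-list',
                                      'delimiters': ';,'})

    obj = SimpleNamespace(names=[PlaceName('Gare de Lyon;Paris-Gare-de-Lyon',
                                           'name', None)])
    process(obj)
    # Each part becomes its own PlaceName, keeping kind and suffix.
    assert [n.name for n in obj.names] == ['Gare de Lyon', 'Paris-Gare-de-Lyon']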
nominatim/tokenizer/sanitizers/strip_brace_terms.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+"""
+Sanitizer handling names with addendums in braces.
+"""
+
+def create(_):
+    """ Create a name processing function that creates additional name variants
+        when a name has an addendum in brackets (e.g. "Halle (Saale)"). The
+        additional variant only contains the main name without the bracket part.
+    """
+    def _process(obj):
+        """ Add variants for names that have a bracket extension.
+        """
+        if obj.names:
+            new_names = []
+            for name in (n for n in obj.names if '(' in n.name):
+                new_name = name.name.split('(')[0].strip()
+                if new_name:
+                    new_names.append(name.clone(name=new_name))
+
+            obj.names.extend(new_names)
+
+    return _process
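Note that the original name is kept and the bracket-free form is appended as an extra variant. A minimal check, again with a hand-built stand-in for the process object:

    from types import SimpleNamespace
    from nominatim.tokenizer.place_sanitizer import PlaceName
    from nominatim.tokenizer.sanitizers import strip_brace_terms

    process = strip_brace_terms.create({'step': 'strip-brace-terms'})

    obj = SimpleNamespace(names=[PlaceName('Halle (Saale)', 'name', None)])
    process(obj)
    assert [n.name for n in obj.names] == ['Halle (Saale)', 'Halle']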
@@ -166,7 +166,7 @@ def check_tokenizer(_, config):
         return CheckState.FAIL, dict(msg="""\
 Cannot load tokenizer. Did the import finish sucessfully?""")
 
-    result = tokenizer.check_database()
+    result = tokenizer.check_database(config)
 
     if result is None:
         return CheckState.OK
@@ -7,12 +7,11 @@ import logging
 import os
 import tarfile
 
-import psycopg2.extras
-
 from nominatim.db.connection import connect
 from nominatim.db.async_connection import WorkerPool
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.errors import UsageError
+from nominatim.indexer.place_info import PlaceInfo
 
 LOG = logging.getLogger()
 
@@ -58,7 +57,7 @@ def handle_threaded_sql_statements(pool, fd, analyzer):
             address = dict(street=row['street'], postcode=row['postcode'])
             args = ('SRID=4326;' + row['geometry'],
                     int(row['from']), int(row['to']), row['interpolation'],
-                    psycopg2.extras.Json(analyzer.process_place(dict(address=address))),
+                    PlaceInfo({'address': address}).analyze(analyzer),
                     analyzer.normalize_postcode(row['postcode']))
         except ValueError:
             continue
@@ -24,6 +24,9 @@ transliteration:
     - "[^[:Ascii:]] >"
     - ":: lower ()"
     - ":: NFC ()"
+sanitizers:
+    - step: split-name-list
+    - step: strip-brace-terms
 variants:
     - !include icu-rules/variants-bg.yaml
     - !include icu-rules/variants-ca.yaml
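The sanitizers section is an ordered list of steps; any extra keys in an entry travel along to the handler's create() function. After YAML parsing it is the plain rule list that PlaceSanitizer receives, so the equivalent can be written directly in Python ('delimiters' is the optional parameter understood by split-name-list, shown here with its default value):

    from nominatim.tokenizer.place_sanitizer import PlaceSanitizer

    rules = [{'step': 'split-name-list', 'delimiters': ',;'},
             {'step': 'strip-brace-terms'}]
    sanitizer = PlaceSanitizer(rules)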
@@ -1,6 +1,8 @@
 """
 Tokenizer for testing.
 """
+from nominatim.indexer.place_info import PlaceInfo
+from nominatim.config import Configuration
 
 def create(dsn, data_dir):
     """ Create a new instance of the tokenizer provided by this module.
@@ -21,7 +23,8 @@ class DummyTokenizer:
         self.init_state = "new"
 
 
-    def init_from_project(self):
+    def init_from_project(self, config):
+        assert isinstance(config, Configuration)
         assert self.init_state is None
         self.init_state = "loaded"
 
@@ -68,4 +71,5 @@ class DummyNameAnalyzer:
 
     @staticmethod
     def process_place(place):
+        assert isinstance(place, PlaceInfo)
         return {}
@@ -100,6 +100,6 @@ def test_get_pg_env_overwrite_variable(monkeypatch):
 
 
 def test_get_pg_env_ignore_unknown():
-    env = get_pg_env('tty=stuff', base_env={})
+    env = get_pg_env('client_encoding=stuff', base_env={})
 
     assert env == {}
@@ -29,6 +29,7 @@ class IndexerTestDB:
|
|||||||
indexed_date TIMESTAMP,
|
indexed_date TIMESTAMP,
|
||||||
partition SMALLINT,
|
partition SMALLINT,
|
||||||
admin_level SMALLINT,
|
admin_level SMALLINT,
|
||||||
|
country_code TEXT,
|
||||||
address HSTORE,
|
address HSTORE,
|
||||||
token_info JSONB,
|
token_info JSONB,
|
||||||
geometry_sector INTEGER)""")
|
geometry_sector INTEGER)""")
|
||||||
@@ -54,15 +55,26 @@ class IndexerTestDB:
                            END IF;
                            RETURN NEW;
                            END; $$ LANGUAGE plpgsql;""")
-            cur.execute("""CREATE OR REPLACE FUNCTION placex_prepare_update(p placex,
-                                                      OUT name HSTORE,
-                                                      OUT address HSTORE,
-                                                      OUT country_feature VARCHAR,
-                                                      OUT linked_place_id BIGINT)
+            cur.execute("DROP TYPE IF EXISTS prepare_update_info CASCADE")
+            cur.execute("""CREATE TYPE prepare_update_info AS (
+                             name HSTORE,
+                             address HSTORE,
+                             rank_address SMALLINT,
+                             country_code TEXT,
+                             class TEXT,
+                             type TEXT,
+                             linked_place_id BIGINT
+                           )""")
+            cur.execute("""CREATE OR REPLACE FUNCTION placex_indexing_prepare(p placex,
+                                                      OUT result prepare_update_info)
                            AS $$
                            BEGIN
-                             address := p.address;
-                             name := p.name;
+                             result.address := p.address;
+                             result.name := p.name;
+                             result.class := p.class;
+                             result.type := p.type;
+                             result.country_code := p.country_code;
+                             result.rank_address := p.rank_address;
                            END;
                            $$ LANGUAGE plpgsql STABLE;
                         """)
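With the OUT parameters folded into the composite prepare_update_info type, a caller can select the prepared record and receive every field as a named column. A sketch, not part of the patch, of what such a call could look like from Python with psycopg2; the DSN and place_id are placeholders:

import psycopg2
import psycopg2.extras

conn = psycopg2.connect('dbname=nominatim')    # placeholder DSN
psycopg2.extras.register_hstore(conn)          # name/address are HSTORE columns

with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
    # A composite-returning function expands into one column per field
    # when wrapped in (...).* .
    cur.execute("""SELECT (placex_indexing_prepare(px)).*
                     FROM placex px WHERE place_id = %s""", (1234,))
    row = cur.fetchone()
    # Available fields: row['name'], row['address'], row['rank_address'],
    # row['country_code'], row['class'], row['type'], row['linked_place_id']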
@@ -7,10 +7,10 @@ import yaml
 import pytest
 
 from nominatim.tokenizer import icu_tokenizer
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.db import properties
 from nominatim.db.sql_preprocessor import SQLPreprocessor
+from nominatim.indexer.place_info import PlaceInfo
 
 from mock_icu_word_table import MockIcuWordTable
 
@@ -67,11 +67,14 @@ def analyzer(tokenizer_factory, test_config, monkeypatch,
     monkeypatch.undo()
 
 def _mk_analyser(norm=("[[:Punctuation:][:Space:]]+ > ' '",), trans=(':: upper()',),
-                 variants=('~gasse -> gasse', 'street => st', )):
+                 variants=('~gasse -> gasse', 'street => st', ),
+                 sanitizers=[]):
     cfgstr = {'normalization' : list(norm),
-              'transliteration' : list(trans),
-              'variants' : [ {'words': list(variants)}]}
-    tok.naming_rules = ICUNameProcessorRules(loader=ICURuleLoader(cfgstr))
+              'sanitizers' : sanitizers,
+              'transliteration' : list(trans),
+              'variants' : [ {'words': list(variants)}]}
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(cfgstr))
+    tok.loader = ICURuleLoader(test_config)
 
     return tok.name_analyzer()
 
@@ -177,9 +180,9 @@ def test_init_from_project(monkeypatch, test_config, tokenizer_factory):
     monkeypatch.undo()
 
     tok = tokenizer_factory()
-    tok.init_from_project()
+    tok.init_from_project(test_config)
 
-    assert tok.naming_rules is not None
+    assert tok.loader is not None
     assert tok.term_normalization == ':: lower();'
 
 
@@ -308,44 +311,54 @@ class TestPlaceNames:
 
     @pytest.fixture(autouse=True)
     def setup(self, analyzer, sql_functions):
-        with analyzer() as anl:
+        sanitizers = [{'step': 'split-name-list'},
+                      {'step': 'strip-brace-terms'}]
+        with analyzer(sanitizers=sanitizers) as anl:
             self.analyzer = anl
             yield anl
 
 
     def expect_name_terms(self, info, *expected_terms):
         tokens = self.analyzer.get_word_token_info(expected_terms)
-        print (tokens)
         for token in tokens:
             assert token[2] is not None, "No token for {0}".format(token)
 
         assert eval(info['names']) == set((t[2] for t in tokens))
 
 
+    def process_named_place(self, names):
+        return self.analyzer.process_place(PlaceInfo({'name': names}))
 
 
     def test_simple_names(self):
-        info = self.analyzer.process_place({'name': {'name': 'Soft bAr', 'ref': '34'}})
+        info = self.process_named_place({'name': 'Soft bAr', 'ref': '34'})
 
         self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34')
 
 
     @pytest.mark.parametrize('sep', [',' , ';'])
     def test_names_with_separator(self, sep):
-        info = self.analyzer.process_place({'name': {'name': sep.join(('New York', 'Big Apple'))}})
+        info = self.process_named_place({'name': sep.join(('New York', 'Big Apple'))})
 
         self.expect_name_terms(info, '#New York', '#Big Apple',
                                'new', 'york', 'big', 'apple')
 
 
     def test_full_names_with_bracket(self):
-        info = self.analyzer.process_place({'name': {'name': 'Houseboat (left)'}})
+        info = self.process_named_place({'name': 'Houseboat (left)'})
 
         self.expect_name_terms(info, '#Houseboat (left)', '#Houseboat',
                                'houseboat', 'left')
 
 
     def test_country_name(self, word_table):
-        info = self.analyzer.process_place({'name': {'name': 'Norge'},
-                                            'country_feature': 'no'})
+        place = PlaceInfo({'name' : {'name': 'Norge'},
+                           'country_code': 'no',
+                           'rank_address': 4,
+                           'class': 'boundary',
+                           'type': 'administrative'})
+
+        info = self.analyzer.process_place(place)
 
         self.expect_name_terms(info, '#norge', 'norge')
         assert word_table.get_country() == {('no', 'NORGE')}
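Note the changed contract documented by test_country_name: country tokens are no longer requested through the synthetic 'country_feature' entry; the analyzer now decides from the ordinary place attributes that every PlaceInfo payload carries. Condensed before/after, restated from the test above:

# before: plain dict plus a synthetic 'country_feature' marker
info = analyzer.process_place({'name': {'name': 'Norge'},
                               'country_feature': 'no'})

# after: a PlaceInfo wrapper carrying the regular place attributes
info = analyzer.process_place(PlaceInfo({'name': {'name': 'Norge'},
                                         'country_code': 'no',
                                         'rank_address': 4,
                                         'class': 'boundary',
                                         'type': 'administrative'}))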
@@ -361,7 +374,7 @@ class TestPlaceAddress:
 
 
     def process_address(self, **kwargs):
-        return self.analyzer.process_place({'address': kwargs})
+        return self.analyzer.process_place(PlaceInfo({'address': kwargs}))
 
 
     def name_token_set(self, *expected_terms):
@@ -4,15 +4,17 @@ Tests for import name normalisation and variant generation.
 from textwrap import dedent
 
 import pytest
-import yaml
 
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
-from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
 
 from nominatim.errors import UsageError
 
 @pytest.fixture
-def cfgfile():
+def cfgfile(def_config, tmp_path):
+    project_dir = tmp_path / 'project_dir'
+    project_dir.mkdir()
+    def_config.project_dir = project_dir
 
     def _create_config(*variants, **kwargs):
         content = dedent("""\
         normalization:
@@ -30,7 +32,9 @@ def cfgfile():
         content += '\n'.join((" - " + s for s in variants)) + '\n'
         for k, v in kwargs:
             content += " {}: {}\n".format(k, v)
-        return yaml.safe_load(content)
+        (project_dir / 'icu_tokenizer.yaml').write_text(content)
+
+        return def_config
 
     return _create_config
 
@@ -40,10 +44,9 @@ def get_normalized_variants(proc, name):
 
 
 def test_variants_empty(cfgfile):
-    fpath = cfgfile('saint -> 🜵', 'street -> st')
+    config = cfgfile('saint -> 🜵', 'street -> st')
 
-    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
-    proc = ICUNameProcessor(rules)
+    proc = ICURuleLoader(config).make_token_analysis()
 
     assert get_normalized_variants(proc, '🜵') == []
     assert get_normalized_variants(proc, '🜳') == []
@@ -83,8 +86,8 @@ VARIANT_TESTS = [
 
 @pytest.mark.parametrize("rules,name,variants", VARIANT_TESTS)
 def test_variants(cfgfile, rules, name, variants):
-    fpath = cfgfile(*rules)
-    proc = ICUNameProcessor(ICUNameProcessorRules(loader=ICURuleLoader(fpath)))
+    config = cfgfile(*rules)
+    proc = ICURuleLoader(config).make_token_analysis()
 
     result = get_normalized_variants(proc, name)
 
@@ -93,10 +96,8 @@ def test_variants(cfgfile, rules, name, variants):
 
 
 def test_search_normalized(cfgfile):
-    fpath = cfgfile('~street => s,st', 'master => mstr')
-
-    rules = ICUNameProcessorRules(loader=ICURuleLoader(fpath))
-    proc = ICUNameProcessor(rules)
+    config = cfgfile('~street => s,st', 'master => mstr')
+    proc = ICURuleLoader(config).make_token_analysis()
 
     assert proc.get_search_normalized('Master Street') == 'master street'
     assert proc.get_search_normalized('Earnes St') == 'earnes st'
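These tests pin down the API shift: the explicit two-step construction via ICUNameProcessorRules/ICUNameProcessor is gone, and the rule loader now reads icu_tokenizer.yaml from the project directory and builds the analysis object itself. Side by side:

# old construction (removed by this patch)
rules = ICUNameProcessorRules(loader=ICURuleLoader(cfg_dict))
proc = ICUNameProcessor(rules)

# new construction: the loader takes the project configuration
# and hands out the analysis object directly
proc = ICURuleLoader(config).make_token_analysis()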
@@ -12,7 +12,16 @@ from nominatim.errors import UsageError
 from icu import Transliterator
 
 @pytest.fixture
-def cfgrules():
+def test_config(def_config, tmp_path):
+    project_dir = tmp_path / 'project_dir'
+    project_dir.mkdir()
+    def_config.project_dir = project_dir
+
+    return def_config
+
+
+@pytest.fixture
+def cfgrules(test_config):
     def _create_config(*variants, **kwargs):
         content = dedent("""\
         normalization:
@@ -29,19 +38,21 @@ def cfgrules():
         content += '\n'.join((" - " + s for s in variants)) + '\n'
         for k, v in kwargs:
             content += " {}: {}\n".format(k, v)
-        return yaml.safe_load(content)
+        (test_config.project_dir / 'icu_tokenizer.yaml').write_text(content)
+
+        return test_config
 
     return _create_config
 
 
-def test_empty_rule_set():
-    rule_cfg = yaml.safe_load(dedent("""\
+def test_empty_rule_set(test_config):
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
         normalization:
         transliteration:
        variants:
         """))
 
-    rules = ICURuleLoader(rule_cfg)
+    rules = ICURuleLoader(test_config)
     assert rules.get_search_rules() == ''
     assert rules.get_normalization_rules() == ''
     assert rules.get_transliteration_rules() == ''
@@ -50,11 +61,12 @@ def test_empty_rule_set():
 CONFIG_SECTIONS = ('normalization', 'transliteration', 'variants')
 
 @pytest.mark.parametrize("section", CONFIG_SECTIONS)
-def test_missing_section(section):
+def test_missing_section(section, test_config):
     rule_cfg = { s: {} for s in CONFIG_SECTIONS if s != section}
+    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))
 
     with pytest.raises(UsageError):
-        ICURuleLoader(rule_cfg)
+        ICURuleLoader(test_config)
 
 
 def test_get_search_rules(cfgrules):
@@ -88,9 +100,8 @@ def test_get_transliteration_rules(cfgrules):
     assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
 
 
-def test_transliteration_rules_from_file(def_config, tmp_path):
-    def_config.project_dir = tmp_path
-    cfgpath = tmp_path / ('test_config.yaml')
+def test_transliteration_rules_from_file(test_config):
+    cfgpath = test_config.project_dir / ('icu_tokenizer.yaml')
     cfgpath.write_text(dedent("""\
         normalization:
         transliteration:
@@ -98,10 +109,10 @@ def test_transliteration_rules_from_file(def_config, tmp_path):
         - !include transliteration.yaml
         variants:
         """))
-    transpath = tmp_path / ('transliteration.yaml')
+    transpath = test_config.project_dir / ('transliteration.yaml')
     transpath.write_text('- "x > y"')
 
-    loader = ICURuleLoader(def_config.load_sub_configuration('test_config.yaml'))
+    loader = ICURuleLoader(test_config)
     rules = loader.get_transliteration_rules()
     trans = Transliterator.createFromRules("test", rules)
 
@@ -5,6 +5,7 @@ import shutil
 
 import pytest
 
+from nominatim.indexer.place_info import PlaceInfo
 from nominatim.tokenizer import legacy_tokenizer
 from nominatim.db import properties
 from nominatim.errors import UsageError
@@ -131,10 +132,10 @@ def test_init_module_custom(tokenizer_factory, test_config,
     assert not (test_config.project_dir / 'module').exists()
 
 
-def test_init_from_project(tokenizer_setup, tokenizer_factory):
+def test_init_from_project(tokenizer_setup, tokenizer_factory, test_config):
     tok = tokenizer_factory()
 
-    tok.init_from_project()
+    tok.init_from_project(test_config)
 
     assert tok.normalization is not None
 
@@ -284,21 +285,21 @@ def test_add_more_country_names(analyzer, word_table, make_standard_name):
 
 
 def test_process_place_names(analyzer, make_keywords):
-    info = analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
+    info = analyzer.process_place(PlaceInfo({'name' : {'name' : 'Soft bAr', 'ref': '34'}}))
 
     assert info['names'] == '{1,2,3}'
 
 
 @pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
 def test_process_place_postcode(analyzer, create_postcode_id, word_table, pcode):
-    analyzer.process_place({'address': {'postcode' : pcode}})
+    analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}}))
 
     assert word_table.get_postcodes() == {pcode, }
 
 
 @pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
 def test_process_place_bad_postcode(analyzer, create_postcode_id, word_table, pcode):
-    analyzer.process_place({'address': {'postcode' : pcode}})
+    analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}}))
 
     assert not word_table.get_postcodes()
 
@@ -319,7 +320,7 @@ class TestHousenumberName:
     @staticmethod
     @pytest.mark.parametrize('hnr', ['123a', '1', '101'])
     def test_process_place_housenumbers_simple(analyzer, hnr):
-        info = analyzer.process_place({'address': {'housenumber' : hnr}})
+        info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : hnr}}))
 
         assert info['hnr'] == hnr
         assert info['hnr_tokens'].startswith("{")
@@ -327,15 +328,15 @@ class TestHousenumberName:
 
     @staticmethod
     def test_process_place_housenumbers_lists(analyzer):
-        info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}})
+        info = analyzer.process_place(PlaceInfo({'address': {'conscriptionnumber' : '1; 2;3'}}))
 
         assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
 
 
     @staticmethod
     def test_process_place_housenumbers_duplicates(analyzer):
-        info = analyzer.process_place({'address': {'housenumber' : '134',
-                                                   'conscriptionnumber' : '134',
-                                                   'streetnumber' : '99a'}})
+        info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : '134',
+                                                             'conscriptionnumber' : '134',
+                                                             'streetnumber' : '99a'}}))
 
         assert set(info['hnr'].split(';')) == set(('134', '99a'))
@@ -53,7 +53,7 @@ def test_check_tokenizer(temp_db_conn, def_config, monkeypatch,
                          check_result, state):
     class _TestTokenizer:
         @staticmethod
-        def check_database():
+        def check_database(_):
             return check_result
 
     monkeypatch.setattr(chkdb.tokenizer_factory, 'get_tokenizer_for_db',
test/python/tokenizer/sanitizers/test_split_name_list.py (new file, 65 lines)
@@ -0,0 +1,65 @@
+"""
+Tests for the sanitizer that splits multivalue lists.
+"""
+import pytest
+
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.indexer.place_info import PlaceInfo
+
+from nominatim.errors import UsageError
+
+def run_sanitizer_on(**kwargs):
+    place = PlaceInfo({'name': kwargs})
+    name, _ = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place)
+
+    return sorted([(p.name, p.kind, p.suffix) for p in name])
+
+
+def sanitize_with_delimiter(delimiter, name):
+    place = PlaceInfo({'name': {'name': name}})
+    san = PlaceSanitizer([{'step': 'split-name-list', 'delimiters': delimiter}])
+    name, _ = san.process_names(place)
+
+    return sorted([p.name for p in name])
+
+
+def test_simple():
+    assert run_sanitizer_on(name='ABC') == [('ABC', 'name', None)]
+    assert run_sanitizer_on(name='') == [('', 'name', None)]
+
+
+def test_splits():
+    assert run_sanitizer_on(name='A;B;C') == [('A', 'name', None),
+                                              ('B', 'name', None),
+                                              ('C', 'name', None)]
+    assert run_sanitizer_on(short_name=' House, boat ') == [('House', 'short_name', None),
+                                                            ('boat', 'short_name', None)]
+
+
+def test_empty_fields():
+    assert run_sanitizer_on(name='A;;B') == [('A', 'name', None),
+                                             ('B', 'name', None)]
+    assert run_sanitizer_on(name='A; ,B') == [('A', 'name', None),
+                                              ('B', 'name', None)]
+    assert run_sanitizer_on(name=' ;B') == [('B', 'name', None)]
+    assert run_sanitizer_on(name='B,') == [('B', 'name', None)]
+
+
+def test_custom_delimiters():
+    assert sanitize_with_delimiter(':', '12:45,3') == ['12', '45,3']
+    assert sanitize_with_delimiter('\\', 'a;\\b!#@ \\') == ['a;', 'b!#@']
+    assert sanitize_with_delimiter('[]', 'foo[to]be') == ['be', 'foo', 'to']
+    assert sanitize_with_delimiter(' ', 'morning sun') == ['morning', 'sun']
+
+
+def test_empty_delimiter_set():
+    with pytest.raises(UsageError):
+        sanitize_with_delimiter('', 'abc')
+
+
+def test_no_name_list():
+    place = PlaceInfo({'address': {'housenumber': '3'}})
+    name, address = PlaceSanitizer([{'step': 'split-name-list'}]).process_names(place)
+
+    assert not name
+    assert len(address) == 1
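The behaviour pinned down by these tests, condensed into a sketch of the splitting rule; the function name and exact implementation are ours, not the module's. Values are split on the configured delimiters (the tests suggest ',' and ';' as defaults); parts are stripped of surrounding whitespace and empty parts dropped; a value without any delimiter passes through untouched, which is why the empty name in test_simple survives:

import re

def split_name_value(value, delimiters=',;'):
    # Hypothetical helper illustrating the split rule implied by the tests.
    regexp = re.compile('[{}]'.format(re.escape(delimiters)))
    if not regexp.search(value):
        return [value]          # no delimiter: keep as-is, even if empty
    return [p for p in (s.strip() for s in regexp.split(value)) if p]

assert split_name_value('A;B;C') == ['A', 'B', 'C']
assert split_name_value(' House, boat ') == ['House', 'boat']
assert split_name_value('A;;B') == ['A', 'B']
assert split_name_value('') == ['']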
test/python/tokenizer/sanitizers/test_strip_brace_terms.py (new file, 44 lines)
@@ -0,0 +1,44 @@
+"""
+Tests for the sanitizer that handles braced suffixes.
+"""
+import pytest
+
+from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
+from nominatim.indexer.place_info import PlaceInfo
+
+def run_sanitizer_on(**kwargs):
+    place = PlaceInfo({'name': kwargs})
+    name, _ = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place)
+
+    return sorted([(p.name, p.kind, p.suffix) for p in name])
+
+
+def test_no_braces():
+    assert run_sanitizer_on(name='foo', ref='23') == [('23', 'ref', None),
+                                                      ('foo', 'name', None)]
+
+
+def test_simple_braces():
+    assert run_sanitizer_on(name='Halle (Saale)', ref='3')\
+           == [('3', 'ref', None), ('Halle', 'name', None), ('Halle (Saale)', 'name', None)]
+    assert run_sanitizer_on(name='ack ( bar')\
+           == [('ack', 'name', None), ('ack ( bar', 'name', None)]
+
+
+def test_only_braces():
+    assert run_sanitizer_on(name='(maybe)') == [('(maybe)', 'name', None)]
+
+
+def test_double_braces():
+    assert run_sanitizer_on(name='a((b))') == [('a', 'name', None),
+                                               ('a((b))', 'name', None)]
+    assert run_sanitizer_on(name='a (b) (c)') == [('a', 'name', None),
+                                                  ('a (b) (c)', 'name', None)]
+
+
+def test_no_names():
+    place = PlaceInfo({'address': {'housenumber': '3'}})
+    name, address = PlaceSanitizer([{'step': 'strip-brace-terms'}]).process_names(place)
+
+    assert not name
+    assert len(address) == 1
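A behavioural sketch of strip-brace-terms as implied by these tests; the function name and implementation are ours, not the module's. For each name containing an opening brace, an additional variant is added that cuts the name at the first '(' (trailing whitespace removed) while the original is kept; names that would become empty, like '(maybe)', yield no extra variant:

def brace_variants(name):
    # Hypothetical helper illustrating the rule implied by the tests.
    variants = [name]
    prefix = name.split('(')[0].rstrip()
    if prefix and prefix != name:
        variants.append(prefix)
    return variants

assert brace_variants('Halle (Saale)') == ['Halle (Saale)', 'Halle']
assert brace_variants('ack ( bar') == ['ack ( bar', 'ack']
assert brace_variants('(maybe)') == ['(maybe)']
assert brace_variants('a (b) (c)') == ['a (b) (c)', 'a']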
test/python/tokenizer/test_place_sanitizer.py (new file, 71 lines)
@@ -0,0 +1,71 @@
+"""
+Tests for execution of the sanitization step.
+"""
+import pytest
+
+from nominatim.errors import UsageError
+import nominatim.tokenizer.place_sanitizer as sanitizer
+from nominatim.indexer.place_info import PlaceInfo
+
+
+def test_placeinfo_clone_new_name():
+    place = sanitizer.PlaceName('foo', 'ki', 'su')
+
+    newplace = place.clone(name='bar')
+
+    assert place.name == 'foo'
+    assert newplace.name == 'bar'
+    assert newplace.kind == 'ki'
+    assert newplace.suffix == 'su'
+
+
+def test_placeinfo_clone_merge_attr():
+    place = sanitizer.PlaceName('foo', 'ki', 'su')
+    place.set_attr('a1', 'v1')
+    place.set_attr('a2', 'v2')
+
+    newplace = place.clone(attr={'a2': 'new', 'b2': 'foo'})
+
+    assert place.get_attr('a2') == 'v2'
+    assert place.get_attr('b2') is None
+    assert newplace.get_attr('a1') == 'v1'
+    assert newplace.get_attr('a2') == 'new'
+    assert newplace.get_attr('b2') == 'foo'
+
+
+def test_placeinfo_has_attr():
+    place = sanitizer.PlaceName('foo', 'ki', 'su')
+    place.set_attr('a1', 'v1')
+
+    assert place.has_attr('a1')
+    assert not place.has_attr('whatever')
+
+
+def test_sanitizer_default():
+    san = sanitizer.PlaceSanitizer([{'step': 'split-name-list'}])
+
+    name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'},
+                                                 'address': {'street': 'Bald'}}))
+
+    assert len(name) == 3
+    assert all(isinstance(n, sanitizer.PlaceName) for n in name)
+    assert all(n.kind == 'name' for n in name)
+    assert all(n.suffix == 'de:de' for n in name)
+
+    assert len(address) == 1
+    assert all(isinstance(n, sanitizer.PlaceName) for n in address)
+
+
+@pytest.mark.parametrize('rules', [None, []])
+def test_sanitizer_empty_list(rules):
+    san = sanitizer.PlaceSanitizer(rules)
+
+    name, address = san.process_names(PlaceInfo({'name': {'name:de:de': '1;2;3'}}))
+
+    assert len(name) == 1
+    assert all(isinstance(n, sanitizer.PlaceName) for n in name)
+
+
+def test_sanitizer_missing_step_definition():
+    with pytest.raises(UsageError):
+        san = sanitizer.PlaceSanitizer([{'id': 'split-name-list'}])
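For reference, a minimal sketch of the PlaceName object these tests exercise; attribute and method names are taken from the tests, the implementation details are ours, and the real class lives in nominatim.tokenizer.place_sanitizer. For a key like name:de:de the kind is 'name' and the suffix 'de:de':

class PlaceName:
    """ Sketch of a sanitized tag value with kind/suffix split off the key. """

    def __init__(self, name, kind, suffix):
        self.name = name
        self.kind = kind
        self.suffix = suffix
        self.attr = {}

    def clone(self, name=None, attr=None):
        # Copy the entry; optionally replace the name and merge extra
        # attributes without touching the original.
        newobj = PlaceName(self.name if name is None else name,
                           self.kind, self.suffix)
        newobj.attr.update(self.attr)
        if attr:
            newobj.attr.update(attr)
        return newobj

    def set_attr(self, key, value):
        self.attr[key] = value

    def get_attr(self, key, default=None):
        return self.attr.get(key, default)

    def has_attr(self, key):
        return key in self.attr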