add type annotations for token analysis

No annotations for ICU types yet.
Sarah Hoffmann
2022-07-13 17:18:53 +02:00
parent 62eedbb8f6
commit d35e3c25b6
6 changed files with 94 additions and 35 deletions

nominatim/tokenizer/token_analysis/base.py (new file)

@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2022 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Common data types and protocols for analysers.
+"""
+from typing import TypeVar, Mapping, List, Any
+
+from typing_extensions import Protocol
+
+T_config = TypeVar('T_config') # pylint: disable=invalid-name
+
+class Analyser(Protocol):
+    """ Instance of the token analyser.
+    """
+
+    def normalize(self, name: str) -> str:
+        """ Return the normalized form of the name. This is the standard form
+            from which possible variants for the name can be derived.
+        """
+
+    def get_variants_ascii(self, norm_name: str) -> List[str]:
+        """ Compute the spelling variants for the given normalized name
+            and transliterate the result.
+        """
+
+
+class AnalysisModule(Protocol[T_config]):
+    """ Protocol for analysis modules.
+    """
+
+    def configure(self, rules: Mapping[str, Any], normalization_rules: str) -> T_config:
+        """ Prepare the configuration of the analysis module.
+
+            This function should prepare all data that can be shared
+            between instances of this analyser.
+        """
+
+    def create(self, normalizer: Any, transliterator: Any, config: T_config) -> Analyser:
+        """ Create a new instance of the analyser.
+
+            A separate instance of the analyser is created for each thread
+            when used in multi-threading context.
+        """

nominatim/tokenizer/token_analysis/config_variants.py

@@ -7,7 +7,8 @@
 """
 Parser for configuration for variants.
 """
-from collections import defaultdict, namedtuple
+from typing import Any, Iterator, Tuple, List, Optional, Set, NamedTuple
+from collections import defaultdict
 import itertools
 import re
@@ -16,9 +17,15 @@ from icu import Transliterator
 from nominatim.config import flatten_config_list
 from nominatim.errors import UsageError
 
-ICUVariant = namedtuple('ICUVariant', ['source', 'replacement'])
+class ICUVariant(NamedTuple):
+    """ A single replacement rule for variant creation.
+    """
+    source: str
+    replacement: str
 
 
-def get_variant_config(rules, normalization_rules):
+def get_variant_config(in_rules: Any,
+                       normalization_rules: str) -> Tuple[List[Tuple[str, List[str]]], str]:
     """ Convert the variant definition from the configuration into
         replacement sets.
@@ -26,11 +33,11 @@ def get_variant_config(rules, normalization_rules):
         used in the replacements.
     """
     immediate = defaultdict(list)
-    chars = set()
+    chars: Set[str] = set()
 
-    if rules:
-        vset = set()
-        rules = flatten_config_list(rules, 'variants')
+    if in_rules:
+        vset: Set[ICUVariant] = set()
+        rules = flatten_config_list(in_rules, 'variants')
 
         vmaker = _VariantMaker(normalization_rules)
@@ -56,12 +63,12 @@ class _VariantMaker:
         All text in rules is normalized to make sure the variants match later.
     """
 
-    def __init__(self, norm_rules):
+    def __init__(self, norm_rules: Any) -> None:
         self.norm = Transliterator.createFromRules("rule_loader_normalization",
                                                    norm_rules)
 
 
-    def compute(self, rule):
+    def compute(self, rule: Any) -> Iterator[ICUVariant]:
         """ Generator for all ICUVariant tuples from a single variant rule.
         """
         parts = re.split(r'(\|)?([=-])>', rule)
@@ -85,7 +92,7 @@ class _VariantMaker:
             yield ICUVariant(froms, tos)
 
 
-    def _parse_variant_word(self, name):
+    def _parse_variant_word(self, name: str) -> Optional[Tuple[str, str, str]]:
         name = name.strip()
         match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
         if match is None or (match.group(1) == '~' and match.group(3) == '~'):
@@ -102,7 +109,8 @@ _FLAG_MATCH = {'^': '^ ',
                '': ' '}
 
 
-def _create_variants(src, preflag, postflag, repl, decompose):
+def _create_variants(src: str, preflag: str, postflag: str,
+                     repl: str, decompose: bool) -> Iterator[Tuple[str, str]]:
     if preflag == '~':
         postfix = _FLAG_MATCH[postflag]
         # suffix decomposition
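Replacing collections.namedtuple with a typing.NamedTuple subclass changes nothing at runtime but gives each field a type that mypy can check. A small illustrative comparison (not from the codebase):

    import collections
    from typing import NamedTuple

    # Untyped: mypy sees both fields as Any.
    OldVariant = collections.namedtuple('OldVariant', ['source', 'replacement'])

    class NewVariant(NamedTuple):
        """ Typed: mypy checks that both fields are strings. """
        source: str
        replacement: str

    v = NewVariant(source='street', replacement='st')
    assert v.replacement == 'st'    # attribute access works as before
    assert v == ('street', 'st')    # still an ordinary tuple at runtime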

nominatim/tokenizer/token_analysis/generic.py

@@ -7,6 +7,7 @@
 """
 Generic processor for names that creates abbreviation variants.
 """
+from typing import Mapping, Dict, Any, Iterable, Iterator, Optional, List, cast
 import itertools
 
 import datrie
@@ -17,10 +18,10 @@ from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
 
 ### Configuration section
 
-def configure(rules, normalization_rules):
+def configure(rules: Mapping[str, Any], normalization_rules: str) -> Dict[str, Any]:
     """ Extract and preprocess the configuration for this module.
     """
-    config = {}
+    config: Dict[str, Any] = {}
 
     config['replacements'], config['chars'] = get_variant_config(rules.get('variants'),
                                                                  normalization_rules)
@@ -47,7 +48,8 @@ def configure(rules, normalization_rules):
 
 ### Analysis section
 
-def create(normalizer, transliterator, config):
+def create(normalizer: Any, transliterator: Any,
+           config: Mapping[str, Any]) -> 'GenericTokenAnalysis':
     """ Create a new token analysis instance for this module.
     """
     return GenericTokenAnalysis(normalizer, transliterator, config)
@@ -58,7 +60,7 @@ class GenericTokenAnalysis:
         and provides the functions to apply the transformations.
     """
 
-    def __init__(self, norm, to_ascii, config):
+    def __init__(self, norm: Any, to_ascii: Any, config: Mapping[str, Any]) -> None:
         self.norm = norm
         self.to_ascii = to_ascii
         self.variant_only = config['variant_only']
@@ -75,14 +77,14 @@ class GenericTokenAnalysis:
         self.mutations = [MutationVariantGenerator(*cfg) for cfg in config['mutations']]
 
 
-    def normalize(self, name):
+    def normalize(self, name: str) -> str:
         """ Return the normalized form of the name. This is the standard form
            from which possible variants for the name can be derived.
         """
-        return self.norm.transliterate(name).strip()
+        return cast(str, self.norm.transliterate(name)).strip()
 
 
-    def get_variants_ascii(self, norm_name):
+    def get_variants_ascii(self, norm_name: str) -> List[str]:
         """ Compute the spelling variants for the given normalized name
            and transliterate the result.
         """
@@ -94,7 +96,8 @@ class GenericTokenAnalysis:
         return [name for name in self._transliterate_unique_list(norm_name, variants) if name]
 
 
-    def _transliterate_unique_list(self, norm_name, iterable):
+    def _transliterate_unique_list(self, norm_name: str,
+                                   iterable: Iterable[str]) -> Iterator[Optional[str]]:
         seen = set()
         if self.variant_only:
             seen.add(norm_name)
@@ -105,7 +108,7 @@ class GenericTokenAnalysis:
             yield self.to_ascii.transliterate(variant).strip()
 
 
-    def _generate_word_variants(self, norm_name):
+    def _generate_word_variants(self, norm_name: str) -> Iterable[str]:
         baseform = '^ ' + norm_name + ' ^'
         baselen = len(baseform)
         partials = ['']
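Since PyICU ships without type stubs (hence "No annotations for ICU types yet" in the commit message), everything returned by a Transliterator is Any to mypy. The cast(str, ...) calls pin the result type for the checker at zero runtime cost. A minimal sketch of the pattern, with a hypothetical function name:

    from typing import Any, cast

    def normalized(transliterator: Any, name: str) -> str:
        # transliterate() returns Any because PyICU is untyped; without the
        # cast, the str-ness of the result would be invisible to mypy.
        # At runtime, cast() simply returns its argument unchanged.
        return cast(str, transliterator.transliterate(name)).strip()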

nominatim/tokenizer/token_analysis/generic_mutation.py

@@ -7,6 +7,7 @@
 """
 Creator for mutation variants for the generic token analysis.
 """
+from typing import Sequence, Iterable, Iterator, Tuple
 import itertools
 import logging
 import re
@@ -15,7 +16,7 @@ from nominatim.errors import UsageError
 
 LOG = logging.getLogger()
 
-def _zigzag(outer, inner):
+def _zigzag(outer: Iterable[str], inner: Iterable[str]) -> Iterator[str]:
     return itertools.chain.from_iterable(itertools.zip_longest(outer, inner, fillvalue=''))
@@ -26,7 +27,7 @@ class MutationVariantGenerator:
         patterns.
     """
 
-    def __init__(self, pattern, replacements):
+    def __init__(self, pattern: str, replacements: Sequence[str]):
         self.pattern = re.compile(pattern)
         self.replacements = replacements
@@ -36,7 +37,7 @@ class MutationVariantGenerator:
             raise UsageError("Bad mutation pattern in configuration.")
 
 
-    def generate(self, names):
+    def generate(self, names: Iterable[str]) -> Iterator[str]:
         """ Generator function for the name variants. 'names' is an iterable
             over a set of names for which the variants are to be generated.
         """
@@ -49,7 +50,7 @@ class MutationVariantGenerator:
             yield ''.join(_zigzag(parts, seps))
 
 
-    def _fillers(self, num_parts):
+    def _fillers(self, num_parts: int) -> Iterator[Tuple[str, ...]]:
         """ Returns a generator for strings to join the given number of string
             parts in all possible combinations.
         """

nominatim/tokenizer/token_analysis/housenumbers.py

@@ -8,6 +8,7 @@
 Specialized processor for housenumbers. Analyses common housenumber patterns
 and creates variants for them.
 """
+from typing import Mapping, Any, List, cast
 import re
 
 from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
@@ -19,14 +20,14 @@ RE_NAMED_PART = re.compile(r'[a-z]{4}')
 
 ### Configuration section
 
-def configure(rules, normalization_rules): # pylint: disable=W0613
+def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613
     """ All behaviour is currently hard-coded.
     """
     return None
 
 ### Analysis section
 
-def create(normalizer, transliterator, config): # pylint: disable=W0613
+def create(normalizer: Any, transliterator: Any, config: None) -> 'HousenumberTokenAnalysis': # pylint: disable=W0613
     """ Create a new token analysis instance for this module.
     """
     return HousenumberTokenAnalysis(normalizer, transliterator)
@@ -35,20 +36,20 @@ def create(normalizer, transliterator, config): # pylint: disable=W0613
 class HousenumberTokenAnalysis:
     """ Detects common housenumber patterns and normalizes them.
     """
-    def __init__(self, norm, trans):
+    def __init__(self, norm: Any, trans: Any) -> None:
         self.norm = norm
         self.trans = trans
 
         self.mutator = MutationVariantGenerator('␣', (' ', ''))
 
-    def normalize(self, name):
+    def normalize(self, name: str) -> str:
         """ Return the normalized form of the housenumber.
         """
         # shortcut for number-only numbers, which make up 90% of the data.
         if RE_NON_DIGIT.search(name) is None:
             return name
 
-        norm = self.trans.transliterate(self.norm.transliterate(name))
+        norm = cast(str, self.trans.transliterate(self.norm.transliterate(name)))
         # If there is a significant non-numeric part, use as is.
         if RE_NAMED_PART.search(norm) is None:
             # Otherwise add optional spaces between digits and letters.
@@ -60,7 +61,7 @@ class HousenumberTokenAnalysis:
         return norm
 
-    def get_variants_ascii(self, norm_name):
+    def get_variants_ascii(self, norm_name: str) -> List[str]:
         """ Compute the spelling variants for the given normalized housenumber.
 
             Generates variants for optional spaces (marked with a '␣').
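The '␣' marker that normalize() places between digit and letter groups is later expanded by the mutator into both a spaced and an unspaced spelling. Roughly, assuming normalize() turns an input like '34b' into '34␣b' (helper name invented for this sketch):

    import itertools
    from typing import List

    def housenumber_variants(norm: str) -> List[str]:
        # Expand every '␣' marker into both ' ' and '', the same effect
        # MutationVariantGenerator('␣', (' ', '')) has in this module.
        parts = norm.split('␣')
        variants = []
        for seps in itertools.product((' ', ''), repeat=len(parts) - 1):
            variants.append(''.join(itertools.chain.from_iterable(
                itertools.zip_longest(parts, seps, fillvalue=''))))
        return variants

    print(housenumber_variants('34␣b'))   # -> ['34 b', '34b']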

nominatim/tokenizer/token_analysis/postcodes.py

@@ -8,19 +8,20 @@
 Specialized processor for postcodes. Supports a 'lookup' variant of the
 token, which produces variants with optional spaces.
 """
+from typing import Mapping, Any, List
 
 from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantGenerator
 
 ### Configuration section
 
-def configure(rules, normalization_rules): # pylint: disable=W0613
+def configure(rules: Mapping[str, Any], normalization_rules: str) -> None: # pylint: disable=W0613
     """ All behaviour is currently hard-coded.
     """
     return None
 
 ### Analysis section
 
-def create(normalizer, transliterator, config): # pylint: disable=W0613
+def create(normalizer: Any, transliterator: Any, config: None) -> 'PostcodeTokenAnalysis': # pylint: disable=W0613
     """ Create a new token analysis instance for this module.
     """
     return PostcodeTokenAnalysis(normalizer, transliterator)
@@ -38,20 +39,20 @@ class PostcodeTokenAnalysis:
         and transliteration, so that postcodes are correctly recognised by
         the search algorithm.
     """
-    def __init__(self, norm, trans):
+    def __init__(self, norm: Any, trans: Any) -> None:
         self.norm = norm
         self.trans = trans
 
         self.mutator = MutationVariantGenerator(' ', (' ', ''))
 
-    def normalize(self, name):
+    def normalize(self, name: str) -> str:
         """ Return the standard form of the postcode.
         """
         return name.strip().upper()
 
-    def get_variants_ascii(self, norm_name):
+    def get_variants_ascii(self, norm_name: str) -> List[str]:
         """ Compute the spelling variants for the given normalized postcode.
 
             Takes the canonical form of the postcode, normalizes it using the
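PostcodeTokenAnalysis pairs the upper-cased canonical form with lookup variants in which each space is optional. A simplified, hypothetical sketch of the single-space case (the real mutator produces the full cross-product over all spaces):

    from typing import List

    def postcode_lookup_variants(norm_name: str) -> List[str]:
        # Mirrors MutationVariantGenerator(' ', (' ', '')) for the common
        # single-space case: each space may stay or disappear.
        if ' ' not in norm_name:
            return [norm_name]
        return [norm_name, norm_name.replace(' ', '')]

    print(postcode_lookup_variants('SW1A 1AA'))  # -> ['SW1A 1AA', 'SW1A1AA']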