forked from hans/Nominatim
switch to a more flexible variant description format
The new format combines compound splitting and abbreviation. It also allows to restrict rules to additional conditions (like language or region). This latter ability is not used yet.
This commit is contained in:
80
nominatim/tokenizer/icu_variants.py
Normal file
80
nominatim/tokenizer/icu_variants.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""
|
||||
Data structures for saving variant expansions for ICU tokenizer.
|
||||
"""
|
||||
from collections import namedtuple
|
||||
import json
|
||||
|
||||
from nominatim.errors import UsageError
|
||||
|
||||
_ICU_VARIANT_PORPERTY_FIELDS = ['lang']
|
||||
|
||||
def _get_strtuple_prop(rules, field):
|
||||
""" Return the given field of the rules dictionary as a list.
|
||||
|
||||
If the field is not defined or empty, returns None. If the field is
|
||||
a singe string, it is converted into a tuple with a single element.
|
||||
If the field is a list of strings, return as a string tuple.
|
||||
Raise a usage error in all other cases.
|
||||
"""
|
||||
value = rules.get(field)
|
||||
|
||||
if not value:
|
||||
return None
|
||||
|
||||
if isinstance(value, str):
|
||||
return (value,)
|
||||
|
||||
if not isinstance(value, list) or any(not isinstance(x, str) for x in value):
|
||||
raise UsageError("YAML variant property '{}' should be a list.".format(field))
|
||||
|
||||
return tuple(value)
|
||||
|
||||
|
||||
class ICUVariantProperties(namedtuple('_ICUVariantProperties', _ICU_VARIANT_PORPERTY_FIELDS,
|
||||
defaults=(None, )*len(_ICU_VARIANT_PORPERTY_FIELDS))):
|
||||
""" Data container for saving properties that describe when a variant
|
||||
should be applied.
|
||||
|
||||
Porperty instances are hashable.
|
||||
"""
|
||||
@classmethod
|
||||
def from_rules(cls, rules):
|
||||
""" Create a new property type from a generic dictionary.
|
||||
|
||||
The function only takes into account the properties that are
|
||||
understood presently and ignores all others.
|
||||
"""
|
||||
return cls(lang=_get_strtuple_prop(rules, 'lang'))
|
||||
|
||||
|
||||
ICUVariant = namedtuple('ICUVariant', ['source', 'replacement', 'properties'])
|
||||
|
||||
def pickle_variant_set(variants):
|
||||
""" Serializes an iterable of variant rules to a string.
|
||||
"""
|
||||
# Create a list of property sets. So they don't need to be duplicated
|
||||
properties = {}
|
||||
pid = 1
|
||||
for variant in variants:
|
||||
if variant.properties not in properties:
|
||||
properties[variant.properties] = pid
|
||||
pid += 1
|
||||
|
||||
# Convert the variants into a simple list.
|
||||
variants = [(v.source, v.replacement, properties[v.properties]) for v in variants]
|
||||
|
||||
# Convert everythin to json.
|
||||
return json.dumps({'properties': {v: k._asdict() for k, v in properties.items()},
|
||||
'variants': variants})
|
||||
|
||||
|
||||
def unpickle_variant_set(variant_string):
|
||||
""" Deserializes a variant string that was previously created with
|
||||
pickle_variant_set() into a set of ICUVariants.
|
||||
"""
|
||||
data = json.loads(variant_string)
|
||||
|
||||
properties = {int(k): ICUVariantProperties(**v) for k, v in data['properties'].items()}
|
||||
print(properties)
|
||||
|
||||
return set((ICUVariant(src, repl, properties[pid]) for src, repl, pid in data['variants']))
|
||||
Reference in New Issue
Block a user