mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-16 15:47:58 +00:00
The new format combines compound splitting and abbreviation. It also allows to restrict rules to additional conditions (like language or region). This latter ability is not used yet.
81 lines
2.7 KiB
Python
81 lines
2.7 KiB
Python
"""
|
|
Data structures for saving variant expansions for ICU tokenizer.
|
|
"""
|
|
from collections import namedtuple
|
|
import json
|
|
|
|
from nominatim.errors import UsageError
|
|
|
|
_ICU_VARIANT_PORPERTY_FIELDS = ['lang']
|
|
|
|
def _get_strtuple_prop(rules, field):
|
|
""" Return the given field of the rules dictionary as a list.
|
|
|
|
If the field is not defined or empty, returns None. If the field is
|
|
a singe string, it is converted into a tuple with a single element.
|
|
If the field is a list of strings, return as a string tuple.
|
|
Raise a usage error in all other cases.
|
|
"""
|
|
value = rules.get(field)
|
|
|
|
if not value:
|
|
return None
|
|
|
|
if isinstance(value, str):
|
|
return (value,)
|
|
|
|
if not isinstance(value, list) or any(not isinstance(x, str) for x in value):
|
|
raise UsageError("YAML variant property '{}' should be a list.".format(field))
|
|
|
|
return tuple(value)
|
|
|
|
|
|
class ICUVariantProperties(namedtuple('_ICUVariantProperties', _ICU_VARIANT_PORPERTY_FIELDS,
|
|
defaults=(None, )*len(_ICU_VARIANT_PORPERTY_FIELDS))):
|
|
""" Data container for saving properties that describe when a variant
|
|
should be applied.
|
|
|
|
Porperty instances are hashable.
|
|
"""
|
|
@classmethod
|
|
def from_rules(cls, rules):
|
|
""" Create a new property type from a generic dictionary.
|
|
|
|
The function only takes into account the properties that are
|
|
understood presently and ignores all others.
|
|
"""
|
|
return cls(lang=_get_strtuple_prop(rules, 'lang'))
|
|
|
|
|
|
ICUVariant = namedtuple('ICUVariant', ['source', 'replacement', 'properties'])
|
|
|
|
def pickle_variant_set(variants):
|
|
""" Serializes an iterable of variant rules to a string.
|
|
"""
|
|
# Create a list of property sets. So they don't need to be duplicated
|
|
properties = {}
|
|
pid = 1
|
|
for variant in variants:
|
|
if variant.properties not in properties:
|
|
properties[variant.properties] = pid
|
|
pid += 1
|
|
|
|
# Convert the variants into a simple list.
|
|
variants = [(v.source, v.replacement, properties[v.properties]) for v in variants]
|
|
|
|
# Convert everythin to json.
|
|
return json.dumps({'properties': {v: k._asdict() for k, v in properties.items()},
|
|
'variants': variants})
|
|
|
|
|
|
def unpickle_variant_set(variant_string):
|
|
""" Deserializes a variant string that was previously created with
|
|
pickle_variant_set() into a set of ICUVariants.
|
|
"""
|
|
data = json.loads(variant_string)
|
|
|
|
properties = {int(k): ICUVariantProperties(**v) for k, v in data['properties'].items()}
|
|
print(properties)
|
|
|
|
return set((ICUVariant(src, repl, properties[pid]) for src, repl, pid in data['variants']))
|