split code into submodules

This commit is contained in:
Sarah Hoffmann
2024-05-16 11:55:17 +02:00
parent 0fb4fe8e4d
commit 6e89310a92
137 changed files with 757 additions and 716 deletions

View File

View File

@@ -0,0 +1,175 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for importing and managing static country information.
"""
from typing import Dict, Any, Iterable, Tuple, Optional, Container, overload
from pathlib import Path
import psycopg2.extras
from nominatim_core.db import utils as db_utils
from nominatim_core.db.connection import connect, Connection
from nominatim_core.errors import UsageError
from nominatim_core.config import Configuration
from ..tokenizer.base import AbstractTokenizer
def _flatten_name_list(names: Any) -> Dict[str, str]:
if names is None:
return {}
if not isinstance(names, dict):
raise UsageError("Expected key-value list for names in country_settings.py")
flat = {}
for prefix, remain in names.items():
if isinstance(remain, str):
flat[prefix] = remain
elif not isinstance(remain, dict):
raise UsageError("Entries in names must be key-value lists.")
else:
for suffix, name in remain.items():
if suffix == 'default':
flat[prefix] = name
else:
flat[f'{prefix}:{suffix}'] = name
return flat
class _CountryInfo:
""" Caches country-specific properties from the configuration file.
"""
def __init__(self) -> None:
self._info: Dict[str, Dict[str, Any]] = {}
def load(self, config: Configuration) -> None:
""" Load the country properties from the configuration files,
if they are not loaded yet.
"""
if not self._info:
self._info = config.load_sub_configuration('country_settings.yaml')
for prop in self._info.values():
# Convert languages into a list for simpler handling.
if 'languages' not in prop:
prop['languages'] = []
elif not isinstance(prop['languages'], list):
prop['languages'] = [x.strip()
for x in prop['languages'].split(',')]
prop['names'] = _flatten_name_list(prop.get('names'))
def items(self) -> Iterable[Tuple[str, Dict[str, Any]]]:
""" Return tuples of (country_code, property dict) as iterable.
"""
return self._info.items()
def get(self, country_code: str) -> Dict[str, Any]:
""" Get country information for the country with the given country code.
"""
return self._info.get(country_code, {})
# Module-level cache of the country properties. All functions in this
# file read from this instance. It is empty until setup_country_config()
# has been called.
_COUNTRY_INFO = _CountryInfo()


def setup_country_config(config: Configuration) -> None:
    """ Load country properties from the configuration file.
        Needs to be called before using any other functions in this
        file.

        The properties are saved in the module-level `_COUNTRY_INFO`
        cache, which only reads the configuration while it is still
        empty.
    """
    _COUNTRY_INFO.load(config)
@overload
def iterate() -> Iterable[Tuple[str, Dict[str, Any]]]:
    ...


@overload
def iterate(prop: str) -> Iterable[Tuple[str, Any]]:
    ...


def iterate(prop: Optional[str] = None) -> Iterable[Tuple[str, Dict[str, Any]]]:
    """ Return an iterable over the known countries as tuples of
        country code and properties.

        Without `prop`, every country is returned together with its
        complete property dictionary.

        With `prop`, only countries that have the given property set
        are returned. The second item of the tuple then is the content
        of this property instead of the complete dictionary.
    """
    countries = _COUNTRY_INFO.items()

    if prop is not None:
        return ((ccode, info[prop]) for ccode, info in countries if prop in info)

    return countries
def setup_country_tables(dsn: str, sql_dir: Path, ignore_partitions: bool = False) -> None:
    """ Set up the tables with the static country data that forms the
        background for geocoding. The tables must not exist yet.

        The function first executes 'country_osm_grid.sql.gz' from
        `sql_dir`. Then it creates the table `country_name` and inserts
        one row for each country of the loaded country configuration.
        When `ignore_partitions` is set, all countries are put into
        partition 0 instead of their configured partition.
    """
    db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz')

    rows = []
    for ccode, props in _COUNTRY_INFO.items():
        if ccode is None or props is None:
            continue

        partition = 0 if ignore_partitions else props.get('partition', 0)

        # A default language is only set when the country has exactly one.
        languages = props['languages']
        default_lang = languages[0] if len(languages) == 1 else None

        rows.append((ccode, props['names'], default_lang, partition))

    with connect(dsn) as conn:
        with conn.cursor() as cur:
            # The names are handed in as dictionaries and saved as hstore.
            psycopg2.extras.register_hstore(cur)
            cur.execute(
                """ CREATE TABLE public.country_name (
                        country_code character varying(2),
                        name public.hstore,
                        derived_name public.hstore,
                        country_default_language_code text,
                        partition integer
                    ); """)
            cur.execute_values(
                """ INSERT INTO public.country_name
                    (country_code, name, country_default_language_code, partition) VALUES %s
                """, rows)
        conn.commit()
def create_country_names(conn: Connection, tokenizer: AbstractTokenizer,
                         languages: Optional[Container[str]] = None) -> None:
    """ Add the country names from the table `country_name` to the
        search index of the tokenizer.

        `languages` is a container of language codes as used in OSM.
        If it is not empty, then a name with a language suffix is only
        added when the suffix is one of the given languages. Names
        without a suffix are always added.
    """
    def _include_key(key: str) -> bool:
        # The language is everything after the first colon of the key.
        _, colon, lang = key.partition(':')
        return not colon or not languages or lang in languages

    with conn.cursor() as cur:
        psycopg2.extras.register_hstore(cur)
        cur.execute("""SELECT country_code, name FROM country_name
                       WHERE country_code is not null""")

        with tokenizer.name_analyzer() as analyzer:
            for code, name in cur:
                names = {'countrycode': code}

                # country names (only in languages as provided)
                if name:
                    for key, value in name.items():
                        if _include_key(key):
                            names[key] = value

                analyzer.add_country_names(code, names)

    conn.commit()

View File

@@ -0,0 +1,86 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Wrapper around place information the indexer gets from the database and hands to
the tokenizer.
"""
from typing import Optional, Mapping, Any, Tuple
class PlaceInfo:
    """ Wrapper around the information about a place that is made
        available to the tokenizer.
    """

    def __init__(self, info: Mapping[str, Any]) -> None:
        self._info = info

    @property
    def name(self) -> Optional[Mapping[str, str]]:
        """ The names of the place as a dictionary. Keys and values are
            the full key and value of the corresponding OSM tag. The
            import style determines which tags are saved as names.

            None, if the place has no names.
        """
        return self._info.get('name')

    @property
    def address(self) -> Optional[Mapping[str, str]]:
        """ The address elements of the place as a dictionary. The key
            usually is the suffix part of the key of an OSM 'addr:*' or
            'isin:*' tag. Some special keys like `country` or
            `country_code` merge OSM keys that contain the same
            information. See [Import Styles][1] for details.

            None, if the place has no address information.

            [1]: ../customize/Import-Styles.md
        """
        return self._info.get('address')

    @property
    def country_code(self) -> Optional[str]:
        """ The code of the country the place is in. This is guaranteed
            to be a two-letter lower-case string. None, if the place
            is not inside any country.
        """
        return self._info.get('country_code')

    @property
    def rank_address(self) -> int:
        """ The [rank address][1] of the place before any rank
            correction is applied. 0, if no rank is available.

            [1]: ../customize/Ranking.md#address-rank
        """
        return self._info.get('rank_address', 0)

    @property
    def centroid(self) -> Optional[Tuple[float, float]]:
        """ The center point of the place in WGS84 as a tuple (x, y).
            None, if the geometry of the place is unknown.
        """
        x = self._info.get('centroid_x')
        y = self._info.get('centroid_y')

        # A point is only usable when both coordinates are known.
        if x is None or y is None:
            return None

        return (x, y)

    def is_a(self, key: str, value: str) -> bool:
        """ Return True, if the primary tag of the place has the given
            key and value.
        """
        primary_key = self._info.get('class')
        primary_value = self._info.get('type')

        return primary_key == key and primary_value == value

    def is_country(self) -> bool:
        """ Return True, if the place is a valid country boundary, i.e.
            an administrative boundary with address rank 4 and a
            country code.
        """
        if self.rank_address != 4:
            return False

        if not self.is_a('boundary', 'administrative'):
            return False

        return self.country_code is not None

View File

@@ -0,0 +1,78 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Data class for a single name of a place.
"""
from typing import Optional, Dict, Mapping
class PlaceName:
    """ A single name or address part of a place. Next to the name
        proper, the object describes the kind of name with two
        properties:

        * `kind` is the name of the OSM key without any suffix
          (i.e. with the part after the colon removed)
        * `suffix` is the suffix of the OSM tag, if there is one. The
          suffix is the part of the key after the first colon.

        A name may further carry arbitrary additional attributes. Their
        meaning depends on the sanitizers and token analysers in use.
        Only the attribute 'analyzer' is special: it determines which
        token analysis module finalizes the treatment of the name.
    """

    def __init__(self, name: str, kind: str, suffix: Optional[str]):
        self.name = name
        self.kind = kind
        self.suffix = suffix
        self.attr: Dict[str, str] = {}

    def __repr__(self) -> str:
        return f"PlaceName(name={self.name!r},kind={self.kind!r},suffix={self.suffix!r})"

    def clone(self, name: Optional[str] = None,
              kind: Optional[str] = None,
              suffix: Optional[str] = None,
              attr: Optional[Mapping[str, str]] = None) -> 'PlaceName':
        """ Return a copy of the place name where the given parameters
            replace the values of this object. Parameters that are None
            or empty leave the current value in place.

            The attributes in `attr` are added to the copied attributes
            and overwrite attributes with the same key. The attribute
            list is never replaced as a whole, so the function cannot
            be used to remove an attribute from a place name.
        """
        duplicate = PlaceName(name if name else self.name,
                              kind if kind else self.kind,
                              suffix if suffix else self.suffix)

        # The copy gets its own dictionary, so that later changes do
        # not affect this object.
        duplicate.attr = dict(self.attr)
        if attr:
            duplicate.attr.update(attr)

        return duplicate

    def set_attr(self, key: str, value: str) -> None:
        """ Set the attribute `key` of the name to the given value.
            An existing value is overwritten.
        """
        self.attr[key] = value

    def get_attr(self, key: str, default: Optional[str] = None) -> Optional[str]:
        """ Return the value of the attribute `key` or the value of
            `default`, if the attribute is not set.
        """
        return self.attr.get(key, default)

    def has_attr(self, key: str) -> bool:
        """ Return True, if the attribute `key` is set.
        """
        return key in self.attr

View File

@@ -0,0 +1,114 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2024 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Functions for formatting postcodes according to their country-specific
format.
"""
from typing import Any, Mapping, Optional, Set, Match
import re
from nominatim_core.errors import UsageError
from . import country_info
class CountryPostcodeMatcher:
    """ Checks postcodes against the postcode format of a single country
        and brings them into the output format of that country.
    """

    def __init__(self, country_code: str, config: Mapping[str, Any]) -> None:
        if 'pattern' not in config:
            raise UsageError(
                f"Field 'pattern' required for 'postcode' for country '{country_code}'")

        # In the configured pattern 'd' is a placeholder for a digit and
        # 'l' a placeholder for an upper-case letter.
        placeholders = str.maketrans({'d': '[0-9]', 'l': '[A-Z]'})
        pc_regex = config['pattern'].translate(placeholders)

        # The normalization pattern accepts surrounding spaces and
        # an optional leading country code.
        prefix = country_code.upper()
        self.norm_pattern = re.compile(rf'\s*(?:{prefix}[ -]?)?({pc_regex})\s*')
        self.pattern = re.compile(pc_regex)

        self.output = config.get('output', r'\g<0>')

    def match(self, postcode: str) -> Optional[Match[str]]:
        """ Check the given postcode against the postcode pattern of
            this matcher. On success the `re.Match` object for the
            postcode is returned, otherwise None.
        """
        # Upper-case, strip spaces and leading country code.
        stripped = self.norm_pattern.fullmatch(postcode.upper())
        if stripped is None:
            return None

        return self.pattern.fullmatch(stripped.group(1))

    def normalize(self, match: Match[str]) -> str:
        """ Return the postcode of the given match in its default
            format. `match` must be a `re.Match` object that was
            returned by `match()` before.
        """
        return match.expand(self.output)
class PostcodeFormatter:
    """ Collection of the postcode formats of all countries together
        with functions to access them.
    """

    def __init__(self) -> None:
        # Objects without a country code can't have a postcode per definition.
        self.country_without_postcode: Set[Optional[str]] = {None}
        self.country_matcher = {}
        self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'})

        for ccode, prop in country_info.iterate('postcode'):
            # A value of False marks a country that has no postcodes.
            if prop is False:
                self.country_without_postcode.add(ccode)
                continue

            if not isinstance(prop, dict):
                raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")

            self.country_matcher[ccode] = CountryPostcodeMatcher(ccode, prop)

    def set_default_pattern(self, pattern: str) -> None:
        """ Set the postcode pattern that is used for countries that
            do not have a pattern of their own.
        """
        self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})

    def get_matcher(self, country_code: Optional[str]) -> Optional[CountryPostcodeMatcher]:
        """ Return the CountryPostcodeMatcher for the given country.

            The result is None, if the country doesn't have postcodes.
            If no specific matcher is configured for the country, then
            the default matcher is returned.
        """
        if country_code in self.country_without_postcode:
            return None

        # None is always part of the countries without postcode.
        assert country_code is not None

        return self.country_matcher.get(country_code, self.default_matcher)

    def match(self, country_code: Optional[str], postcode: str) -> Optional[Match[str]]:
        """ Check the given postcode against the postcode pattern of
            the given country. Returns a `re.Match` object, if the
            country has postcodes and the postcode matches its pattern.
            Returns None otherwise.
        """
        matcher = self.get_matcher(country_code)

        return None if matcher is None else matcher.match(postcode)

    def normalize(self, country_code: str, match: Match[str]) -> str:
        """ Return the postcode of the given match in the default
            format of the given country. `match` must be a `re.Match`
            object that was returned by `match()` before.
        """
        matcher = self.country_matcher.get(country_code, self.default_matcher)

        return matcher.normalize(match)