mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
split code into submodules
This commit is contained in:
0
src/nominatim_db/data/__init__.py
Normal file
0
src/nominatim_db/data/__init__.py
Normal file
175
src/nominatim_db/data/country_info.py
Normal file
175
src/nominatim_db/data/country_info.py
Normal file
@@ -0,0 +1,175 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Functions for importing and managing static country information.
|
||||
"""
|
||||
from typing import Dict, Any, Iterable, Tuple, Optional, Container, overload
|
||||
from pathlib import Path
|
||||
import psycopg2.extras
|
||||
|
||||
from nominatim_core.db import utils as db_utils
|
||||
from nominatim_core.db.connection import connect, Connection
|
||||
from nominatim_core.errors import UsageError
|
||||
from nominatim_core.config import Configuration
|
||||
from ..tokenizer.base import AbstractTokenizer
|
||||
|
||||
def _flatten_name_list(names: Any) -> Dict[str, str]:
|
||||
if names is None:
|
||||
return {}
|
||||
|
||||
if not isinstance(names, dict):
|
||||
raise UsageError("Expected key-value list for names in country_settings.py")
|
||||
|
||||
flat = {}
|
||||
for prefix, remain in names.items():
|
||||
if isinstance(remain, str):
|
||||
flat[prefix] = remain
|
||||
elif not isinstance(remain, dict):
|
||||
raise UsageError("Entries in names must be key-value lists.")
|
||||
else:
|
||||
for suffix, name in remain.items():
|
||||
if suffix == 'default':
|
||||
flat[prefix] = name
|
||||
else:
|
||||
flat[f'{prefix}:{suffix}'] = name
|
||||
|
||||
return flat
|
||||
|
||||
|
||||
|
||||
class _CountryInfo:
|
||||
""" Caches country-specific properties from the configuration file.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._info: Dict[str, Dict[str, Any]] = {}
|
||||
|
||||
|
||||
def load(self, config: Configuration) -> None:
|
||||
""" Load the country properties from the configuration files,
|
||||
if they are not loaded yet.
|
||||
"""
|
||||
if not self._info:
|
||||
self._info = config.load_sub_configuration('country_settings.yaml')
|
||||
for prop in self._info.values():
|
||||
# Convert languages into a list for simpler handling.
|
||||
if 'languages' not in prop:
|
||||
prop['languages'] = []
|
||||
elif not isinstance(prop['languages'], list):
|
||||
prop['languages'] = [x.strip()
|
||||
for x in prop['languages'].split(',')]
|
||||
prop['names'] = _flatten_name_list(prop.get('names'))
|
||||
|
||||
|
||||
def items(self) -> Iterable[Tuple[str, Dict[str, Any]]]:
|
||||
""" Return tuples of (country_code, property dict) as iterable.
|
||||
"""
|
||||
return self._info.items()
|
||||
|
||||
def get(self, country_code: str) -> Dict[str, Any]:
|
||||
""" Get country information for the country with the given country code.
|
||||
"""
|
||||
return self._info.get(country_code, {})
|
||||
|
||||
|
||||
|
||||
_COUNTRY_INFO = _CountryInfo()
|
||||
|
||||
|
||||
def setup_country_config(config: Configuration) -> None:
|
||||
""" Load country properties from the configuration file.
|
||||
Needs to be called before using any other functions in this
|
||||
file.
|
||||
"""
|
||||
_COUNTRY_INFO.load(config)
|
||||
|
||||
@overload
|
||||
def iterate() -> Iterable[Tuple[str, Dict[str, Any]]]:
|
||||
...
|
||||
|
||||
@overload
|
||||
def iterate(prop: str) -> Iterable[Tuple[str, Any]]:
|
||||
...
|
||||
|
||||
def iterate(prop: Optional[str] = None) -> Iterable[Tuple[str, Dict[str, Any]]]:
|
||||
""" Iterate over country code and properties.
|
||||
|
||||
When `prop` is None, all countries are returned with their complete
|
||||
set of properties.
|
||||
|
||||
If `prop` is given, then only countries are returned where the
|
||||
given property is set. The second item of the tuple contains only
|
||||
the content of the given property.
|
||||
"""
|
||||
if prop is None:
|
||||
return _COUNTRY_INFO.items()
|
||||
|
||||
return ((c, p[prop]) for c, p in _COUNTRY_INFO.items() if prop in p)
|
||||
|
||||
|
||||
def setup_country_tables(dsn: str, sql_dir: Path, ignore_partitions: bool = False) -> None:
|
||||
""" Create and populate the tables with basic static data that provides
|
||||
the background for geocoding. Data is assumed to not yet exist.
|
||||
"""
|
||||
db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz')
|
||||
|
||||
params = []
|
||||
for ccode, props in _COUNTRY_INFO.items():
|
||||
if ccode is not None and props is not None:
|
||||
if ignore_partitions:
|
||||
partition = 0
|
||||
else:
|
||||
partition = props.get('partition', 0)
|
||||
lang = props['languages'][0] if len(
|
||||
props['languages']) == 1 else None
|
||||
|
||||
params.append((ccode, props['names'], lang, partition))
|
||||
with connect(dsn) as conn:
|
||||
with conn.cursor() as cur:
|
||||
psycopg2.extras.register_hstore(cur)
|
||||
cur.execute(
|
||||
""" CREATE TABLE public.country_name (
|
||||
country_code character varying(2),
|
||||
name public.hstore,
|
||||
derived_name public.hstore,
|
||||
country_default_language_code text,
|
||||
partition integer
|
||||
); """)
|
||||
cur.execute_values(
|
||||
""" INSERT INTO public.country_name
|
||||
(country_code, name, country_default_language_code, partition) VALUES %s
|
||||
""", params)
|
||||
conn.commit()
|
||||
|
||||
|
||||
def create_country_names(conn: Connection, tokenizer: AbstractTokenizer,
|
||||
languages: Optional[Container[str]] = None) -> None:
|
||||
""" Add default country names to search index. `languages` is a comma-
|
||||
separated list of language codes as used in OSM. If `languages` is not
|
||||
empty then only name translations for the given languages are added
|
||||
to the index.
|
||||
"""
|
||||
def _include_key(key: str) -> bool:
|
||||
return ':' not in key or not languages or \
|
||||
key[key.index(':') + 1:] in languages
|
||||
|
||||
with conn.cursor() as cur:
|
||||
psycopg2.extras.register_hstore(cur)
|
||||
cur.execute("""SELECT country_code, name FROM country_name
|
||||
WHERE country_code is not null""")
|
||||
|
||||
with tokenizer.name_analyzer() as analyzer:
|
||||
for code, name in cur:
|
||||
names = {'countrycode': code}
|
||||
|
||||
# country names (only in languages as provided)
|
||||
if name:
|
||||
names.update({k : v for k, v in name.items() if _include_key(k)})
|
||||
|
||||
analyzer.add_country_names(code, names)
|
||||
|
||||
conn.commit()
|
||||
86
src/nominatim_db/data/place_info.py
Normal file
86
src/nominatim_db/data/place_info.py
Normal file
@@ -0,0 +1,86 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Wrapper around place information the indexer gets from the database and hands to
|
||||
the tokenizer.
|
||||
"""
|
||||
from typing import Optional, Mapping, Any, Tuple
|
||||
|
||||
class PlaceInfo:
|
||||
""" This data class contains all information the tokenizer can access
|
||||
about a place.
|
||||
"""
|
||||
|
||||
def __init__(self, info: Mapping[str, Any]) -> None:
|
||||
self._info = info
|
||||
|
||||
|
||||
@property
|
||||
def name(self) -> Optional[Mapping[str, str]]:
|
||||
""" A dictionary with the names of the place. Keys and values represent
|
||||
the full key and value of the corresponding OSM tag. Which tags
|
||||
are saved as names is determined by the import style.
|
||||
The property may be None if the place has no names.
|
||||
"""
|
||||
return self._info.get('name')
|
||||
|
||||
|
||||
@property
|
||||
def address(self) -> Optional[Mapping[str, str]]:
|
||||
""" A dictionary with the address elements of the place. They key
|
||||
usually corresponds to the suffix part of the key of an OSM
|
||||
'addr:*' or 'isin:*' tag. There are also some special keys like
|
||||
`country` or `country_code` which merge OSM keys that contain
|
||||
the same information. See [Import Styles][1] for details.
|
||||
|
||||
The property may be None if the place has no address information.
|
||||
|
||||
[1]: ../customize/Import-Styles.md
|
||||
"""
|
||||
return self._info.get('address')
|
||||
|
||||
|
||||
@property
|
||||
def country_code(self) -> Optional[str]:
|
||||
""" The country code of the country the place is in. Guaranteed
|
||||
to be a two-letter lower-case string. If the place is not inside
|
||||
any country, the property is set to None.
|
||||
"""
|
||||
return self._info.get('country_code')
|
||||
|
||||
|
||||
@property
|
||||
def rank_address(self) -> int:
|
||||
""" The [rank address][1] before any rank correction is applied.
|
||||
|
||||
[1]: ../customize/Ranking.md#address-rank
|
||||
"""
|
||||
return self._info.get('rank_address', 0)
|
||||
|
||||
|
||||
@property
|
||||
def centroid(self) -> Optional[Tuple[float, float]]:
|
||||
""" A center point of the place in WGS84. May be None when the
|
||||
geometry of the place is unknown.
|
||||
"""
|
||||
x, y = self._info.get('centroid_x'), self._info.get('centroid_y')
|
||||
return None if x is None or y is None else (x, y)
|
||||
|
||||
|
||||
def is_a(self, key: str, value: str) -> bool:
|
||||
""" Set to True when the place's primary tag corresponds to the given
|
||||
key and value.
|
||||
"""
|
||||
return self._info.get('class') == key and self._info.get('type') == value
|
||||
|
||||
|
||||
def is_country(self) -> bool:
|
||||
""" Set to True when the place is a valid country boundary.
|
||||
"""
|
||||
return self.rank_address == 4 \
|
||||
and self.is_a('boundary', 'administrative') \
|
||||
and self.country_code is not None
|
||||
78
src/nominatim_db/data/place_name.py
Normal file
78
src/nominatim_db/data/place_name.py
Normal file
@@ -0,0 +1,78 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Data class for a single name of a place.
|
||||
"""
|
||||
from typing import Optional, Dict, Mapping
|
||||
|
||||
class PlaceName:
|
||||
""" Each name and address part of a place is encapsulated in an object of
|
||||
this class. It saves not only the name proper but also describes the
|
||||
kind of name with two properties:
|
||||
|
||||
* `kind` describes the name of the OSM key used without any suffixes
|
||||
(i.e. the part after the colon removed)
|
||||
* `suffix` contains the suffix of the OSM tag, if any. The suffix
|
||||
is the part of the key after the first colon.
|
||||
|
||||
In addition to that, a name may have arbitrary additional attributes.
|
||||
How attributes are used, depends on the sanitizers and token analysers.
|
||||
The exception is the 'analyzer' attribute. This attribute determines
|
||||
which token analysis module will be used to finalize the treatment of
|
||||
names.
|
||||
"""
|
||||
|
||||
def __init__(self, name: str, kind: str, suffix: Optional[str]):
|
||||
self.name = name
|
||||
self.kind = kind
|
||||
self.suffix = suffix
|
||||
self.attr: Dict[str, str] = {}
|
||||
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"PlaceName(name={self.name!r},kind={self.kind!r},suffix={self.suffix!r})"
|
||||
|
||||
|
||||
def clone(self, name: Optional[str] = None,
|
||||
kind: Optional[str] = None,
|
||||
suffix: Optional[str] = None,
|
||||
attr: Optional[Mapping[str, str]] = None) -> 'PlaceName':
|
||||
""" Create a deep copy of the place name, optionally with the
|
||||
given parameters replaced. In the attribute list only the given
|
||||
keys are updated. The list is not replaced completely.
|
||||
In particular, the function cannot to be used to remove an
|
||||
attribute from a place name.
|
||||
"""
|
||||
newobj = PlaceName(name or self.name,
|
||||
kind or self.kind,
|
||||
suffix or self.suffix)
|
||||
|
||||
newobj.attr.update(self.attr)
|
||||
if attr:
|
||||
newobj.attr.update(attr)
|
||||
|
||||
return newobj
|
||||
|
||||
|
||||
def set_attr(self, key: str, value: str) -> None:
|
||||
""" Add the given property to the name. If the property was already
|
||||
set, then the value is overwritten.
|
||||
"""
|
||||
self.attr[key] = value
|
||||
|
||||
|
||||
def get_attr(self, key: str, default: Optional[str] = None) -> Optional[str]:
|
||||
""" Return the given property or the value of 'default' if it
|
||||
is not set.
|
||||
"""
|
||||
return self.attr.get(key, default)
|
||||
|
||||
|
||||
def has_attr(self, key: str) -> bool:
|
||||
""" Check if the given attribute is set.
|
||||
"""
|
||||
return key in self.attr
|
||||
114
src/nominatim_db/data/postcode_format.py
Normal file
114
src/nominatim_db/data/postcode_format.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
#
|
||||
# This file is part of Nominatim. (https://nominatim.org)
|
||||
#
|
||||
# Copyright (C) 2024 by the Nominatim developer community.
|
||||
# For a full list of authors see the git log.
|
||||
"""
|
||||
Functions for formatting postcodes according to their country-specific
|
||||
format.
|
||||
"""
|
||||
from typing import Any, Mapping, Optional, Set, Match
|
||||
import re
|
||||
|
||||
from nominatim_core.errors import UsageError
|
||||
from . import country_info
|
||||
|
||||
class CountryPostcodeMatcher:
|
||||
""" Matches and formats a postcode according to a format definition
|
||||
of the given country.
|
||||
"""
|
||||
def __init__(self, country_code: str, config: Mapping[str, Any]) -> None:
|
||||
if 'pattern' not in config:
|
||||
raise UsageError("Field 'pattern' required for 'postcode' "
|
||||
f"for country '{country_code}'")
|
||||
|
||||
pc_pattern = config['pattern'].replace('d', '[0-9]').replace('l', '[A-Z]')
|
||||
|
||||
self.norm_pattern = re.compile(f'\\s*(?:{country_code.upper()}[ -]?)?({pc_pattern})\\s*')
|
||||
self.pattern = re.compile(pc_pattern)
|
||||
|
||||
self.output = config.get('output', r'\g<0>')
|
||||
|
||||
|
||||
def match(self, postcode: str) -> Optional[Match[str]]:
|
||||
""" Match the given postcode against the postcode pattern for this
|
||||
matcher. Returns a `re.Match` object if the match was successful
|
||||
and None otherwise.
|
||||
"""
|
||||
# Upper-case, strip spaces and leading country code.
|
||||
normalized = self.norm_pattern.fullmatch(postcode.upper())
|
||||
|
||||
if normalized:
|
||||
return self.pattern.fullmatch(normalized.group(1))
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def normalize(self, match: Match[str]) -> str:
|
||||
""" Return the default format of the postcode for the given match.
|
||||
`match` must be a `re.Match` object previously returned by
|
||||
`match()`
|
||||
"""
|
||||
return match.expand(self.output)
|
||||
|
||||
|
||||
class PostcodeFormatter:
|
||||
""" Container for different postcode formats of the world and
|
||||
access functions.
|
||||
"""
|
||||
def __init__(self) -> None:
|
||||
# Objects without a country code can't have a postcode per definition.
|
||||
self.country_without_postcode: Set[Optional[str]] = {None}
|
||||
self.country_matcher = {}
|
||||
self.default_matcher = CountryPostcodeMatcher('', {'pattern': '.*'})
|
||||
|
||||
for ccode, prop in country_info.iterate('postcode'):
|
||||
if prop is False:
|
||||
self.country_without_postcode.add(ccode)
|
||||
elif isinstance(prop, dict):
|
||||
self.country_matcher[ccode] = CountryPostcodeMatcher(ccode, prop)
|
||||
else:
|
||||
raise UsageError(f"Invalid entry 'postcode' for country '{ccode}'")
|
||||
|
||||
|
||||
def set_default_pattern(self, pattern: str) -> None:
|
||||
""" Set the postcode match pattern to use, when a country does not
|
||||
have a specific pattern.
|
||||
"""
|
||||
self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
|
||||
|
||||
|
||||
def get_matcher(self, country_code: Optional[str]) -> Optional[CountryPostcodeMatcher]:
|
||||
""" Return the CountryPostcodeMatcher for the given country.
|
||||
Returns None if the country doesn't have a postcode and the
|
||||
default matcher if there is no specific matcher configured for
|
||||
the country.
|
||||
"""
|
||||
if country_code in self.country_without_postcode:
|
||||
return None
|
||||
|
||||
assert country_code is not None
|
||||
|
||||
return self.country_matcher.get(country_code, self.default_matcher)
|
||||
|
||||
|
||||
def match(self, country_code: Optional[str], postcode: str) -> Optional[Match[str]]:
|
||||
""" Match the given postcode against the postcode pattern for this
|
||||
matcher. Returns a `re.Match` object if the country has a pattern
|
||||
and the match was successful or None if the match failed.
|
||||
"""
|
||||
if country_code in self.country_without_postcode:
|
||||
return None
|
||||
|
||||
assert country_code is not None
|
||||
|
||||
return self.country_matcher.get(country_code, self.default_matcher).match(postcode)
|
||||
|
||||
|
||||
def normalize(self, country_code: str, match: Match[str]) -> str:
|
||||
""" Return the default format of the postcode for the given match.
|
||||
`match` must be a `re.Match` object previously returned by
|
||||
`match()`
|
||||
"""
|
||||
return self.country_matcher.get(country_code, self.default_matcher).normalize(match)
|
||||
Reference in New Issue
Block a user